# Data pre-processing

After loading the original data, we need to pre-process it: convert the review time into datetime format, rename columns, keep only the time range we want, and transform the categorical and boolean columns.

In [25]:
import json
import os
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

def getFileName(filePath):
    """Return the names of the entries found in the directory ``filePath``.

    The names are returned exactly as ``os.listdir`` reports them (bare
    names, not joined with ``filePath``).
    """
    entries = os.listdir(filePath)
    return entries

def readJson(filePath):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    filePath : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record, indexed 0..n-1 in file order. Records that lack
        a key present in other records get a missing value in that column.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    print("Start Reading file ...", filePath)
    reviews = {}
    # `with` closes the handle even when a line fails to parse; JSON text is
    # UTF-8, so do not rely on the platform's default encoding.
    with open(filePath, 'r', encoding='utf-8') as handle:
        for line in handle:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            reviews[len(reviews)] = json.loads(line)
    return pd.DataFrame.from_dict(reviews, orient='index')

In [22]:
def convertTime(string):
    """Parse a review time written as ``"MM D, YYYY"`` into a ``datetime``.

    Parameters
    ----------
    string : str
        Date text such as ``"01 5, 2016"`` (month, day followed by a comma,
        four-digit year). Repeated or surrounding whitespace is tolerated.

    Returns
    -------
    datetime.datetime
        The parsed date at midnight.

    Raises
    ------
    ValueError
        If the text does not follow the ``"MM D, YYYY"`` layout.
    """
    # Collapse whitespace runs so "09  1, 2016" or " 09 1, 2016 " still parse.
    normalized = ' '.join(string.split())
    # Parse the comma explicitly instead of blindly dropping the last
    # character of the day, which silently truncated days when the comma
    # was missing ("01 15 2016" became January 1st).
    return datetime.strptime(normalized, '%m %d, %Y')

def dataClean(data, start_date='2016-01-01'):
    """Return a cleaned copy of the raw review frame.

    Steps, in order:
    1. keep only the columns used downstream,
    2. parse ``reviewTime`` with ``convertTime``,
    3. keep rows whose ``reviewTime`` is on or after ``start_date``,
    4. rename ``asin`` to ``productID``,
    5. replace ``image`` with a boolean flag (True when a value is present).

    Parameters
    ----------
    data : pandas.DataFrame
        Raw reviews; must contain the columns listed in ``keep_columns``.
        The frame is NOT modified, so the function can be re-run on it.
    start_date : str, optional
        Inclusive lower bound for ``reviewTime``; defaults to '2016-01-01'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned columns.
    """
    keep_columns = ['overall', 'verified', 'reviewTime', 'reviewerID',
                    'asin', 'reviewText', 'summary', 'image']

    # Work on a copy of the needed columns only: the original overwrote
    # data["reviewTime"] in the caller's frame, so calling dataClean twice on
    # the same frame failed (the column no longer held strings).
    reviews = data[keep_columns].copy()

    # Series.map replaces the deprecated DataFrame.applymap.
    reviews["reviewTime"] = reviews["reviewTime"].map(convertTime)

    reviews = reviews[reviews["reviewTime"] >= start_date]
    reviews = reviews.rename(columns={"asin": "productID"})

    # The downstream data only needs to know whether a review has an image.
    reviews['image'] = reviews['image'].notna()
    return reviews

def saveJson(data, file_name):
    """Write ``data`` to ``data/<file_name>`` as a JSON array of records.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to store; its ``reviewTime`` column must be datetime-like.
        The frame itself is left unchanged.
    file_name : str
        Name of the output file inside the ``data`` directory.

    Side effects
    ------------
    Creates the ``data`` directory if it is missing, writes the file, and
    prints the path that was written.
    """
    out_dir = 'data'
    file_name = out_dir + '/' + file_name

    # The original failed when the output directory did not exist yet.
    os.makedirs(out_dir, exist_ok=True)

    # Store the datetime as string time in order to store in json.
    # assign() builds a new frame so the caller's datetime column is not
    # silently turned into strings.
    serializable = data.assign(reviewTime=data["reviewTime"].dt.strftime('%Y-%m-%d'))
    serializable.to_json(file_name, orient='records')
    print("Saving..., file name:", file_name)

In [27]:
# Clean every raw review file in `filePath` and store the result under data/.
filePath = 'original'
for file_name in os.listdir(filePath):
    # Only process JSON files; check the full extension, dot included.
    if not file_name.endswith(".json"):
        continue
    data = readJson(os.path.join(filePath, file_name))
    clean_data = dataClean(data)
    # "<Department>_5.json" -> "<Department>". Strip the "_5" suffix only when
    # it is really there, instead of blindly slicing off the last 7 characters.
    stem = os.path.splitext(file_name)[0]
    department = stem[:-2] if stem.endswith("_5") else stem
    clean_data["department"] = department
    saveJson(clean_data, department + ".json")

Start Reading file ... original/Video_Games_5.json
Saving..., file name: data/Video_Games.json
Start Reading file ... original/Toys_and_Games_5.json
Saving..., file name: data/Toys_and_Games.json
Start Reading file ... original/Luxury_Beauty_5.json
Saving..., file name: data/Luxury_Beauty.json
Start Reading file ... original/Industrial_and_Scientific_5.json
Saving..., file name: data/Industrial_and_Scientific.json
Start Reading file ... original/Software_5.json
Saving..., file name: data/Software.json
Start Reading file ... original/Patio_Lawn_and_Garden_5.json
Saving..., file name: data/Patio_Lawn_and_Garden.json
Start Reading file ... original/Pet_Supplies_5.json
Saving..., file name: data/Pet_Supplies.json
Start Reading file ... original/Musical_Instruments_5.json
Saving..., file name: data/Musical_Instruments.json
Start Reading file ... original/Office_Products_5.json
Saving..., file name: data/Office_Products.json
Start Reading file ... original/Prime_Pantry_5.json
Saving..., file