In [None]:
import pandas as pd
import os
import gzip
import json

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the Drive folder; relative paths used after this cell resolve against it.
%cd /content/drive/MyDrive/CF_Resources

/content/drive/MyDrive/CF_Resources


In [None]:
# Path to the gzip-compressed review file (one JSON object per line),
# relative to the current working directory.
file_path = "Appliances.json.gz"

In [None]:
# Parse the gzip-compressed file as JSON lines: every non-blank line holds
# one JSON object, collected in file order.
data = []

with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    stripped_lines = (raw_line.strip() for raw_line in f)
    # extend() consumes the generator lazily, appending one parsed object at a time.
    data.extend(json.loads(text) for text in stripped_lines if text)

In [None]:
# One row per parsed JSON object; the columns are the union of the objects' keys.
df = pd.DataFrame(data)

In [None]:
# Preview the first five raw reviews.
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5.0,2.0,False,"11 27, 2013",A3NHUQ33CFH3VM,1118461304,{'Format:': ' Hardcover'},Greeny,Not one thing in this book seemed an obvious o...,Clear on what leads to innovation,1385510400,
1,5.0,,False,"11 1, 2013",A3SK6VNBQDNBJE,1118461304,{'Format:': ' Kindle Edition'},Leif C. Ulstrup,I have enjoyed Dr. Alan Gregerman's weekly blo...,Becoming more innovative by opening yourself t...,1383264000,
2,5.0,,False,"10 10, 2013",A3SOFHUR27FO3K,1118461304,{'Format:': ' Hardcover'},Harry Gilbert Miller III,Alan Gregerman believes that innovation comes ...,The World from Different Perspectives,1381363200,
3,5.0,,False,"10 9, 2013",A1HOG1PYCAE157,1118461304,{'Format:': ' Hardcover'},Rebecca Ripley,"Alan Gregerman is a smart, funny, entertaining...",Strangers are Your New Best Friends,1381276800,
4,5.0,10.0,False,"09 7, 2013",A26JGAM6GZMM4V,1118461304,{'Format:': ' Hardcover'},Robert Morris,"As I began to read this book, I was again remi...","How and why it is imperative to engage, learn ...",1378512000,


In [None]:
# Drop the columns that no later cell in this notebook reads
# (reviewTime duplicates the information in unixReviewTime).
df = df.drop(columns=[ 'vote', 'style', 'image', 'reviewTime'])

In [None]:
# (rows, columns) of the frame after the column drop.
df.shape

(602777, 8)

In [None]:
# Preview again to confirm that only the retained columns are left.
df.head()

Unnamed: 0,overall,verified,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime
0,5.0,False,A3NHUQ33CFH3VM,1118461304,Greeny,Not one thing in this book seemed an obvious o...,Clear on what leads to innovation,1385510400
1,5.0,False,A3SK6VNBQDNBJE,1118461304,Leif C. Ulstrup,I have enjoyed Dr. Alan Gregerman's weekly blo...,Becoming more innovative by opening yourself t...,1383264000
2,5.0,False,A3SOFHUR27FO3K,1118461304,Harry Gilbert Miller III,Alan Gregerman believes that innovation comes ...,The World from Different Perspectives,1381363200
3,5.0,False,A1HOG1PYCAE157,1118461304,Rebecca Ripley,"Alan Gregerman is a smart, funny, entertaining...",Strangers are Your New Best Friends,1381276800
4,5.0,False,A26JGAM6GZMM4V,1118461304,Robert Morris,"As I began to read this book, I was again remi...","How and why it is imperative to engage, learn ...",1378512000


In [None]:
# Replace the raw reviewer and product identifiers with integer codes
# (pd.factorize numbers the values in order of first appearance).
df['reviewerID'] = pd.factorize(df['reviewerID'])[0]
df['asin'] = pd.factorize(df['asin'])[0]

# Drop reviews that have no text, then renumber the rows 0..n-1. Without the
# reset the index keeps gaps where rows were removed, and any code that treats
# index labels as row positions (e.g. size-based train/val/test cut-offs)
# would assign rows to the wrong split.
df = df.dropna(subset=['reviewText']).reset_index(drop=True)
df.head()

Unnamed: 0,overall,verified,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime
0,5.0,False,0,0,Greeny,Not one thing in this book seemed an obvious o...,Clear on what leads to innovation,1385510400
1,5.0,False,1,0,Leif C. Ulstrup,I have enjoyed Dr. Alan Gregerman's weekly blo...,Becoming more innovative by opening yourself t...,1383264000
2,5.0,False,2,0,Harry Gilbert Miller III,Alan Gregerman believes that innovation comes ...,The World from Different Perspectives,1381363200
3,5.0,False,3,0,Rebecca Ripley,"Alan Gregerman is a smart, funny, entertaining...",Strangers are Your New Best Friends,1381276800
4,5.0,False,4,0,Robert Morris,"As I began to read this book, I was again remi...","How and why it is imperative to engage, learn ...",1378512000


# Export vocabulary, review documents, and rating splits

In [None]:
def processAmazonReviews(reviews_df):
    """Export vocabulary, review documents and rating splits as text files.

    Six tab-separated files are written to the current working directory:

    * ``WordDict.out``      -- ``word<TAB>word_id``, one line per distinct
      whitespace-separated token, ids assigned in order of first appearance.
    * ``UserReviews.out``   -- ``reviewerID<TAB>space-separated word ids``,
      one line per row of ``reviews_df``.
    * ``ItemReviews.out``   -- ``asin<TAB>space-separated word ids``,
      one line per row of ``reviews_df``.
    * ``TrainInteraction.out`` / ``ValInteraction.out`` /
      ``TestInteraction.out`` -- ``reviewerID<TAB>asin<TAB>overall<TAB>
      unixReviewTime``; the first 80% of the rows go to train, the next 10%
      to validation and the remainder to test.

    The split follows the existing row order of ``reviews_df`` (no shuffling
    and no sorting by time) and is based on row *position*, so it is correct
    for any index, including one with gaps left by ``dropna``.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Must contain the columns ``reviewerID``, ``asin``, ``overall``,
        ``unixReviewTime`` and ``reviewText``. ``reviewerID``, ``asin`` and
        ``unixReviewTime`` must be convertible with ``int()``. Rows whose
        ``reviewText`` is not a string (e.g. NaN) are exported as empty
        documents.

    Returns
    -------
    None
        The function only writes files and prints a completion message.
    """

    def _tokens(review):
        # Non-string reviews (NaN/None) contribute no tokens, so both the
        # vocabulary pass and the document pass treat them the same way.
        return review.split() if isinstance(review, str) else []

    # UTF-8 is requested explicitly so the output does not depend on the
    # platform's default encoding (review text may contain any Unicode).
    encoding = "utf-8"

    # Create a dictionary of word to word_id
    word_id_dict = {}
    with open("WordDict.out", "w", encoding=encoding) as f:
        for review in reviews_df["reviewText"]:
            for word in _tokens(review):
                if word not in word_id_dict:
                    word_id_dict[word] = len(word_id_dict)
                    f.write(word + "\t" + str(word_id_dict[word]) + "\n")

    # Create user review document and item review document files
    with open("UserReviews.out", "w", encoding=encoding) as f_user, \
         open("ItemReviews.out", "w", encoding=encoding) as f_item:
        rows = zip(reviews_df["reviewerID"], reviews_df["asin"], reviews_df["reviewText"])
        for user_id, item_id, review in rows:
            # The encoded document is identical for both files; build it once.
            document = " ".join(str(word_id_dict[word]) for word in _tokens(review))
            f_user.write(str(user_id) + "\t" + document + "\n")
            f_item.write(str(item_id) + "\t" + document + "\n")

    # Create training, validation, and testing rating files
    total_size = len(reviews_df)
    train_size = int(total_size * 0.8)
    val_size = int(total_size * 0.1)

    with open("TrainInteraction.out", "w", encoding=encoding) as f_train, \
         open("ValInteraction.out", "w", encoding=encoding) as f_val, \
         open("TestInteraction.out", "w", encoding=encoding) as f_test:
        interactions = zip(
            reviews_df["reviewerID"],
            reviews_df["asin"],
            reviews_df["overall"],
            reviews_df["unixReviewTime"],
        )
        # enumerate() yields the row position; index labels are deliberately
        # not used because they need not run 0..n-1.
        for position, (user_id, item_id, rating, timestamp) in enumerate(interactions):
            line = (str(int(user_id)) + "\t" + str(int(item_id)) + "\t"
                    + str(rating) + "\t" + str(int(timestamp)) + "\n")
            if position < train_size:
                f_train.write(line)
            elif position < train_size + val_size:
                f_val.write(line)
            else:
                f_test.write(line)

    print("Processing Amazon reviews dataset finished.")


In [None]:
# processAmazonReviews writes its .out files with relative paths, so switch to
# /content first to have them created there.
%cd /content
processAmazonReviews(df)

/content
Processing Amazon reviews dataset finished.


In [None]:
# Run Carl.py from the current working directory. The script is not part of this
# notebook; presumably it consumes the .out files generated above -- TODO confirm.
! python Carl.py