In [1]:
import pandas as pd
import numpy as np
import gzip
import json
import csv
from sklearn.preprocessing import LabelEncoder 

In [2]:
class AmazonSoftwareRatingMatrix:
    """Build a dense user-item rating matrix from Amazon Software reviews.

    The constructor reads a gzip-compressed JSON-lines review file, keeps the
    columns ``overall``, ``verified``, ``reviewerID`` and ``asin``, drops
    incomplete and unverified reviews, removes infrequent users and items, and
    integer-encodes the reviewer and product identifiers.
    ``createRatingMatrix`` pivots the encoded reviews into a matrix and writes
    it to a CSV file.

    Attributes set by ``__init__``:
        dataSet: filtered reviews carrying the original identifiers.
        encodedDataset: copy of ``dataSet`` whose ``reviewerID`` and ``asin``
            columns hold integer codes.
        userEncoder: ``LabelEncoder`` fitted on ``reviewerID``; use
            ``inverse_transform`` to map row indices back to reviewer IDs.
        itemEncoder: ``LabelEncoder`` fitted on ``asin``; use
            ``inverse_transform`` to map column indices back to product IDs.

    Attributes set by ``createRatingMatrix``:
        ratingMatrix: pivoted DataFrame (rows = users, columns = items).
        numberOfUsers: number of rows of the matrix.
        numberOfItems: number of columns of the matrix.
    """

    def __init__(self, dataPath='Software.json.gz', minReviews=5):
        """Load, filter and encode the review data.

        Args:
            dataPath: path of the gzip-compressed JSON-lines review file.
            minReviews: minimum number of reviews a user, and then an item,
                must have to be kept.
        """
        # Only the columns needed to build the user-item matrix.
        useCols = ['overall', 'verified', 'reviewerID', 'asin']

        records = []
        with gzip.open(dataPath) as file:
            for line in file:
                line = line.strip()
                if line:  # skip blank lines instead of failing in json.loads
                    records.append(json.loads(line))

        reviews = pd.DataFrame.from_dict(records)[useCols]

        # Drop rows with any missing field, then keep verified reviews only.
        reviews = reviews.dropna(axis=0, how='any')
        reviews = reviews[reviews['verified'].astype(bool)]

        # Remove users with fewer than `minReviews` reviews.
        userCounts = reviews['reviewerID'].value_counts()
        frequentUsers = userCounts[userCounts >= minReviews].index
        reviews = reviews[reviews['reviewerID'].isin(frequentUsers)]

        # Remove items with fewer than `minReviews` reviews. This is a single
        # pass applied after the user filter, so a remaining user may end up
        # with fewer than `minReviews` reviews.
        itemCounts = reviews['asin'].value_counts()
        frequentItems = itemCounts[itemCounts >= minReviews].index
        reviews = reviews[reviews['asin'].isin(frequentItems)]

        self.dataSet = reviews
        self.encodedDataset = reviews.copy()

        # One encoder per column, kept on the instance so that the integer
        # codes can later be translated back to the original identifiers.
        self.userEncoder = LabelEncoder()
        self.itemEncoder = LabelEncoder()
        self.encodedDataset['reviewerID'] = self.userEncoder.fit_transform(
            self.encodedDataset['reviewerID'])
        self.encodedDataset['asin'] = self.itemEncoder.fit_transform(
            self.encodedDataset['asin'])

    def createRatingMatrix(self, outputPath='Amazon_Software.csv'):
        """Pivot the encoded reviews into a rating matrix and save it as CSV.

        Exact duplicate (user, item, rating) rows are dropped first. Remaining
        repeated (user, item) pairs are combined by ``pivot_table``'s default
        aggregation (the mean), and missing ratings are filled with 0.

        Args:
            outputPath: path of the CSV file the matrix is written to.

        Returns:
            The rating matrix as a 2-D NumPy array (rows = users,
            columns = items).
        """
        self.encodedDataset = self.encodedDataset.drop_duplicates(
            ['reviewerID', 'asin', 'overall'])
        self.ratingMatrix = self.encodedDataset.pivot_table(
            index='reviewerID', columns='asin', values='overall').fillna(0)

        ratingMatrix = self.ratingMatrix.to_numpy()
        # Use .shape rather than len(ratingMatrix[0]) so that a matrix with no
        # rows does not raise IndexError.
        self.numberOfUsers, self.numberOfItems = ratingMatrix.shape

        # newline='' is required by the csv module; without it the writer's
        # '\r\n' terminators are translated again on Windows and produce
        # blank rows.
        with open(outputPath, 'w', newline='') as file:
            docWriter = csv.writer(file, delimiter=',')
            docWriter.writerows(ratingMatrix)

        return ratingMatrix

In [3]:
#test = AmazonSoftwareRatingMatrix()
#testArr = test.createRatingMatrix()
#print(testArr)