In [10]:
import gzip
import pandas as pd
from collections import defaultdict

def readGz(f):
    for l in gzip.open(f):
        yield eval(l)

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

train_df = getDF('train.json.gz')
test_df = getDF('test_Helpful.json.gz')

In [11]:
train_df.head(5)

Unnamed: 0,categoryID,categories,itemID,reviewerID,rating,reviewText,reviewHash,reviewTime,summary,unixReviewTime,helpful,price
0,0,"[[Clothing, Shoes & Jewelry, Women], [Clothing...",I655355328,U745881038,3.0,"These are cute, but they are a little small. ...",R115160670,"05 20, 2014",Cute,1400544000,"{'outOf': 0, 'nHelpful': 0}",
1,0,"[[Clothing, Shoes & Jewelry, Women, Clothing, ...",I241092314,U023577405,4.0,"I love the look of this bra, it is what I want...",R800651687,"02 7, 2013",Beautiful but size runs small,1360195200,"{'outOf': 0, 'nHelpful': 0}",
2,0,"[[Clothing, Shoes & Jewelry, Wedding Party Gif...",I408260822,U441384838,3.0,it's better on a man's hand.I didn't find it v...,R345042616,"05 13, 2014",Good price but...,1399939200,"{'outOf': 2, 'nHelpful': 2}",19.99
3,0,"[[Clothing, Shoes & Jewelry, Women, Clothing, ...",I770448753,U654041297,4.0,Comfortable and easy to wear for a day of shop...,R875466866,"05 25, 2014","Easy, breezy",1400976000,"{'outOf': 0, 'nHelpful': 0}",14.95
4,0,"[[Clothing, Shoes & Jewelry, Women, Plus-Size,...",I919238161,U096604734,5.0,I'm quite small and the XS fits me like a regu...,R317526520,"07 30, 2013",Great shirt,1375142400,"{'outOf': 1, 'nHelpful': 1}",


In [12]:
import itertools

train_df['categories'] = train_df.categories.apply(lambda x: list(itertools.chain(*x))).apply(lambda x: [each.split(',') for each in x]).apply(lambda x: list(set(itertools.chain(*x))))
test_df['categories'] = test_df.categories.apply(lambda x: list(itertools.chain(*x))).apply(lambda x: [each.split(',') for each in x]).apply(lambda x: list(set(itertools.chain(*x))))

In [4]:
categories_fit = train_df.categories.to_list()
test_categories_fit = test_df.categories.to_list()

In [5]:
from sklearn import preprocessing
lb = preprocessing.MultiLabelBinarizer()

lb.fit(categories_fit)
train_cats = pd.DataFrame(lb.transform(categories_fit),columns = lb.classes_)
test_cats = pd.DataFrame(lb.transform(test_categories_fit),columns = lb.classes_)

In [6]:
train_df.drop('categories',axis = 1,inplace = True)
test_df.drop('categories',axis = 1, inplace = True)

In [7]:
train = pd.concat([train_df,train_cats],axis = 1)
test = pd.concat([test_df,test_cats],axis = 1)

# Uncomment to run without categories
# train = train_df
# test = test_df

In [53]:
helpfulNess_train = pd.DataFrame(train_df.helpful.apply(lambda x: [x['outOf'],x['nHelpful']]).to_list(),columns = ['outOf','nHelpful'])

helpfulNess_test = pd.DataFrame(test_df.helpful.apply(lambda x: [x['outOf']]).to_list(),columns = ['outOf'])

In [9]:
train = pd.concat([train,helpfulNess_train],axis = 1)
test = pd.concat([test,helpfulNess_test],axis = 1)

In [10]:
train.drop(['helpful','reviewHash','reviewText','price'],axis = 1,inplace = True)
test.drop(['helpful','reviewHash','reviewText','price'],axis = 1,inplace = True)

In [11]:
month = train.reviewTime.apply(lambda x: [each.replace(',','') for each in str(x).split(' ')]).apply(lambda x:x[0])
day = train.reviewTime.apply(lambda x: [each.replace(',','') for each in str(x).split(' ')]).apply(lambda x:x[1])
year = train.reviewTime.apply(lambda x: [each.replace(',','') for each in str(x).split(' ')]).apply(lambda x:x[2])

month_test = test.reviewTime.apply(lambda x: [each.replace(',','') for each in str(x).split(' ')]).apply(lambda x:x[0])
day_test = test.reviewTime.apply(lambda x: [each.replace(',','') for each in str(x).split(' ')]).apply(lambda x:x[1])
year_test = test.reviewTime.apply(lambda x: [each.replace(',','') for each in str(x).split(' ')]).apply(lambda x:x[2])

In [12]:
train.drop(['reviewTime'],axis = 1,inplace = True)
test.drop(['reviewTime'],axis = 1,inplace = True)
train.drop(['unixReviewTime'],axis = 1,inplace = True)
test.drop(['unixReviewTime'],axis = 1,inplace = True)

train['day'] = day.astype('int')
train['month'] = month.astype('int')
train['year'] = year.astype('int')

test['day'] = day_test.astype('int')
test['month'] = month_test.astype('int')
test['year'] = year_test.astype('int')

### Adding a feature for how many words there are in a Review

In [13]:
import nltk
nltk.download('punkt')
train['textLength'] = train_df.reviewText.apply(lambda x:len(nltk.word_tokenize(x)))
test['textLength'] = test_df.reviewText.apply(lambda x:len(nltk.word_tokenize(x)))

## Summary length does not work as well

#train['summaryLength'] = train_df.summary.apply(lambda x:len(x.split(' ')))
#test['summaryLength'] = test_df.summary.apply(lambda x:len(x.split(' ')))

[nltk_data] Downloading package punkt to /home/akash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
train.drop(['summary'],axis = 1,inplace = True)
test.drop(['summary'], axis = 1,inplace = True)

In [None]:
train_df.reviewText

## Items will never be new, but reviewers will be.

In [15]:
from sklearn.preprocessing import LabelEncoder
l_item = LabelEncoder()
l_item.fit(train.itemID)

LabelEncoder()

In [16]:
train['itemID'] = l_item.transform(train.itemID)

In [17]:
test['itemID'] = l_item.transform(test.itemID)

In [18]:
l_review = LabelEncoder()
l_review.fit(train.reviewerID)

train['reviewerID'] = l_review.transform(train.reviewerID)
#test['reviewerID'] = l_review.transform(test.reviewerID)

In [19]:
import numpy as np
l_review.classes_ = np.append(l_review.classes_,'Unknown')

In [20]:
unknowns = set(test.reviewerID).difference(l_review.classes_)
test['reviewerID'] = test.reviewerID.apply(lambda x: 'Unknown' if x in unknowns else x)
test['reviewerID'] = l_review.transform(test.reviewerID)

### Tfidf vectorizer analysis

In [42]:
from nltk.corpus import stopwords

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

# Taking only max 5000 features into account. 
count_vectorizer = CountVectorizer(stop_words=stopwords.words('english'),lowercase=True,max_features=5000)
train_summary = count_vectorizer.fit_transform(train_df.summary)
test_summary = count_vectorizer.transform(test_df.summary)

### No need to scale Random Forests

In [23]:
# from sklearn.preprocessing import StandardScaler

# scalers = {}
# for i in range(train.shape[1]):
#     colname = train.columns[i]is
#     scalers[colname] = StandardScaler()
#     train[colname] = scalers[colname].fit_transform(train[colname].to_numpy().reshape(-1,1))
    
# for i in range(test.shape[1]):
#     colname = test.columns[i]
#     test[colname] = scalers[colname].transform(test[colname].to_numpy().reshape(-1,1)) 

In [24]:
help_ = train['nHelpful']
train = train.drop('nHelpful',axis = 1)

In [25]:
import scipy
from scipy.sparse import hstack

train = scipy.sparse.csr_matrix(train.values)
test = scipy.sparse.csr_matrix(test.values)

# Comment to remove summary
train = hstack((train,train_summary))
test = hstack((test,test_summary))

In [26]:
from sklearn.model_selection import train_test_split

train_csr_new,val_new, train_y , val_y = train_test_split(train,help_,test_size = 0.2,random_state = 42)

In [27]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(verbose = 1)
rf.fit(train_csr_new,train_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.9min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=1, warm_start=False)

In [30]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(val_y,smooth_predictions(rf.predict(val_new)))
#mean_absolute_error(scalers['nHelpful'].inverse_transform(val_y),smooth_predictions(scalers['nHelpful'].inverse_transform(rf.predict(val_new))))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


0.179975

In [29]:
def smooth_predictions(preds):
    import numpy as np
    for i in range(0,preds.shape[0]):
        x = preds[i]
        if x % 1 > 0.5:
            preds[i] = np.ceil(x) 
        else: 
            preds[i] = np.floor(x)
    return preds

In [31]:
def make_predictions(preds,message):
    outof = pd.DataFrame(test_df.helpful.apply(lambda x: [x['outOf']]).to_list(),columns = ['outOf'])['outOf']
    predictions = pd.DataFrame([test_df.reviewerID,test_df.itemID,outof,preds]).T
    predictions.rename(columns = {'reviewerID':0,'itemID':1,'Unnamed 0':2,'Unnamed 1':3},inplace = True)
    predictions[0] = predictions[0] + '-' + predictions[1] + '-'
    predictions[0] = predictions[0].apply(lambda x: x.strip())
    predictions[0] = predictions[0] + predictions['outOf'].apply(lambda x: str(int(x)).strip())
    predictions.drop([1,'outOf'],axis = 1 ,inplace = True)
    predictions.rename(columns = {0:'userID-itemID-outOf',2:'prediction'},inplace = True)
    predictions.to_csv('predictions_first.csv',sep= ',',index = False)
    !kaggle competitions submit -c dse220 -f predictions_first.csv -m 'Simplest Random Forsts ,without price ,category and anything'

In [55]:
from keras.datasets import imdb

Using TensorFlow backend.


In [54]:
# Re-training with entire training data
rf = RandomForestRegressor(n_estimators = 100 ,verbose = 1)
rf.fit(train,help_)

KeyboardInterrupt: 

In [None]:
preds = smooth_predictions(rf.predict(test))

In [None]:
make_predictions(preds,'nothing')

In [45]:
!kaggle competitions submit -c dse220 -f predictions_latest.csv -m 'ngrams with n from 3 to 4!'

100%|████████████████████████████████████████| 384k/384k [00:10<00:00, 37.5kB/s]
Successfully submitted to DSE 220 - Project