In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [None]:
train_df = pd.read_csv('train_df.csv', index_col=0)
train_df.head()

In [None]:
test_df = pd.read_csv('test_df.csv', index_col=0)
test_df.head()

In [None]:
train_x, train_y = train_df.drop('star_rating_x', axis=1), train_df['star_rating_x']
test_x, test_y = test_df.drop('star_rating_x', axis=1), test_df['star_rating_x']

In [None]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(train_x, train_y)

In [None]:
rf.score(test_x, test_y)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train_x, train_y)

In [None]:
knn.score(test_x, test_y)

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(train_x, train_y)

In [None]:
gnb.score(train_x, train_y)

In [2]:
from keras import models
from keras import layers
from keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [None]:
np.shape(train_x)

In [None]:
network = models.Sequential()
network.add(layers.Dense(12, activation='relu', input_shape=(16,)))
#network.add(layers.Dense(128, activation='relu', input_shape=(256,)))
#network.add(layers.Dense(64, activation='relu', input_shape=(128,)))
#network.add(layers.Dense(32, activation='relu', input_shape=(64,)))
network.add(layers.Dense(6, activation='softmax', input_shape=(16,)))
network.summary()

In [None]:
train_y_cat = to_categorical(train_y)
test_y_cat = to_categorical(test_y)

In [None]:
network.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

epochs = 50
history = network.fit(train_x, 
                      train_y_cat, 
                      epochs=epochs, 
                      batch_size=128, 
                      validation_data=(test_x, test_y_cat))

In [None]:
history_dict = history.history
loss_values = history_dict['loss']
test_loss_values = history_dict['val_loss']
epochs_range = range(1, epochs + 1)
plt.plot(epochs_range, loss_values, 'bo', label='Training loss')
plt.plot(epochs_range, test_loss_values, 'ro', label='Test loss')
plt.title('Training and test loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
acc_values = history_dict['accuracy']
test_acc_values = history_dict['val_accuracy']
plt.plot(epochs_range, acc_values, 'bo', label='Training accuracy')
plt.plot(epochs_range, test_acc_values, 'ro', label='Test accuracy')
plt.title('Training and test accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [4]:
svd_train_df = pd.read_csv('train_raw.csv', index_col=0)
svd_train_df = pd.concat([svd_train_df, pd.read_csv('train_raw_max.csv', index_col=0)])
svd_test_df = pd.read_csv('test_raw.csv', index_col=0)
svd_test_df = pd.concat([svd_train_df, pd.read_csv('test_raw_max.csv', index_col=0)])

svd_train_df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
267779,US,3387522,R2JEMKCQT9QTVM,B008SBZD5U,150118808,Kingdom Hearts HD 1.5 Remix,Video Games,4,0,0,N,Y,KH!>% RULE!,Its A pretty good game and it came on time but...,2015-02-01
267796,US,15137367,R3IM5TXYMA7DCQ,B00IXMF5CU,292308774,Terraria - Xbox 360,Video Games,5,1,2,N,Y,Five Stars,Excellent,2015-02-01
267818,US,15137367,R2PFXXB4EP23B,B00EM5UFEK,461081395,Plants vs. Zombies Garden Warfare,Video Games,4,0,0,N,Y,Four Stars,Excellent,2015-02-01
267871,US,4354770,R1A9DB178PFXLH,B004OPYLTS,245894499,Fishdom - Nintendo DS,Video Games,5,0,0,N,Y,Fun Game,"Fun game, levels get harder as you progress.",2015-02-01
267892,US,6698937,R1RS2GAXENP9LF,B00BGA9X9W,943154724,DualShock 4 Wireless Controller for PlayStatio...,Video Games,4,0,0,N,Y,Four Stars,Great controller,2015-02-01


In [39]:
svd_test_df.loc[pd.isnull(svd_test_df.product_parent)]

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date


In [29]:
from surprise import Dataset
from surprise import Reader

svd_train_df[['customer_id', 'product_parent', 'star_rating', 'review_date']].to_csv('svd_train_df.csv')
svd_test_df[['customer_id', 'product_parent', 'star_rating', 'review_date']].to_csv('svd_test_df.csv')

In [40]:
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_folds([('svd_train_df.csv', 'svd_test_df.csv')], reader=reader)

In [42]:
from surprise import accuracy
from surprise import SVD
from surprise.model_selection import PredefinedKFold

pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    mean_rmse = accuracy.rmse(predictions, verbose=True)
    mean_mae = accuracy.mae(predictions, verbose=True)

RMSE: 574860256.0221
MAE:  497298730.2990


In [47]:
from surprise.model_selection import cross_validate
data2 = Dataset.load_from_file('svd_train_df.csv', reader=reader)
cross_validate(SVD(), data2, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    574621501.2857574794186.3784575082686.8433574964339.3232574688931.8325574830329.1326171202.6344
MAE (testset)     496990258.9794496982335.8266497732051.8351497450024.9884497392032.2987497309340.7856287770.7438
Fit time          29.10   28.96   29.41   29.00   29.48   29.19   0.21    
Test time         0.65    1.27    1.32    0.63    1.29    1.04    0.32    


{'test_rmse': array([5.74621501e+08, 5.74794186e+08, 5.75082687e+08, 5.74964339e+08,
        5.74688932e+08]),
 'test_mae': array([4.96990259e+08, 4.96982336e+08, 4.97732052e+08, 4.97450025e+08,
        4.97392032e+08]),
 'fit_time': (29.097650051116943,
  28.964919567108154,
  29.41121006011963,
  29.003693342208862,
  29.483509063720703),
 'test_time': (0.6547908782958984,
  1.2748396396636963,
  1.322972297668457,
  0.6313126087188721,
  1.2920618057250977)}

In [23]:
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

RMSE: 0.7747
MAE:  0.5614
