In [158]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from scipy.sparse import hstack

In [159]:
train = pd.read_csv('Train.csv')
train.head()

Unnamed: 0,rating,title,text,asin,parent_asin,timestamp,helpful_vote,verified_purchase
0,5,Unique and authentic,A very classy and chic look. This necklace is ...,B0107QYW14,B0107QYW14,1460000000000.0,2,True
1,2,Uncomfortable,The cloth bands tend to scrunch up under my wi...,B08WX159BW,B08WX159BW,1630000000000.0,0,True
2,3,Something has changed for the worse,Really loved this product the first time I bou...,B083F76L79,B083F76L79,1630000000000.0,1,True
3,5,Five Stars,beautiful,B00FBXJ11K,B00FBXJ11K,1440000000000.0,0,True
4,3,1/4 and 3/4,I was so excited to get these. I opened the b...,B00F029PWC,B00F029PWC,1520000000000.0,0,True


In [160]:
train.shape

(39986, 8)

In [161]:
train['rating'].value_counts()

rating
5    24105
4     5499
1     4297
3     3678
2     2407
Name: count, dtype: int64

In [162]:
train.isnull().sum()

rating               0
title                0
text                 0
asin                 0
parent_asin          0
timestamp            0
helpful_vote         0
verified_purchase    0
dtype: int64

In [163]:
test = pd.read_csv('Test.csv')
test.head()

Unnamed: 0,title,text,asin,parent_asin,timestamp,helpful_vote,verified_purchase,rating
0,just a brush,I didn't realize I was buying just a brush. I...,B07S29RTQZ,B07S29RTQZ,1610000000000.0,0,True,
1,Easy fast and convenient to stay clean,Great for your purse and camping! When the ki...,B07PWLXFR5,B07PWLXFR5,1600000000000.0,0,True,
2,AVOID! Simply thicker personal wipes.,"I expected a larger ""towel."" Disappointed. Th...",B01LXQTVNE,B01LXQTVNE,1620000000000.0,1,True,
3,Five Stars,Great product!,B00N11BR2A,B00N11BR2A,1520000000000.0,2,True,
4,It looks like the picture.,"I don't love it. But, I can and will wear it....",B08D7Q5J6Y,B08D7Q5J6Y,1630000000000.0,0,True,


In [164]:
test.shape

(200, 8)

In [165]:
test.isnull().sum()

title                  0
text                   0
asin                   0
parent_asin            0
timestamp              0
helpful_vote           0
verified_purchase      0
rating               200
dtype: int64

In [166]:
combined = pd.concat([train, test], axis = 0)
combined.head()

Unnamed: 0,rating,title,text,asin,parent_asin,timestamp,helpful_vote,verified_purchase
0,5.0,Unique and authentic,A very classy and chic look. This necklace is ...,B0107QYW14,B0107QYW14,1460000000000.0,2,True
1,2.0,Uncomfortable,The cloth bands tend to scrunch up under my wi...,B08WX159BW,B08WX159BW,1630000000000.0,0,True
2,3.0,Something has changed for the worse,Really loved this product the first time I bou...,B083F76L79,B083F76L79,1630000000000.0,1,True
3,5.0,Five Stars,beautiful,B00FBXJ11K,B00FBXJ11K,1440000000000.0,0,True
4,3.0,1/4 and 3/4,I was so excited to get these. I opened the b...,B00F029PWC,B00F029PWC,1520000000000.0,0,True


In [167]:
combined.shape

(40186, 8)

In [168]:
combined["timestamp"] = pd.to_datetime(combined["timestamp"], unit="ms")
combined["year"] = combined["timestamp"].dt.year
combined["month"] = combined["timestamp"].dt.month
combined["day"] = combined["timestamp"].dt.day

In [169]:
combined["asin"] = LabelEncoder().fit_transform(combined["asin"])
combined["parent_asin"] = LabelEncoder().fit_transform(combined["parent_asin"])
combined["verified_purchase"] = combined["verified_purchase"].astype(int)

In [170]:
combined["helpful_vote"] = MinMaxScaler().fit_transform(combined[["helpful_vote"]])

In [171]:
newtrain = combined.iloc[0:39986, :]
newtest = combined.iloc[39986:, :]

In [172]:
newtrain.shape

(39986, 11)

In [173]:
newtest = newtest.drop('rating', axis = 1)

In [174]:
newtest.shape

(200, 10)

In [175]:
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
tfidf_matrix = tfidf.fit_transform(newtrain["title"] + " " + newtrain["text"])

In [176]:
num_features = ["year", "month", "day", "asin", "parent_asin", "verified_purchase", "helpful_vote"]

In [177]:
X = hstack([newtrain[num_features], tfidf_matrix])
y = newtrain["rating"]

In [178]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [200]:
xgb_model = XGBRegressor( random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_val)
rmse_xgb = np.sqrt(mean_squared_error(y_val, y_pred_xgb))
print(f"XGBoost RMSE: {rmse_xgb}")

XGBoost RMSE: 0.9493889436848301


In [208]:
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
tfidf_matrix_train = tfidf.fit_transform(newtrain["title"] + " " + newtrain["text"])
tfidf_matrix_test = tfidf.transform(newtest["title"] + " " + newtest["text"])

In [210]:
X_train = hstack([newtrain[num_features], tfidf_matrix_train])
X_test = hstack([newtest[num_features], tfidf_matrix_test])
y_train = newtrain["rating"]

In [214]:
xgb_model = XGBRegressor( random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

In [216]:
solution = pd.DataFrame(y_pred_xgb, columns= ['rating'])
solution.to_csv('Solution.csv', index = False)