In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score


In [2]:
review_data = pd.read_csv('../data/all_reviews_without_unicode.csv')

  review_data = pd.read_csv('../data/all_reviews_without_unicode.csv')


In [32]:
review_data.head()

Unnamed: 0,productID,prodSiteID,review_header,reviewer_name,review_content,review_star_rating,review_helpful_votes,verified_purchase,review_subjectivity,review_polarity,...,url,review_lang,site,site_outlier,site_outlier_en,global_outlier,global_outlier_en,helpful_votes_scaled,review_length_scaled,verified_purchase_num
0,1,Amazon1,It is renewed but looked like new,Andre,"It was packaged very well, Amazon packaging, n...",5,1,True,0.588232,0.258266,...,,en,Amazon,0,0,0,0,0.00157,0.041273,1
1,1,Amazon1,Worth it,Alice,"The phone came in perfect conditions, without ...",5,2,True,0.430909,0.447273,...,,en,Amazon,0,0,0,0,0.00314,0.019776,1
2,1,Amazon1,Amazon Warehouse for the Win!,Mrs. K,I got a great deal on this at Amazon Warehouse...,5,2,True,0.554959,0.394215,...,,en,Amazon,0,0,0,0,0.00314,0.05589,1
3,1,Amazon1,Awesome,David Chaves Rojas,"Got my phone, 10/10. No scratches, still have ...",5,0,True,1.0,1.0,...,,en,Amazon,2,2,0,0,0.0,0.011178,1
4,1,Amazon1,Like new iPhone 14,Rick,The iPhone 14 came with a battery that had 100...,5,0,True,0.636364,0.424242,...,,en,Amazon,0,0,0,0,0.0,0.025795,1


In [3]:
review_data.dtypes

productID                 int64
prodSiteID               object
review_header            object
reviewer_name            object
review_content           object
review_star_rating        int64
review_helpful_votes      int64
verified_purchase          bool
review_subjectivity     float64
review_polarity         float64
review_length           float64
url                      object
review_lang              object
site                     object
site_outlier              int64
site_outlier_en           int64
global_outlier            int64
global_outlier_en         int64
dtype: object

In [10]:
scaler = MinMaxScaler()
review_data['helpful_votes_scaled'] = scaler.fit_transform(review_data[['review_helpful_votes']])

review_data['review_length_scaled'] = scaler.fit_transform(review_data[['review_length']])

for i in review_data['verified_purchase']:

    if i == True:
        review_data['verified_purchase_num'] = 1
    else:
        review_data['verified_purchase_num'] = 0


In [11]:
review_data.shape

(48081, 21)

### Approach 1: Splitting the dataset into reviews of common products on all sites for training and testing on review of products unique to a particular site

In [12]:
BB = review_data[review_data['site']=='BestBuy']
AMZ = review_data[review_data['site']=='Amazon']
TGT = review_data[review_data['site']=='Target']

In [13]:
BB = BB['productID'].unique().tolist()
AMZ = AMZ['productID'].unique().tolist()
TGT = TGT['productID'].unique().tolist()

In [14]:
all = set(BB).union(set(AMZ),set(TGT))

In [15]:
common = set(BB).intersection(set(AMZ), set(TGT))
lst_common = sorted(list(common))
lst_uncommon = sorted(list(all - common))

In [16]:
lst_common

[1, 2, 3, 5, 6, 8, 9, 11, 12, 13, 14, 15]

In [17]:
lst_uncommon

[4, 7, 10, 16, 17, 18, 19, 20, 21, 22, 23, 24, 40, 41]

In [25]:
df_train_lst = []
for i in lst_common:
    filtered_df_train = review_data[review_data['productID'] == i]
    df_train_lst.append(filtered_df_train)

df_train = pd.concat(df_train_lst)
df_train = df_train[['review_star_rating','verified_purchase_num','review_subjectivity','review_polarity','review_length_scaled', 'helpful_votes_scaled']]
df_train.dropna(inplace=True)

In [26]:
df_train.head()

Unnamed: 0,review_star_rating,verified_purchase_num,review_subjectivity,review_polarity,review_length_scaled,helpful_votes_scaled
0,5,1,0.588232,0.258266,0.041273,0.00157
1,5,1,0.430909,0.447273,0.019776,0.00314
2,5,1,0.554959,0.394215,0.05589,0.00314
3,5,1,1.0,1.0,0.011178,0.0
4,5,1,0.636364,0.424242,0.025795,0.0


In [27]:
df_train.shape

(35191, 6)

In [28]:
df_test_lst = []
for i in lst_uncommon:
    filtered_df_test = review_data[review_data['productID'] == i]
    df_test_lst.append(filtered_df_test)

df_test = pd.concat(df_test_lst)
df_test = df_test[['review_star_rating','verified_purchase_num','review_subjectivity','review_polarity','review_length_scaled', 'helpful_votes_scaled']]
df_test.dropna(inplace=True)

In [29]:
df_test.shape

(12888, 6)

In [30]:
X_train = df_train[['review_star_rating','verified_purchase_num','review_subjectivity','review_polarity','review_length_scaled']]
y_train = df_train[['helpful_votes_scaled']]

X_test = df_test[['review_star_rating','verified_purchase_num','review_subjectivity','review_polarity','review_length_scaled']]
y_test = df_test[['helpful_votes_scaled']]


In [31]:
# Step 4: Model Selection and Training
model = GradientBoostingRegressor()
model.fit(X_train, y_train)

# Step 6: Evaluate the Model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2 = r2_score(y_test, y_pred)
print("R2 score is:", r2)

  y = column_or_1d(y, warn=True)


Mean Squared Error: 7.433756243718388e-05
R2 score is: -2.2828029702008705


### Aprroach 2: Using the whole dataset, just shuffling the rows

In [245]:
review_data_reduced = review_data[['review_star_rating','verified_purchase_num','review_subjectivity','review_polarity','review_length', 'helpful_votes_scaled']]
review_data_reduced.dropna(inplace=True)
review_data_reduced.sample(frac=1)
review_data_reduced.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_data_reduced.dropna(inplace=True)


(48079, 6)

In [248]:
X, y= review_data_reduced[['review_star_rating','verified_purchase_num','review_subjectivity','review_polarity','review_length']], review_data_reduced[['helpful_votes_scaled']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [249]:
model = GradientBoostingRegressor()
model.fit(X_train, y_train)

# Step 6: Evaluate the Model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2 = r2_score(y_test, y_pred)
print("R2 score is:", r2)

  y = column_or_1d(y, warn=True)


Mean Squared Error: 0.00013888365030410312
R2 score is: -2.0641634935510216


### Approach: 3


In [34]:
X, y= df_train.drop(['helpful_votes_scaled'], axis=1), df_train['helpful_votes_scaled']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
model = GradientBoostingRegressor()
model.fit(X_train, y_train)

# Step 6: Evaluate the Model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2 = r2_score(y_test, y_pred)
print("R2 score is:", r2)

Mean Squared Error: 0.00023469688874923033
R2 score is: -0.2005035968195208
