In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/steam-20-processed-version/steam_reviews_subset_20_processed_4_25.csv
/kaggle/input/steam-processed-5-new/steam_subset_5_processed_new.csv


In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import re

# import spacy
# spacy.prefer_gpu()
# nlp = spacy.load("en_core_web_sm")

In [3]:
# cleaned and processed 20% data sample taken from steam reviews
# data = pd.read_csv("/kaggle/input/steam-20-processed-version/steam_reviews_subset_20_processed_4_25.csv")

In [4]:
# cleaned and processed 5% data sample taken from steam reviews
data = pd.read_csv("/kaggle/input/steam-processed-5-new/steam_subset_5_processed_new.csv")

In [5]:
data.head()

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes,review_processed
0,237930,Transistor,I think the main thing that surprised me about...,1,0,think main thing surprise game intigration mus...
1,270450,Robot Roller-Derby Disco Dodgeball,Cute little FPS that takes a little practice t...,1,0,cute little fps take little practice learn cat...
2,49520,Borderlands 2,This game is absolutely the best! Totally wort...,1,0,game absolutely good totally worth get cheap s...
3,211400,Deadlight,"Nope, just a nope. Clunky control, I can't sa...",0,0,nope nope clunky control frustrating play ga...
4,219640,Chivalry: Medieval Warfare,I went into Chiv expecting something amazing. ...,1,0,go chiv expect amazing exactally think go ga...


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195564 entries, 0 to 195563
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   app_id            195564 non-null  int64 
 1   app_name          195564 non-null  object
 2   review_text       195564 non-null  object
 3   review_score      195564 non-null  int64 
 4   review_votes      195564 non-null  int64 
 5   review_processed  195452 non-null  object
dtypes: int64(3), object(3)
memory usage: 9.0+ MB


In [7]:
data.isnull().sum()

app_id                0
app_name              0
review_text           0
review_score          0
review_votes          0
review_processed    112
dtype: int64

In [8]:
# data["app_name"].value_counts(normalize=True).head(20).plot(kind="bar")
# plt.xlabel("Game Name")
# plt.title("Top 20 Review Games on Steam")
# plt.show()

In [9]:
# remove rows without any data in processed review
data = data[~data["review_processed"].isnull()].copy(deep=True)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 195452 entries, 0 to 195563
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   app_id            195452 non-null  int64 
 1   app_name          195452 non-null  object
 2   review_text       195452 non-null  object
 3   review_score      195452 non-null  int64 
 4   review_votes      195452 non-null  int64 
 5   review_processed  195452 non-null  object
dtypes: int64(3), object(3)
memory usage: 10.4+ MB


## Data Processing and Modeling: Imbalanced Sampling Techniques

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# from sklearn import tree
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
# from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgbm
from sklearn.ensemble import AdaBoostClassifier
# from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

from imblearn.over_sampling import SMOTE

In [12]:
# function to show results
def show_metrics(y_test, predictions):
    print("Confusion Matrix: ")
    cmat = metrics.confusion_matrix(y_test, predictions)
    print(cmat)
    print("True Positives (TP) = ", cmat[0,0])
    print("True Negatives (TN) = ", cmat[1,1])
    print("False Positives (FP) = ", cmat[0,1])
    print("False Negatives (FN) = ", cmat[1,0])
    print("Accuracy: ")
    print(metrics.accuracy_score(y_test, predictions))
    print("Precision: ")
    print(metrics.precision_score(y_test, predictions))
    print("F1 Score: ")
    print(metrics.f1_score(y_test, predictions))
    print("Classification Report: ")
    print(metrics.classification_report(y_test, predictions))

In [13]:
X = data["review_processed"]
y = data["review_score"]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

In [15]:
pipe = Pipeline([("cvector", CountVectorizer()), 
                 ("tfidf", TfidfTransformer())
                ])

X_train_tfvec = pipe.fit_transform(X_train)

X_test_tfvec = pipe.transform(X_test)

print("Training Data Shape: ", X_train_tfvec.shape)

Training Data Shape:  (156361, 91863)


In [16]:
sm = SMOTE(random_state=42)

X_train_sm, y_train_sm = sm.fit_resample(X_train_tfvec, y_train)

## Logistic Regression

In [17]:
%%time
lr = LogisticRegression(max_iter=2000, random_state=42)
lr.fit(X_train_sm, y_train_sm)

lr_pred = lr.predict(X_test_tfvec)

print("Logistic Regression: ")
lr_train = lr.predict(X_train_sm)
print("Training Accuracy: ", metrics.accuracy_score(y_train_sm, lr_train))

show_metrics(y_test, lr_pred)

Logistic Regression: 
Training Accuracy:  0.8881159197478669
Confusion Matrix: 
[[ 5051  1527]
 [ 5091 27422]]
True Positives (TP) =  5051
True Negatives (TN) =  27422
False Positives (FP) =  1527
False Negatives (FN) =  5091
Accuracy: 
0.8307027192960016
Precision: 
0.947252063974576
F1 Score: 
0.892323712212424
Classification Report: 
              precision    recall  f1-score   support

           0       0.50      0.77      0.60      6578
           1       0.95      0.84      0.89     32513

    accuracy                           0.83     39091
   macro avg       0.72      0.81      0.75     39091
weighted avg       0.87      0.83      0.84     39091

CPU times: user 58.1 s, sys: 55.2 s, total: 1min 53s
Wall time: 29.6 s


In [18]:
%%time
# tuning with gridsearch
param_grid = {"C": [0.01, 0.1, 1]}

lr_cv = GridSearchCV(lr, param_grid, cv=5)
lr_cv.fit(X_train_sm, y_train_sm)

print("Best Params: ", lr_cv.best_params_)
print("Best Score: ", lr_cv.best_score_)

Best Params:  {'C': 1}
Best Score:  0.8684833576754555
CPU times: user 5min 58s, sys: 5min 46s, total: 11min 45s
Wall time: 3min 5s


## Decision Tree

In [19]:
%%time
dtc_clf = DecisionTreeClassifier(random_state=42)
dtc_clf.fit(X_train_sm, y_train_sm)

dtc_pred = dtc_clf.predict(X_test_tfvec)

print("Decision Tree: ")
dtc_train = dtc_clf.predict(X_train_sm)
print("Training Accuracy: ", metrics.accuracy_score(y_train_sm, dtc_train))

show_metrics(y_test, dtc_pred)

Decision Tree: 
Training Accuracy:  0.9974402336843724
Confusion Matrix: 
[[ 3335  3243]
 [ 5565 26948]]
True Positives (TP) =  3335
True Negatives (TN) =  26948
False Positives (FP) =  3243
False Negatives (FN) =  5565
Accuracy: 
0.7746795937683866
Precision: 
0.892583882614024
F1 Score: 
0.8595304924725694
Classification Report: 
              precision    recall  f1-score   support

           0       0.37      0.51      0.43      6578
           1       0.89      0.83      0.86     32513

    accuracy                           0.77     39091
   macro avg       0.63      0.67      0.65     39091
weighted avg       0.81      0.77      0.79     39091

CPU times: user 9min 49s, sys: 218 ms, total: 9min 49s
Wall time: 9min 50s


In [20]:
%%time
# tuning with gridsearch
param_grid = {"max_depth": [2, 5, 8, 12, 18], 
              "min_samples_leaf": [5, 10, 20, 35, 60], 
              "criterion": ["gini", "entropy"]}

dtc_cv = GridSearchCV(dtc_clf, param_grid, cv=5)
dtc_cv.fit(X_train_sm, y_train_sm)

print("Best Params: ", dtc_cv.best_params_)
print("Best Score: ", dtc_cv.best_score_)

Best Params:  {'criterion': 'gini', 'max_depth': 18, 'min_samples_leaf': 5}
Best Score:  0.7609193635175647
CPU times: user 2h 7min 50s, sys: 9.34 s, total: 2h 7min 59s
Wall time: 2h 8min 9s


## Random Forest

In [21]:
%%time
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train_sm, y_train_sm)

rf_pred = rf_clf.predict(X_test_tfvec)

print("Random Forest: ")
rf_train = rf_clf.predict(X_train_sm)
print("Training Accuracy: ", metrics.accuracy_score(y_train_sm, rf_train))

show_metrics(y_test, rf_pred)

Random Forest: 
Training Accuracy:  0.9974402336843724
Confusion Matrix: 
[[ 3322  3256]
 [ 2475 30038]]
True Positives (TP) =  3322
True Negatives (TN) =  30038
False Positives (FP) =  3256
False Negatives (FN) =  2475
Accuracy: 
0.8533933642014786
Precision: 
0.9022046014296871
F1 Score: 
0.9129120002431351
Classification Report: 
              precision    recall  f1-score   support

           0       0.57      0.51      0.54      6578
           1       0.90      0.92      0.91     32513

    accuracy                           0.85     39091
   macro avg       0.74      0.71      0.72     39091
weighted avg       0.85      0.85      0.85     39091

CPU times: user 57min 55s, sys: 1.63 s, total: 57min 56s
Wall time: 58min


In [22]:
# takes a long time with SMOTE
# %%time
# # tuning with gridsearch
# param_grid = {"max_depth": [2, 5, 8, 12, 18], 
#               "min_samples_leaf": [5, 10, 20, 35, 60], 
#               "n_estimators": [30, 50, 80, 150]}

# rf_cv = GridSearchCV(rf_clf, param_grid, cv=5)
# rf_cv.fit(X_train_sm, y_train_sm)

# print("Best Params: ", rf_cv.best_params_)
# print("Best Score: ", rf_cv.best_score_)

## XGBoost

In [23]:
%%time
xgb_clf = XGBClassifier(random_state=42) # default n_estimators = 100
xgb_clf.fit(X_train_sm, y_train_sm)

xgb_pred = xgb_clf.predict(X_test_tfvec)

print("XGBoost: ")
xgb_train = xgb_clf.predict(X_train_sm)
print("Training Accuracy: ", metrics.accuracy_score(y_train_sm, xgb_train))

show_metrics(y_test, xgb_pred)

XGBoost: 
Training Accuracy:  0.8676377892228457
Confusion Matrix: 
[[ 4347  2231]
 [ 4636 27877]]
True Positives (TP) =  4347
True Negatives (TN) =  27877
False Positives (FP) =  2231
False Negatives (FN) =  4636
Accuracy: 
0.8243329666675194
Precision: 
0.9259000929985386
F1 Score: 
0.8903403011769214
Classification Report: 
              precision    recall  f1-score   support

           0       0.48      0.66      0.56      6578
           1       0.93      0.86      0.89     32513

    accuracy                           0.82     39091
   macro avg       0.70      0.76      0.72     39091
weighted avg       0.85      0.82      0.83     39091

CPU times: user 3min 11s, sys: 1.58 s, total: 3min 12s
Wall time: 3min 13s


In [24]:
%%time
xgb_clf = XGBClassifier(n_estimators=200, random_state=42)
xgb_clf.fit(X_train_sm, y_train_sm)

xgb_pred = xgb_clf.predict(X_test_tfvec)

print("XGBoost 200: ")
xgb_train = xgb_clf.predict(X_train_sm)
print("Training Accuracy: ", metrics.accuracy_score(y_train_sm, xgb_train))

show_metrics(y_test, xgb_pred)

XGBoost 200: 
Training Accuracy:  0.8920670305173342
Confusion Matrix: 
[[ 4408  2170]
 [ 4019 28494]]
True Positives (TP) =  4408
True Negatives (TN) =  28494
False Positives (FP) =  2170
False Negatives (FN) =  4019
Accuracy: 
0.8416771123788084
Precision: 
0.9292329767805896
F1 Score: 
0.9020371337670355
Classification Report: 
              precision    recall  f1-score   support

           0       0.52      0.67      0.59      6578
           1       0.93      0.88      0.90     32513

    accuracy                           0.84     39091
   macro avg       0.73      0.77      0.74     39091
weighted avg       0.86      0.84      0.85     39091

CPU times: user 5min 32s, sys: 1.19 s, total: 5min 33s
Wall time: 5min 34s


## Naive Bayes

In [25]:
%%time
clf = MultinomialNB()
clf.fit(X_train_sm, y_train_sm)

nb_pred = clf.predict(X_test_tfvec)

print("Naive Bayes: ")
nb_train = clf.predict(X_train_sm)
print("Training Accuracy: ", metrics.accuracy_score(y_train_sm, nb_train))

show_metrics(y_test, nb_pred)

Naive Bayes: 
Training Accuracy:  0.85450457375663
Confusion Matrix: 
[[ 4883  1695]
 [ 4726 27787]]
True Positives (TP) =  4883
True Negatives (TN) =  27787
False Positives (FP) =  1695
False Negatives (FN) =  4726
Accuracy: 
0.8357422424599013
Precision: 
0.9425072925853063
F1 Score: 
0.8964271312202597
Classification Report: 
              precision    recall  f1-score   support

           0       0.51      0.74      0.60      6578
           1       0.94      0.85      0.90     32513

    accuracy                           0.84     39091
   macro avg       0.73      0.80      0.75     39091
weighted avg       0.87      0.84      0.85     39091

CPU times: user 336 ms, sys: 0 ns, total: 336 ms
Wall time: 334 ms


## K Nearest Neighbors

In [26]:
# %%time
# knn = KNeighborsClassifier(n_neighbors=3)
# knn.fit(X_train_tfvec, y_train)

# knn_pred = knn.predict(X_test_tfvec)

# show_metrics(y_test, knn_pred)

In [27]:
# %%time
# # tuning with gridsearch
# param_grid = {"n_neighbors" : [3, 4, 5, 6, 7, 8]}

# knn_cv = GridSearchCV(knn, param_grid, cv=5)
# knn_cv.fit(X_train_tfvec, y_train)

# print("Best Params: ", knn_cv.best_params_)
# print("Best Score: ", knn_cv.best_score_)

## LightGBM

In [28]:
# %%time
# lgb_clf = lgbm.LGBMClassifier(random_state=42)
# lgb_clf.fit(X_train_sm, y_train_sm)

# lgb_pred = lgb_clf.predict(X_test_tfvec)

# lgb_train = lgb_clf.predict(X_train_sm)
# print("Training Accuracy: ", metrics.accuracy_score(y_train_sm, lgb_train))

# show_metrics(y_test, lgb_pred)

In [29]:
# %%time
# # tuning with gridsearch
# param_grid = {"n_estimators": [50, 100, 150, 200], 
#               "learning_rate": [0.001, 0.01, 0.1, 1], 
#               "boosting_type": ["gbdt", "dart", "rf"]}

# lgb_cv = GridSearchCV(lgb_clf, param_grid, cv=5)
# lgb_cv.fit(X_train_sm, y_train_sm)

# print("Best Params: ", lgb_cv.best_params_)
# print("Best Score: ", lgb_cv.best_score_)

## AdaBoost

In [30]:
%%time
ada = AdaBoostClassifier(random_state=42) # default n_estimators = 50
ada.fit(X_train_sm, y_train_sm)

ada_pred = ada.predict(X_test_tfvec)

print("AdaBoost: ")
ada_train = ada.predict(X_train_sm)
print("Training Accuracy: ", metrics.accuracy_score(y_train_sm, ada_train))

show_metrics(y_test, ada_pred)

AdaBoost: 
Training Accuracy:  0.7805711430548082
Confusion Matrix: 
[[ 4501  2077]
 [ 8750 23763]]
True Positives (TP) =  4501
True Negatives (TN) =  23763
False Positives (FP) =  2077
False Negatives (FN) =  8750
Accuracy: 
0.7230308766723799
Precision: 
0.9196207430340557
F1 Score: 
0.814456840265282
Classification Report: 
              precision    recall  f1-score   support

           0       0.34      0.68      0.45      6578
           1       0.92      0.73      0.81     32513

    accuracy                           0.72     39091
   macro avg       0.63      0.71      0.63     39091
weighted avg       0.82      0.72      0.75     39091

CPU times: user 2min 51s, sys: 41 ms, total: 2min 51s
Wall time: 2min 52s


In [31]:
%%time
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
ada.fit(X_train_sm, y_train_sm)

ada_pred = ada.predict(X_test_tfvec)

print("AdaBoost 100: ")
ada_train = ada.predict(X_train_sm)
print("Training Accuracy: ", metrics.accuracy_score(y_train_sm, ada_train))

show_metrics(y_test, ada_pred)

AdaBoost 100: 
Training Accuracy:  0.8046698439541856
Confusion Matrix: 
[[ 4556  2022]
 [ 7382 25131]]
True Positives (TP) =  4556
True Negatives (TN) =  25131
False Positives (FP) =  2022
False Negatives (FN) =  7382
Accuracy: 
0.759433117597401
Precision: 
0.925533090266269
F1 Score: 
0.842389300439111
Classification Report: 
              precision    recall  f1-score   support

           0       0.38      0.69      0.49      6578
           1       0.93      0.77      0.84     32513

    accuracy                           0.76     39091
   macro avg       0.65      0.73      0.67     39091
weighted avg       0.83      0.76      0.78     39091

CPU times: user 5min 53s, sys: 65.1 ms, total: 5min 53s
Wall time: 5min 53s


In [32]:
%%time
ada = AdaBoostClassifier(n_estimators=150, random_state=42)
ada.fit(X_train_sm, y_train_sm)

ada_pred = ada.predict(X_test_tfvec)

print("AdaBoost 150: ")
ada_train = ada.predict(X_train_sm)
print("Training Accuracy: ", metrics.accuracy_score(y_train_sm, ada_train))

show_metrics(y_test, ada_pred)

AdaBoost 150: 
Training Accuracy:  0.8175724498424168
Confusion Matrix: 
[[ 4656  1922]
 [ 6778 25735]]
True Positives (TP) =  4656
True Negatives (TN) =  25735
False Positives (FP) =  1922
False Negatives (FN) =  6778
Accuracy: 
0.7774423780409813
Precision: 
0.9305058393896662
F1 Score: 
0.8554096725943161
Classification Report: 
              precision    recall  f1-score   support

           0       0.41      0.71      0.52      6578
           1       0.93      0.79      0.86     32513

    accuracy                           0.78     39091
   macro avg       0.67      0.75      0.69     39091
weighted avg       0.84      0.78      0.80     39091

CPU times: user 8min 48s, sys: 89 ms, total: 8min 48s
Wall time: 8min 49s


In [33]:
%%time
ada = AdaBoostClassifier(n_estimators=250, random_state=42)
ada.fit(X_train_sm, y_train_sm)

ada_pred = ada.predict(X_test_tfvec)

print("AdaBoost 250: ")
ada_train = ada.predict(X_train_sm)
print("Training Accuracy: ", metrics.accuracy_score(y_train_sm, ada_train))

show_metrics(y_test, ada_pred)

AdaBoost 250: 
Training Accuracy:  0.8332500576523945
Confusion Matrix: 
[[ 4630  1948]
 [ 5963 26550]]
True Positives (TP) =  4630
True Negatives (TN) =  26550
False Positives (FP) =  1948
False Negatives (FN) =  5963
Accuracy: 
0.7976260520324371
Precision: 
0.9316443259176083
F1 Score: 
0.8703348576486208
Classification Report: 
              precision    recall  f1-score   support

           0       0.44      0.70      0.54      6578
           1       0.93      0.82      0.87     32513

    accuracy                           0.80     39091
   macro avg       0.68      0.76      0.70     39091
weighted avg       0.85      0.80      0.81     39091

CPU times: user 14min 20s, sys: 186 ms, total: 14min 21s
Wall time: 14min 22s


In [34]:
# %%time
# # tuning with gridsearch
# param_grid = {"n_estimators" : [50, 70, 90, 110, 140, 180], 
#               "learning_rate" : [0.001, 0.01, 0.1, 1, 10]}

# ada_cv = GridSearchCV(ada, param_grid, cv=5)
# ada_cv.fit(X_train_sm, y_train_sm)

# print("Best Params: ", ada_cv.best_params_)
# print("Best Score: ", ada_cv.best_score_)

In [35]:
# %%time
# # use svc as a base learner
# svc = SVC(probability=True, kernel="linear", random_state=42)

# ada_svc = AdaBoostClassifier(base_estimator=svc, random_state=42)
# ada_svc.fit(X_train_tfvec, y_train)

# ada_svc_pred = ada_svc.predict(X_test_tfvec)

# show_metrics(y_test, ada_svc_pred)