In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string
import nltk
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

sns.set_style(style='whitegrid')

pd.options.display.max_colwidth = None

**Read in cleaned data**

In [2]:
df_train = pd.read_csv('/content/drive/MyDrive/CapstoneData/amazon_review_polarity_csv/train_clean_20000.csv')
df_test = pd.read_csv('/content/drive/MyDrive/CapstoneData/amazon_review_polarity_csv/test_clean_4000.csv')

df_train.head()

Unnamed: 0,label,title+review_clean
0,1,stun even nongamer sound track beautiful paint senery mind well would recomend even people hate vid game music play game chrono cross game ever play best music back away crude keyboarding take fresher step grate guitars soulful orchestras would impress anyone care listen _
1,1,best soundtrack ever anything read lot review say best game soundtrack figure id write review disagree bite opinino yasunori mitsudas ultimate masterpiece music timeless listen years beauty simply refuse fadethe price tag pretty stagger must say go buy cd much money one feel would worth every penny
2,1,amaze soundtrack favorite music time hand intense sadness prisoners fate mean play game hope distant promise girl steal star important inspiration personally throughout teen years higher energy track like chrono cross time scar time dreamwatch chronomantique indefinably remeniscent chrono trigger absolutely superb wellthis soundtrack amaze music probably best composers work not hear xenogears soundtrack cannot say sure even never play game would worth twice price buy iti wish could give star
3,1,excellent soundtrack truly like soundtrack enjoy video game music play game music enjoy truly relax peacefulon disk one favorites scar time life death forest illusion fortress ancient dragons lose fragment drown valleydisk two draggons galdorb home chronomantique prisoners fate gale girlfriend like zelbessdisk three best three garden god chronopolis fat jellyfish sea burn orphange dragons prayer tower star dragon god radical dreamers unstealable jeweloverall excellent soundtrack bring like video game musicxander cross
4,1,remember pull jaw floor hear play game know divine music every single song tell story game good greatest songs without doubt chrono cross time scar magical dreamers wind star sea radical dreamers unstolen jewel translation vary music perfect ask best yasunori mitsuda pour heart write paper


**Train and test set**

In [3]:
df_train.shape, df_test.shape

((20000, 2), (4000, 2))

In [4]:
X_train, y_train = df_train['title+review_clean'].fillna(' '), df_train['label'] 
X_test, y_test = df_test['title+review_clean'].fillna(' '), df_test['label'] 

**Simple first model**

In [5]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=100)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)
print('Shape of train and test set:')
tfidf_train.shape, tfidf_test.shape

Shape of train and test set:


((20000, 100), (4000, 100))

In [None]:
print(X_train.__str__)

<method-wrapper '__str__' of Series object at 0x7f6f702effd0>


In [None]:
tfidf_train

<20000x100 sparse matrix of type '<class 'numpy.float64'>'
	with 199261 stored elements in Compressed Sparse Row format>

In [None]:

first_model = LogisticRegression(random_state=0).fit(tfidf_train, y_train)

print('Accuracy score on training set:')
score = first_model.score(tfidf_train, y_train)
print(score)
predicted = first_model.predict(tfidf_train)
print(np.mean(predicted == y_train))
print('Accuracy score on test set:')
score = first_model.score(tfidf_test, y_test)
print(score)

Accuracy score on training set:
0.77235
0.77235
Accuracy score on test set:
0.6495


In [None]:
lr = LogisticRegression(random_state=0, max_iter=200)
params = {'C':  [0.001, 0.1, 1, 10, 100]}
model2 = GridSearchCV(lr, param_grid=params, cv=5, scoring="accuracy")
model2.fit(tfidf_train, df_train['label'])
print('Best estimator and parameters:')
print(model2.best_estimator_)
print(model2.best_params_)
print('Accuracy score on training set:')
print(model2.best_score_)
print('Accuracy score on test set:')
print(accuracy_score(model2.predict(tfidf_test),y_test))
print('More detailed results:')
model2.cv_results_

Best estimator and parameters:
LogisticRegression(C=10, max_iter=200, random_state=0)
{'C': 10}
Accuracy score on training set:
0.7676499999999999
Accuracy score on test set:
0.64825
More detailed results:


{'mean_fit_time': array([0.06228089, 0.09102287, 0.1237185 , 0.26963015, 0.18254662]),
 'std_fit_time': array([0.02780819, 0.02112007, 0.0114635 , 0.11788397, 0.07814168]),
 'mean_score_time': array([0.00907674, 0.00253177, 0.00258503, 0.0104682 , 0.00416818]),
 'std_score_time': array([0.01065912, 0.00024239, 0.00021464, 0.01190597, 0.00273756]),
 'param_C': masked_array(data=[0.001, 0.1, 1, 10, 100],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.001}, {'C': 0.1}, {'C': 1}, {'C': 10}, {'C': 100}],
 'split0_test_score': array([0.6925 , 0.76375, 0.76825, 0.7705 , 0.7705 ]),
 'split1_test_score': array([0.7175, 0.7665, 0.7685, 0.7695, 0.769 ]),
 'split2_test_score': array([0.71475, 0.7645 , 0.767  , 0.7665 , 0.76625]),
 'split3_test_score': array([0.70175, 0.7685 , 0.77275, 0.77325, 0.77325]),
 'split4_test_score': array([0.67425, 0.74975, 0.75775, 0.7585 , 0.7585 ]),
 'mean_test_score': array([0.70015, 0.7

**Pipeline of Tf-idf Vectorizer and Logistic regression**

In [None]:
clf_tfidf_logreg = Pipeline([
  ('tfidf', TfidfVectorizer()),
  ('logreg', LogisticRegression(random_state=0)),])

In [None]:
clf_tfidf_logreg.fit(X_train, y_train)
predicted = clf_tfidf_logreg.predict(X_test)
print(np.mean(predicted == y_test))
accuracy_score(clf_tfidf_logreg.predict(X_test),y_test)

0.871


0.871

In [None]:
print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))

              precision    recall  f1-score   support

           0       0.88      0.85      0.87      1951
           1       0.86      0.89      0.88      2049

    accuracy                           0.87      4000
   macro avg       0.87      0.87      0.87      4000
weighted avg       0.87      0.87      0.87      4000

[[1663  288]
 [ 228 1821]]


**Grid search for tf-idf and logistic regression**

In [None]:
clf_tfidf_logreg = Pipeline([
  ('tfidf', TfidfVectorizer(sublinear_tf=True)),
  ('logreg', LogisticRegression(random_state=0, max_iter=250))])

params = {'tfidf__ngram_range': [(1, 2), (1,3)],
          'tfidf__max_features': [20000, 50000],
          'tfidf__use_idf': (True, False),
          'tfidf__min_df': [0.001, 0.01],
          'logreg__C': [0.1, 1, 10, 100]}

model_tfidf_logreg = GridSearchCV(clf_tfidf_logreg, param_grid=params, cv=5, scoring="accuracy", 
                                  n_jobs=-1, verbose = 3)

model_tfidf_logreg.fit(X_train, y_train)
print('Best estimator and parameters:')
print(model_tfidf_logreg.best_estimator_)
#print(model_tfidf_logreg.best_params_)
print('Accuracy score on training set:')
print(model_tfidf_logreg.best_score_)
print('Accuracy score on test set:')
print(accuracy_score(model_tfidf_logreg.predict(X_test),y_test))

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best estimator and parameters:
Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=5000, min_df=0.001,
                                 ngram_range=(1, 3), sublinear_tf=True)),
                ('logreg',
                 LogisticRegression(C=1, max_iter=250, random_state=0))])
Accuracy score on training set:
0.8711500000000001
Accuracy score on test set:
0.8875


**Random Search and logistic regression**

In [None]:
clf_tfidf_logreg = Pipeline([
  ('tfidf', TfidfVectorizer(sublinear_tf=True)),
  ('logreg', LogisticRegression(random_state=0, max_iter=250, warm_start=True))])

params = {'tfidf__ngram_range': [(1, 2), (1,3), (2,3)],
          'tfidf__max_features': [20000, 30000, 40000],
          'tfidf__use_idf': (True, False),
          'tfidf__min_df': [0.001, 0.01],
          'logreg__C': [0.1, 1, 10, 100],
          'logreg__solver': ('lbfgs', 'liblinear', 'newton-cg', 
                             'newton-cholesky', 'sag', 'saga')}

model_tfidf_logreg = RandomizedSearchCV(clf_tfidf_logreg, param_distributions=params, cv=4, scoring="accuracy", 
                                  n_jobs=-1, verbose = 3, n_iter=40)

model_tfidf_logreg.fit(X_train, y_train)
print('Best estimator and parameters:')
print(model_tfidf_logreg.best_estimator_)
#print(model_tfidf_logreg.best_params_)
print('Accuracy score on training set:')
print(model_tfidf_logreg.best_score_)
print('Accuracy score on test set:')
print(accuracy_score(model_tfidf_logreg.predict(X_test),y_test))

Fitting 4 folds for each of 40 candidates, totalling 160 fits
Best estimator and parameters:
Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=20000, min_df=0.001,
                                 ngram_range=(1, 2), sublinear_tf=True)),
                ('logreg',
                 LogisticRegression(C=1, max_iter=250, random_state=0,
                                    solver='liblinear', warm_start=True))])
Accuracy score on training set:
0.86865
Accuracy score on test set:
0.888


In [None]:
clf_tfidf_logreg = Pipeline([
  ('tfidf', TfidfVectorizer(sublinear_tf=True)),
  ('logreg', LogisticRegression(random_state=0, max_iter=250))])

params = {'tfidf__ngram_range': [(1,1), (1, 2), (1,3), (2,3)],
          'tfidf__max_features': [1000, 2000, 5000, 10000],
          'tfidf__use_idf': (True, False),
          'tfidf__min_df': [0.0001, 0.001, 0.01],
          'logreg__C': [0.1, 1, 10, 100]}

model_tfidf_logreg = RandomizedSearchCV(clf_tfidf_logreg, param_distributions=params, cv=5, scoring="accuracy", 
                                  n_jobs=-1, verbose = 3)

model_tfidf_logreg.fit(X_train, y_train)
print('Best estimator and parameters:')
print(model_tfidf_logreg.best_estimator_)
#print(model_tfidf_logreg.best_params_)
print('Accuracy score on training set:')
print(model_tfidf_logreg.best_score_)
print('Accuracy score on test set:')
print(accuracy_score(model_tfidf_logreg.predict(X_test),y_test))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best estimator and parameters:
Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=2000, min_df=0.0001,
                                 ngram_range=(1, 3), sublinear_tf=True,
                                 use_idf=False)),
                ('logreg',
                 LogisticRegression(C=100, max_iter=250, random_state=0))])
Accuracy score on training set:
0.8543000000000001
Accuracy score on test set:
0.86725


In [None]:
clf_tfidf_logreg = Pipeline([
  ('tfidf', TfidfVectorizer(sublinear_tf=True)),
  ('logreg', LogisticRegression(random_state=0, max_iter=250))])

params = {'tfidf__ngram_range': [(1,1), (1, 2), (1,3)],
          'tfidf__max_features': [1000, 2000, 5000, 10000],
          'tfidf__use_idf': (True, False),
          'tfidf__min_df': [0.0001, 0.001, 0.01],
          'logreg__C': [0.1, 1, 10, 100]}

model_tfidf_logreg = RandomizedSearchCV(clf_tfidf_logreg, param_distributions=params, cv=5, scoring="accuracy", 
                                  n_jobs=-1, verbose = 3)

model_tfidf_logreg.fit(X_train, y_train)
print('Best estimator and parameters:')
print(model_tfidf_logreg.best_estimator_)
#print(model_tfidf_logreg.best_params_)
print('Accuracy score on training set:')
print(model_tfidf_logreg.best_score_)
print('Accuracy score on test set:')
print(accuracy_score(model_tfidf_logreg.predict(X_test),y_test))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best estimator and parameters:
Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=5000, min_df=0.001,
                                 ngram_range=(1, 3), sublinear_tf=True)),
                ('logreg',
                 LogisticRegression(C=10, max_iter=250, random_state=0))])
Accuracy score on training set:
0.8654
Accuracy score on test set:
0.87625


In [None]:
clf_tfidf_logreg = Pipeline([
  ('tfidf', TfidfVectorizer(sublinear_tf=True)),
  ('logreg', LogisticRegression(random_state=0, max_iter=250))])

params = {'tfidf__ngram_range': [(1,1), (1, 2), (1,3)],
          'tfidf__max_features': [1000, 2000, 5000, 10000],
          'tfidf__use_idf': (True, False),
          'tfidf__min_df': [0.0001, 0.001, 0.01],
          'logreg__C': [0.1, 1, 10, 100]}

model_tfidf_logreg = RandomizedSearchCV(clf_tfidf_logreg, param_distributions=params, cv=5, scoring="accuracy", 
                                  n_jobs=-1, verbose = 3, n_iter=20)

model_tfidf_logreg.fit(X_train, y_train)
print('Best estimator and parameters:')
print(model_tfidf_logreg.best_estimator_)
#print(model_tfidf_logreg.best_params_)
print('Accuracy score on training set:')
print(model_tfidf_logreg.best_score_)
print('Accuracy score on test set:')
print(accuracy_score(model_tfidf_logreg.predict(X_test),y_test))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best estimator and parameters:
Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=10000, min_df=0.0001,
                                 ngram_range=(1, 3), sublinear_tf=True)),
                ('logreg',
                 LogisticRegression(C=10, max_iter=250, random_state=0))])
Accuracy score on training set:
0.8695999999999999
Accuracy score on test set:
0.87825


**Grid search for Tf-idf and SVM**

In [None]:
#clf_tfidf_svm = Pipeline([('tfidf', TfidfVectorizer()),
#                          ('SVM', SVC())])

#params = {'tfidf__ngram_range': [(1, 1)],#,(1,2)],
          #'tfidf__use_idf': (True, False),
          #'tfidf__min_df': [0.001, 0.01],
          #'SVM__C':[0.1,10], 
          #'SVM__gamma':[0.01,0.1]
#          }

#model_tfidf_svm = GridSearchCV(clf_tfidf_svm, param_grid=params, cv=5, scoring="accuracy", 
#                                  n_jobs=-1, verbose = 3)

#model_tfidf_svm.fit(X_train, y_train)
#print('Best estimator and parameters:')
#print(model_tfidf_svm.best_estimator_)
#print('Accuracy score on training set:')
#print(model_tfidf_svm.best_score_)
#print('Accuracy score on test set:')
#print(accuracy_score(model_tfidf_svm.predict(X_test),y_test))

In [None]:
#clf_tfidf_nb = Pipeline([('tfidf', TfidfVectorizer()),
#                          ('NB', GaussianNB())])

#params = {#'tfidf__ngram_range': [(1, 1), (1, 2)],
          #'tfidf__use_idf': (True, False),
#          'tfidf__min_df': [0.001]#, 0.01]}
#}

#model_tfidf_nb = GridSearchCV(clf_tfidf_nb, param_grid=params, cv=5, scoring="accuracy", 
                                  n_jobs=-1, verbose = 3)

#model_tfidf_nb.fit(X_train, y_train)
#print('Best estimator and parameters:')
#print(model_tfidf_nb.best_estimator_)
#print('Accuracy score on training set:')
#print(model_tfidf_nb.best_score_)
#print('Accuracy score on test set:')
#print(accuracy_score(model_tfidf_nb.predict(X_test),y_test))

IndentationError: ignored