In [1]:
import pandas as pd 
import numpy as np
import sklearn
from sklearn.utils import class_weight
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import decomposition, ensemble
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, recall_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
import time

In [2]:
X_train = pd.read_csv('./data/X_train.csv')
X_test = pd.read_csv('./data/X_test.csv')
y_train = pd.read_csv('./data/y_train.csv')
y_test = pd.read_csv('./data/y_test.csv')
X_val = pd.read_csv('./data/X_val.csv')
y_val = pd.read_csv('./data/y_val.csv')

In [3]:
X_train = X_train.iloc[:,0]
X_test = X_test.iloc[:,0]
y_train = y_train.iloc[:,0]
y_test = y_test.iloc[:,0]
X_val = X_val.iloc[:,0]
y_val = y_val.iloc[:,0]
X_train

0        worse tea fresh green tea indeed green tea tim...
1        ive tasted best real things wonderful dreadful...
2        love taste good ginger snap cookie saw offered...
3        absolutely love coachs oatmeal hated oatmeal t...
4        normally dont go instant coffees delicious ins...
                               ...                        
96058    ive loved graham crackers since kid many brand...
96059    decided give try since kcup coffee prices goin...
96060    excellent coffee either brewed hot ice drink l...
96061    reading useful negative review times comment t...
96062    excited high protein pretzels opened tasted ta...
Name: text, Length: 96063, dtype: object

In [4]:
df = pd.DataFrame(data=[X_train, y_train], index=["text", "label"]).T
df = df.append(pd.DataFrame(data=[X_val, y_val], index=["text", "label"]).T)
df = df.append(pd.DataFrame(data=[X_test, y_test], index=["text", "label"]).T)

### One-Hot encoding (CountVectorizing)

In [5]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(X_train)
xvalid_count =  count_vect.transform(X_val)
xtest_count =  count_vect.transform(X_test)

### TF-IDF

In [6]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=10000)
tfidf_vect.fit(df['text'])
xtrain_tfidf =  tfidf_vect.transform(X_train)
xvalid_tfidf =  tfidf_vect.transform(X_val)
xtest_tfidf =  tfidf_vect.transform(X_test)


# Model with Countvect

from sklearn import preprocessing, linear_model, naive_bayes, metrics, svm
model1 = naive_bayes.MultinomialNB()
model1.fit(xtrain_count,y_train)


from sklearn.neighbors import KNeighborsClassifier
model1 = KNeighborsClassifier()
model1.fit(xtrain_count,y_train)

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
model1 = GradientBoostingClassifier(verbose=1)
model1.fit(xtrain_count,y_train)

      Iter       Train Loss   Remaining Time 
         1           0.8823            8.35m
         2           0.8726            8.09m
         3           0.8623            7.96m
         4           0.8548            7.88m
         5           0.8470            7.82m
         6           0.8401            7.74m
         7           0.8341            7.67m
         8           0.8273            7.60m
         9           0.8223            7.52m
        10           0.8165            7.44m
        20           0.7742            6.61m
        30           0.7430            5.75m
        40           0.7176            4.93m
        50           0.6954            4.11m
        60           0.6776            3.30m
        70           0.6614            2.48m
        80           0.6478            1.65m
        90           0.6344           49.55s
       100           0.6221            0.00s


GradientBoostingClassifier(verbose=1)

In [8]:
from sklearn.model_selection import cross_val_score
clf = GradientBoostingClassifier()
scores = cross_val_score(clf, xvalid_count, y_val, cv=5)
scores

array([0.87228728, 0.86695815, 0.86867583, 0.86867583, 0.86805122])

In [9]:
 print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.87 accuracy with a standard deviation of 0.00


In [10]:
predictions = model1.predict(xtest_count)

In [11]:
print(confusion_matrix(y_test,predictions))

[[ 1294  3997]
 [  136 26595]]


In [12]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.90      0.24      0.39      5291
           1       0.87      0.99      0.93     26731

    accuracy                           0.87     32022
   macro avg       0.89      0.62      0.66     32022
weighted avg       0.88      0.87      0.84     32022



# Model with Tf-idf

In [13]:
model2= GradientBoostingClassifier()
model2.fit(xtrain_tfidf,y_train)

GradientBoostingClassifier()

In [14]:
clf = GradientBoostingClassifier()
scores = cross_val_score(clf, xvalid_tfidf, y_val, cv=5)
scores

array([0.8724434 , 0.86867583, 0.87054966, 0.86883198, 0.866802  ])

In [15]:
 print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.87 accuracy with a standard deviation of 0.00


In [16]:
predictions = model2.predict(xtest_tfidf)

In [17]:
print(confusion_matrix(y_test,predictions))

[[ 1299  3992]
 [  139 26592]]


In [18]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.90      0.25      0.39      5291
           1       0.87      0.99      0.93     26731

    accuracy                           0.87     32022
   macro avg       0.89      0.62      0.66     32022
weighted avg       0.88      0.87      0.84     32022

