In [1]:
import pandas as pd 
import numpy as np
import sklearn
from sklearn.utils import class_weight
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import decomposition, ensemble
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, recall_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
import time

In [2]:
X_train = pd.read_csv('./data/X_train.csv')
X_test = pd.read_csv('./data/X_test.csv')
y_train = pd.read_csv('./data/y_train.csv')
y_test = pd.read_csv('./data/y_test.csv')
X_val = pd.read_csv('./data/X_val.csv')
y_val = pd.read_csv('./data/y_val.csv')

In [3]:
X_train = X_train.iloc[:,0]
X_test = X_test.iloc[:,0]
y_train = y_train.iloc[:,0]
y_test = y_test.iloc[:,0]
X_val = X_val.iloc[:,0]
y_val = y_val.iloc[:,0]
X_train

0        worse tea fresh green tea indeed green tea tim...
1        ive tasted best real things wonderful dreadful...
2        love taste good ginger snap cookie saw offered...
3        absolutely love coachs oatmeal hated oatmeal t...
4        normally dont go instant coffees delicious ins...
                               ...                        
96058    ive loved graham crackers since kid many brand...
96059    decided give try since kcup coffee prices goin...
96060    excellent coffee either brewed hot ice drink l...
96061    reading useful negative review times comment t...
96062    excited high protein pretzels opened tasted ta...
Name: text, Length: 96063, dtype: object

In [4]:
df = pd.DataFrame(data=[X_train, y_train], index=["text", "label"]).T
df = df.append(pd.DataFrame(data=[X_val, y_val], index=["text", "label"]).T)
df = df.append(pd.DataFrame(data=[X_test, y_test], index=["text", "label"]).T)

### One-Hot encoding (CountVectorizing)

In [5]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(X_train)
xvalid_count =  count_vect.transform(X_val)
xtest_count =  count_vect.transform(X_test)

### TF-IDF

In [6]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=10000)
tfidf_vect.fit(df['text'])
xtrain_tfidf =  tfidf_vect.transform(X_train)
xvalid_tfidf =  tfidf_vect.transform(X_val)
xtest_tfidf =  tfidf_vect.transform(X_test)


# Model with Countvect

from sklearn import preprocessing, linear_model, naive_bayes, metrics, svm
model1 = naive_bayes.MultinomialNB()
model1.fit(xtrain_count,y_train)


In [10]:
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression(verbose=True)
model1.fit(xtrain_count,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.1s finished


LogisticRegression(verbose=True)

In [12]:
from sklearn.model_selection import cross_val_score
clf = LogisticRegression()
scores = cross_val_score(clf, xvalid_count, y_val, cv=5)
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.9342701 , 0.94097439, 0.93488445, 0.93425984, 0.92816989])

In [13]:
 print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.93 accuracy with a standard deviation of 0.00


In [14]:
predictions = model1.predict(xtest_count)

In [15]:
print(confusion_matrix(y_test,predictions))

[[ 4421   870]
 [  548 26183]]


In [16]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.89      0.84      0.86      5291
           1       0.97      0.98      0.97     26731

    accuracy                           0.96     32022
   macro avg       0.93      0.91      0.92     32022
weighted avg       0.95      0.96      0.96     32022



# Model with Tf-idf

In [17]:
model2= LogisticRegression()
model2.fit(xtrain_tfidf,y_train)

LogisticRegression()

In [18]:
clf = LogisticRegression()
scores = cross_val_score(clf, xvalid_tfidf, y_val, cv=5)
scores

array([0.91209992, 0.9156777 , 0.91427233, 0.91552155, 0.90490319])

In [19]:
 print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.91 accuracy with a standard deviation of 0.00


In [20]:
predictions = model2.predict(xtest_tfidf)

In [21]:
print(confusion_matrix(y_test,predictions))

[[ 3693  1598]
 [  450 26281]]


In [22]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.89      0.70      0.78      5291
           1       0.94      0.98      0.96     26731

    accuracy                           0.94     32022
   macro avg       0.92      0.84      0.87     32022
weighted avg       0.93      0.94      0.93     32022

