In [2]:
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [3]:
import pandas as pd
import numpy as np

import string

import time


In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
#from sklearn.svm import LinearSVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

In [5]:
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from xgboost import XGBClassifier

In [6]:
# Preparing the main corpus
main_df= pd.read_csv('data/reviews.csv')

# Remove nan reviews
df= main_df.copy()
df.dropna(subset= ['Review Text'], inplace= True)

print('main df and clean df shape:',main_df.shape , df.shape)

# X , y

X = df['Review Text']
y= df["Recommended IND"].values.reshape(-1,1)
print("X.shape:", X.shape, '\t y_shape:', y.shape)


X_train, X_test, y_train, y_test = train_test_split(X , y , stratify =y , test_size = 0.3)
print('train_test_split:',X_train.shape , X_test.shape,'\t', y_train.shape, y_test.shape)

main df and clean df shape: (23486, 11) (22641, 11)
X.shape: (22641,) 	 y_shape: (22641, 1)
train_test_split: (15848,) (6793,) 	 (15848, 1) (6793, 1)


In [7]:
# create countvectorizer:
start_time = time.time()

cv = CountVectorizer(binary = True, stop_words= 'english',  min_df = 5, max_df = 0.95, ngram_range=(1,2))
cv.fit_transform(X_train)
train_feature_set = cv.transform(X_train)
test_feature_set = cv.transform(X_test)

print("Time takes to convert text input into feature vector: ", round((time.time() - start_time)/60, 2), " mins")

Time takes to convert text input into feature vector:  0.04  mins


In [88]:
train_feature_set.shape

(15848, 15220)

In [96]:
## Logistic regression

lr = LogisticRegression()
lr.fit(train_feature_set,y_train)
y_pred = lr.predict(test_feature_set)

cm = confusion_matrix(y_test,y_pred)
print('Confusion Matrix:\n', cm)
print("Accuracy: ",round(accuracy_score(y_test,y_pred),3))
print("F1: ",round(f1_score(y_test, y_pred),3))
print("Recall: ",round(recall_score(y_test,y_pred),3))
print("percision: ",round(precision_score(y_test, y_pred),3))
print(classification_report(y_test, y_pred))

Confusion Matrix:
 [[ 730  500]
 [ 278 5285]]
Accuracy:  0.885
F1:  0.931
Recall:  0.95
percision:  0.914
              precision    recall  f1-score   support

           0       0.72      0.59      0.65      1230
           1       0.91      0.95      0.93      5563

    accuracy                           0.89      6793
   macro avg       0.82      0.77      0.79      6793
weighted avg       0.88      0.89      0.88      6793



In [8]:
def model_predictions(model_name, X_train,y_train,X_test, y_test):
    
    model = model_name
    model.fit(X_train, y_train)
    
    print(model_name)
    acc = cross_val_score(model, X_train, y_train, scoring = "accuracy", cv = 5)
    predictions = cross_val_predict(model, X_test, y_test, cv = 5)
    print("Accuracy:", round(acc.mean(),3))
    cm = confusion_matrix(y_test, predictions)
    print("Confusion Matrix:  \n", cm)
    print("Classification Report \n", classification_report( y_test, predictions))

In [102]:
# logistic regresion
lr = LogisticRegression()
model_predictions(lr,train_feature_set,y_train, test_feature_set, y_test)

LogisticRegression()
Accuracy: 0.886
Confusion Matrix:  
 [[ 653  577]
 [ 284 5279]]
Classification Report 
               precision    recall  f1-score   support

           0       0.70      0.53      0.60      1230
           1       0.90      0.95      0.92      5563

    accuracy                           0.87      6793
   macro avg       0.80      0.74      0.76      6793
weighted avg       0.86      0.87      0.87      6793



In [107]:
# random forest:
rf = RandomForestClassifier()
model_predictions(rf,train_feature_set,y_train, test_feature_set, y_test)

RandomForestClassifier()
Accuracy: 0.861
Confusion Matrix:  
 [[ 275  955]
 [  66 5497]]
Classification Report 
               precision    recall  f1-score   support

           0       0.81      0.22      0.35      1230
           1       0.85      0.99      0.92      5563

    accuracy                           0.85      6793
   macro avg       0.83      0.61      0.63      6793
weighted avg       0.84      0.85      0.81      6793



In [108]:
# Gaussian Naieve Bayes
gnb = GaussianNB()
model_predictions(gnb,train_feature_set.toarray(),y_train, test_feature_set.toarray(), y_test)

GaussianNB()
Accuracy: 0.801
Confusion Matrix:  
 [[ 265  965]
 [ 312 5251]]
Classification Report 
               precision    recall  f1-score   support

           0       0.46      0.22      0.29      1230
           1       0.84      0.94      0.89      5563

    accuracy                           0.81      6793
   macro avg       0.65      0.58      0.59      6793
weighted avg       0.77      0.81      0.78      6793



In [9]:
## XG Boost
xgb = XGBClassifier()
model_predictions(xgb,train_feature_set.toarray(),y_train, test_feature_set.toarray(), y_test)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
Accuracy: 0.877
Confusion Matrix:  
 [[ 603  627]
 [ 196 5367]]
Classification Report 
               precision    recall  f1-score   support

           0       0.75      0.49      0.59      1230
           1       0.90      0.96      0.93      5563

    accuracy                           0.88      6793
   macro avg       0.83      0.73      0.76      6793
weighted avg       0.87      0.88      0.87      6793



In [None]:
## same with TF_IDF

In [None]:
punct = set(string.punctuation)

ps = PorterStemmer()

remove_sw_list=['not', 'no']
sw = [i for i in stopwords.words('english') if i not in remove_sw_list]

def text_prep_stop_stem(text):
    #clean text
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    
    #remove non-letters and lower case
    text = re.sub('[^a-z\s]', '', text.lower())
    
    #remove punctuation        
    punc_removed = [char for char in text if char not in punct]
    punc_removed = ''.join(punc_removed)
    
    #stem and remove stop words
    return [ps.stem(word) for word in punc_removed.split() if not word in sw]
    #return [word for word in punc_removed.split() if not word in sw]

In [None]:
# create tfidf vectorizer:
start_time = time.time()

tv = TfidfVectorizerianalyzer=orizer(analyzer= text_prep_stop_stem,  min_df = 5, max_df = 0.95, ngram_range=(1,2))
cv.fit_transform(X_train)
train_feature_set = cv.transform(X_train)
test_feature_set = cv.transform(X_test)

print("Time takes to convert text input into feature vector: ", round((time.time() - start_time)/60, 2), " mins")