## Text pre-processing & train/test set construction

In [89]:
import os
import pickle
import numpy as np
import pandas as pd
import nltk
from pprint import pprint

In [90]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

In [91]:
# Read the data

In [92]:
with open("op_spam_v1.4/negative_polarity/deceptive_train.csv") as f:
    lines = [s.replace("\n", "") for s in f.readlines()]

deceptive_train = pd.DataFrame(lines, columns=["Text"])
deceptive_train["Label"] = 0

In [93]:
with open("op_spam_v1.4/negative_polarity/truthful_train.csv") as f:
    lines = [s.replace("\n", "") for s in f.readlines()]

truthful_train = pd.DataFrame(lines, columns=["Text"])
truthful_train["Label"] = 1

In [94]:
with open("op_spam_v1.4/negative_polarity/deceptive_test.csv") as f:
    lines = [s.replace("\n", "") for s in f.readlines()]

deceptive_test = pd.DataFrame(lines, columns=["Text"])
deceptive_test["Label"] = 0

In [95]:
with open("op_spam_v1.4/negative_polarity/truthful_test.csv") as f:
    lines = [s.replace("\n", "") for s in f.readlines()]

truthful_test = pd.DataFrame(lines, columns=["Text"])
truthful_test["Label"] = 1

In [96]:
# Concat deceptive and truthful dataframes

In [97]:
df = pd.concat([deceptive_train, truthful_train, deceptive_test, truthful_test], axis=0).reset_index(drop=True)
df
#640-800 = test set

Unnamed: 0,Text,Label
0,Hotel is located 1/2 mile from the train stati...,0
1,I made my reservation at the Hilton Chicago be...,0
2,"When most people think Hilton, they think luxu...",0
3,My husband and I recently stayed stayed at the...,0
4,My wife and I booked a room at the Hilton Chic...,0
...,...,...
795,The Palmer House has a beautiful lobby with a ...,1
796,great expectations from the hotel of THE FUGIT...,1
797,"For a Hilton hotel I was very unimpressed, the...",1
798,Beautiful historic hotel -- and since I'm in h...,1


#### Further text pre-processing 

- Tokenization
- Lower-casing
- Punctuation & Special character removal
- Spelling correction
- Stop-word removal
- (Stemming (Porter)) *Skip for now

In [98]:
from nltk.corpus import stopwords # stopwords.words('english')
import string
import re
from textblob import TextBlob
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [99]:
# df["Text"] = df["Text"].apply(lambda x: [word for word in nltk.word_tokenize(x)]) # Tokenize
# df["Text"] = df["Text"].apply(lambda x: [word.lower() for word in x]) # Apply lower-casing
# df["Text"] = df["Text"].apply(lambda x: [word for word in x if word not in string.punctuation]) # Punctuation removal
# df["Text"] = df["Text"].apply(lambda x: [word for word in x if word not in ' '.join(stopwords.words('english'))]) # Stop word removal
# df["Text"] = df["Text"].apply(lambda x: [re.sub("(?:\W|\d)+", "", word) for word in x]) # Removing special chars and numbers
# df["Text"] = df["Text"].apply(lambda x: [word for word in x if word != ""]) # Remove empty strings
# df["Text"] = df["Text"].apply(lambda x: [str(TextBlob(word).correct()) for word in x]) # Spelling correction

In [100]:
# df.to_pickle("./df1.pkl")

In [101]:
# Unpickle the df. Pickled the df in case the kernel dies when fitting the models, because the spelling correction above takes a while to execute
df = pd.read_pickle("./df1.pkl")
df.head()

Unnamed: 0,Text,Label
0,"[hotel, located, mile, train, station, quite, ...",0
1,"[made, reservation, hilton, chicago, believing...",0
2,"[people, think, hilton, think, luxury, know, w...",0
3,"[husband, recently, stayed, stayed, hilton, ch...",0
4,"[wife, booked, room, hilton, chicago, three, w...",0


In [102]:
# Convert back to str to construct dtm with sklearn CountVectorizer
df["Text"] = df["Text"].apply(lambda x: ' '.join(x))

In [103]:
df.head()

Unnamed: 0,Text,Label
0,hotel located mile train station quite like tr...,0
1,made reservation hilton chicago believing goin...,0
2,people think hilton think luxury know wish hal...,0
3,husband recently stayed stayed hilton chicago ...,0
4,wife booked room hilton chicago three weekend ...,0


In [104]:
print(df["Text"][6])

high hopes hilton chicago sad say disappointed outrageous expensive two people one night expect pay park car offer free wife instead pay get internet room wait pm check even though flight morning rent car airport hotel offer transportation stress hilton chicago hotel bar either doubt stay


In [105]:
from sklearn.feature_extraction.text import CountVectorizer

#Set ngram=1,2, 1,2 and 2,2 (only bigrams) and min_df (float 0-1) for Naive Bayes when needed (use diff thresholds 0.005 increment starting at 1% -> 10%).
vectorizer = CountVectorizer(ngram_range=(1,1), min_df=1, lowercase=False)
X = vectorizer.fit_transform(df["Text"])
# X.toarray()

In [106]:
X.shape

(800, 6504)

In [107]:
vectorizer.get_feature_names()[100]

'advances'

#### Seperate train and test set

In [108]:
X_train, y_train = X[0:640], df["Label"][0:640].to_numpy()
X_test, y_test = X[640:], df["Label"][640:].to_numpy()

## Models using unigram only

In [109]:
from sklearn.model_selection import GridSearchCV

In [110]:
from sklearn.metrics import confusion_matrix

def performance_metrics(y_true, y_pred):
    cf_matrix = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cf_matrix.ravel()
    
    accuracy = (tn+tp)/(tn+tp+fp+fn)
    recall =  (tp)/(tp+fn)
    precision = (tp)/(tp+fp)
    f1 = 2 * ((precision*recall)/(precision+recall))
    
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'F1: {f1:.2f}')
    
    return (accuracy, recall, precision, f1)

#### Naive Bayes

In [61]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

In [62]:
def cv_NB():
    
    # Find best threshold
    
    thresholds = np.arange(0.01, 10, 0.001)
    
    cv_scores = []
    for thresh in thresholds:
        vectorizer = CountVectorizer(ngram_range=(1,1), min_df=thresh, lowercase=False)
        X = vectorizer.fit_transform(df["Text"]) 
        X_train = X[0:640] # Only need to change X_train/test, y_train/test remains same (only removing/adding columns, not rows)
        X_test = X[640:]
        score = cross_val_score(MultinomialNB(), X_train, y_train)
        cv_scores.append((score.mean(), X_train, X_test))
    
    return max(cv_scores, key=lambda x: x[0]) # Return X_train/X_test for which CV score was best (best doc frequency threshold)
        
best_thresh = cv_NB()  # (score.mean(), X_train, X_test)

ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.

In [None]:
# Retrain on whole training set with reduced feature count (resulting from applying best threshold)
X_train_nb, X_test_nb = best_thresh[1], best_thresh[2]
nb_clf = MultinomialNB()
nb_clf.fit(X_train_nb, y_train) 

In [None]:
y_pred = nb_clf.predict(X_test_nb)
performance_metrics(y_test, y_pred)

In [None]:
thresholds = np.arange(0.01, 10, 0.001)
thresholds

#### Logistic regression

In [111]:
from sklearn.linear_model import LogisticRegression

In [121]:
alphas = np.arange(0.01, 10 ,0.01)
alphas = np.append(alphas, [10, 20, 30, 40, 50, 100])
params = {'C': alphas}
lr = GridSearchCV(LogisticRegression(penalty="l1", solver="liblinear"),  param_grid=params)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
performance_metrics(y_test, y_pred)



Accuracy: 0.86
Recall: 0.86
Precision: 0.85
F1: 0.86


(0.85625, 0.8625, 0.8518518518518519, 0.8571428571428572)

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Hyperparameters to search
alphas = np.arange(0, 10, 0.01)
alphas = np.append(alphas, [10, 20, 30, 40, 50, 100])
alphas = [round(a, 2) for a in alphas]
nmin = list(range(2, 26))
minleaf = list(range(1, 11))

params = {'min_samples_split': nmin, 'min_samples_leaf': minleaf, 'ccp_alpha': alphas}

dt_clf = GridSearchCV(DecisionTreeClassifier(), param_grid=params, verbose=1, n_jobs=-1)

In [None]:
dt_clf.fit(X_train, y_train)

In [None]:
_file = open('dt_uni.pkl', "wb")
pickle.dump(dt_clf, _file)
_file.close()

In [None]:
dt_clf.best_params_

In [None]:
y_pred = dt_clf.predict(X_test)
performance_metrics(y_test, y_pred)

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
import itertools
from IPython.display import clear_output

In [None]:
import multiprocessing as mp

nmin = list(range(2, 21))
minleaf = list(range(1, 11))
ntrees = [50, 100, 200, 300, 500]
nfeats = list(range(10, 201, 10))

cartesian_product = list(itertools.product(nmin, minleaf, ntrees, nfeats))

params = [p for p in cartesian_product if p[1] * 2 <= p[0]]

def train_rf(params):
    
    # Estimate best RF hyperparameters using OOB performance instead of CV
    best_rf_clf = None
    i = 0
    for p in params:
        rf_clf = RandomForestClassifier(n_estimators=p[2], min_samples_split=p[0],
                                    min_samples_leaf=p[1], max_features=p[3],
                                    oob_score=True, n_jobs=-1)
        rf_clf.fit(X_train, y_train)
        
        if best_rf_clf is None:
            best_rf_clf = rf_clf
            
        if rf_clf.oob_score_ > best_rf_clf.oob_score_:
            best_rf_clf = rf_clf
            
        clear_output(wait=True)
    
        print(f'Iteration: {i}, OOB: {round(rf_clf.oob_score_, 2)}') 
    
    return (rf_clf, rf_clf.oob_score_)

# Start multiprocessing pool
best_rf_clf = train_rf(params)
# p = mp.Pool(processes=mp.cpu_count())
# rf_clfs = p.map(train_rf, params)
# p.close()
# p.join()

In [None]:
len(params)

In [None]:
best_rf_clf = max(rf_clfs, key=lambda x: x[1])

In [None]:
_file = open('rf_uni.pkl', "wb")
pickle.dump(best_rf_clf, _file)
_file.close()

In [None]:
y_pred = best_rf_clf.predict(X_test)
performance_metrics(y_test, y_pred)

## Models using unigrams **and** bigrams

## Models using bigrams only