In [249]:
import os
import numpy as np
import pandas as pd
import nltk
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from pprint import pprint

In [150]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Text pre-processing & train/test set construction

In [489]:
# Read the data

In [514]:
with open("op_spam_v1.4/negative_polarity/deceptive_train.csv") as f:
    lines = [s.replace("\n", "") for s in f.readlines()]

deceptive_train = pd.DataFrame(lines, columns=["Text"])
deceptive_train["Label"] = 0

In [515]:
with open("op_spam_v1.4/negative_polarity/truthful_train.csv") as f:
    lines = [s.replace("\n", "") for s in f.readlines()]

truthful_train = pd.DataFrame(lines, columns=["Text"])
truthful_train["Label"] = 1

In [516]:
with open("op_spam_v1.4/negative_polarity/deceptive_test.csv") as f:
    lines = [s.replace("\n", "") for s in f.readlines()]

deceptive_test = pd.DataFrame(lines, columns=["Text"])
deceptive_test["Label"] = 0

In [517]:
with open("op_spam_v1.4/negative_polarity/truthful_test.csv") as f:
    lines = [s.replace("\n", "") for s in f.readlines()]

truthful_test = pd.DataFrame(lines, columns=["Text"])
truthful_test["Label"] = 1

In [518]:
# Concat deceptive and truthful dataframes

In [519]:
df = pd.concat([deceptive_train, truthful_train, deceptive_test, truthful_test], axis=0).reset_index(drop=True)
df
#640-800 = test set

Unnamed: 0,Text,Label
0,Hotel is located 1/2 mile from the train stati...,0
1,I made my reservation at the Hilton Chicago be...,0
2,"When most people think Hilton, they think luxu...",0
3,My husband and I recently stayed stayed at the...,0
4,My wife and I booked a room at the Hilton Chic...,0
...,...,...
795,The Palmer House has a beautiful lobby with a ...,1
796,great expectations from the hotel of THE FUGIT...,1
797,"For a Hilton hotel I was very unimpressed, the...",1
798,Beautiful historic hotel -- and since I'm in h...,1


#### Further text pre-processing 

- Tokenization
- Lower-casing
- Punctuation & Special character removal
- Spelling correction
- Stop-word removal
- (Stemming (Porter)) *Skip for now

In [520]:
from nltk.corpus import stopwords # stopwords.words('english')
import string
import re
from textblob import TextBlob

In [521]:
df["Text"] = df["Text"].apply(lambda x: [word for word in nltk.word_tokenize(x)]) # Tokenize
df["Text"] = df["Text"].apply(lambda x: [word.lower() for word in x]) # Apply lower-casing
df["Text"] = df["Text"].apply(lambda x: [word for word in x if word not in string.punctuation]) # Punctuation removal
df["Text"] = df["Text"].apply(lambda x: [word for word in x if word not in ' '.join(stopwords.words('english'))]) # Stop word removal
df["Text"] = df["Text"].apply(lambda x: [re.sub("(?:\W|\d)+", "", word) for word in x]) # Removing special chars and numbers
df["Text"] = df["Text"].apply(lambda x: [word for word in x if word != ""]) # Remove empty strings
df["Text"] = df["Text"].apply(lambda x: [str(TextBlob(word).correct()) for word in x]) # Spelling correction

In [None]:
# Convert back to str to construct dtm with sklearn
df["Text"] = df["Text"].apply(lambda x: ' '.join(x))

In [566]:
print(df["Text"][0])

hotel located mile train station quite like traveling luggage ardor kiss seem cash guests arrive private car charging exorbitant parkingvalet fees rooms feature either double king sized beds queen beds want little extra leg room bed price jump double kingsized stiff rooms kind view pay healthy surcharge


In [599]:
from sklearn.feature_extraction.text import CountVectorizer

#Set ngram=1,2, 1,2 and 2,2 (only bigrams) and min_df (float 0-1) when needed.
vectorizer = CountVectorizer(ngram_range=(1,1), min_df=1, lowercase=False)
X = vectorizer.fit_transform(df["Text"])
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [600]:
X.shape

(800, 6504)

In [585]:
vectorizer.get_feature_names_out()[12000]

'day variation'

#### Seperate train and test set

In [None]:
hierna opsplitsen train en test (640-800)
en shuffel beide sets met sample(n=len(df), replace=False)

## Training and testing Unigram models

#### Logistic regression

## Training and testing of Bigram models

In [273]:
df["Text"] = df["Text"].apply(lambda x: [re.sub("(?:\W|\d)+", "", word) for word in x]) # Removing special chars

Unnamed: 0,Text,Label
0,"[Hotel, is, located, , mile, from, the, train,...",0
1,"[I, made, my, reservation, at, the, Hilton, Ch...",0
2,"[When, most, people, think, Hilton, , they, th...",0
3,"[My, husband, and, I, recently, stayed, stayed...",0
4,"[My, wife, and, I, booked, a, room, at, the, H...",0
...,...,...
795,"[The, Palmer, House, has, a, beautiful, lobby,...",1
796,"[great, expectations, from, the, hotel, of, TH...",1
797,"[For, a, Hilton, hotel, I, was, very, unimpres...",1
798,"[Beautiful, historic, hotel, , and, since, I, ...",1
