# 1. Import necessary libraries

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
import scipy
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from textblob import Word
from sklearn.utils import resample
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score
import pickle
from sklearn import preprocessing
from scipy.sparse import csr_matrix,hstack
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# 2. Read Data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
#Read files
train = pd.read_csv("/content/drive/MyDrive/CS4248_Project/raw_data/fulltrain.csv",header=None)
test = pd.read_csv("/content/drive/MyDrive/CS4248_Project/raw_data/balancedtest.csv",header=None)

In [6]:
train.columns = ['Verdict','Text']
test.columns = ['Verdict','Text']

In [7]:
train1 = train.copy()

# 3. Preprocessing for Text

In [8]:
def preprocess_text(s, replace=None, remove_punctuation = None, lower=None,stopword=None,frequency_words=None,scared_word=None, noisy=None, stemming=None,lemmatization=None):
    #Throw an error is both stemming and lemmatization are not None

    s1 = s.copy()
    if stemming is not None and lemmatization is not None:
        raise ValueError('Stemming and Lemmatization cannot both be not None!')


    if replace is not None:
        #Replace URLs with 'webaddress'
        s1['Text'] = s1['Text'].str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress',regex=True)
        #Replace email address with 'email'
        s1['Text'] = s1['Text'].str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress',regex=True)
        #Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
        s1['Text'] = s1['Text'].str.replace(r'£|\$', 'moneysymb',regex=True)

        #Replace percentage symbols with 'percentage'
        s1['Text'] = s1['Text'].str.replace(r'%', 'percentage',regex=True)

        #Replace 10 digit phone number
        s1['Text'] = s1['Text'].str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr',regex=True)
        # Replace numbers with 'numbr'
        s1['Text'] = s1['Text'].str.replace(r'\d+(\.\d+)?', 'numbr',regex=True)
    #Remove punctuation
    if remove_punctuation is not None:
        s1['Text'] = s1['Text'].apply(lambda x: re.sub(r'[^\w\s\d]', '', x))

    #Transform to lower letter
    if lower is not None:
        s1['Text'] = s1['Text'].apply(lambda x: x.lower())

    #Remove the stopwords
    if stopword is not None:
        stop=stopwords.words('english')
        s1['Text']=s1['Text'].apply(lambda sen:" ".join(x for x in sen.split() if x not in stop))

    #Remove the frequency words
    if frequency_words is not None:
        freq=pd.Series(' '.join(s).split()).value_counts()[:10]
        freq=list(freq.index)
        s1['Text']=s1['Text'].apply(lambda sen:' '.join(x for x in sen.split() if x not in freq))

    # Remove the scarce word
    if scared_word is not None:
        scared = pd.Series(' '.join(s).split()).value_counts()[-10:]
        scared = list(scared.index)
        s1['Text'] = s1['Text'].apply(lambda sen: " ".join(x for x in sen.split() if x not in scared))

    #Noisy Removal
    if noisy is not None:
        #remove non-ascii
        s1['Text']= s1['Text'].apply(lambda x: re.sub("(\\W)"," ",x))
        #remove whitespace
        s1['Text']=s1['Text'].apply(lambda x: x.strip())

    #Stemming
    if stemming is not None:
        ps = PorterStemmer()
        s1['Text']=s1['Text'].apply(lambda x:" ".join(ps.stem(word) for word in x.split()))

    #Lemmatization
    if lemmatization is not None:
        nltk.download('wordnet')
        s1['Text']= s1['Text'].apply(lambda x:" ".join([Word(word).lemmatize() for word in x.split()]))

    return s1

# 4. Feature Engineering

In [9]:
#Combine three feature engineering methods into one class
def feature_engineering(s, train=None,tf_idf=None, word2vec=None, word_count=None):
    #1. TF-IDF
    s1 = s.copy()
    if tf_idf is not None:
        tfv = TfidfVectorizer(min_df=3,  max_features=None,strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}', ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1, stop_words = 'english')
        # Fitting TF-IDF to both training and test sets (semi-supervised learning)
        tfv.fit(list(train['Text']))
        features =  tfv.transform(s1['Text'])
    #2. Word2Vec
    if word2vec is not None:
        nlp = spacy.load('en_core_web_sm')
        features = []
        for sentence in s1['Text']:
            doc = nlp(sentence)
            features.append(doc.vector)
    #3. Word-count document
    if word_count is not None:
        #Instantiate the vectorizer
        count_vectorizer = CountVectorizer()
        features = count_vectorizer.fit_transform(s1['Text'])

    return features

# 5. XGBoost

In [10]:
pre7_train = preprocess_text(train1, replace=1, remove_punctuation=1, lower=1,stopword=None,noisy=None,frequency_words=None,scared_word=None,lemmatization=1)
pre7_test = preprocess_text(test, replace=1, remove_punctuation=1, lower=1,stopword=None,noisy=None,frequency_words=None,scared_word=None,lemmatization=1)
# # TF-IDF
train_tf = feature_engineering(pre7_train, tf_idf=1, train=pre7_train, word2vec=None, word_count=None)
test_tf = feature_engineering(pre7_test, tf_idf=1, train=pre7_train, word2vec=None, word_count=None)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
train_y = pre7_train['Verdict']
y_test = pre7_test['Verdict']

In [None]:
#Randomly choose 10% dataset from the original one
X_train, X_one_ten, y_train, y_one_ten = train_test_split(train_tf, train_y, test_size=0.1, random_state=42)

In [14]:
xgb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
xgb.fit(X_one_ten, y_one_ten)
y_pred = xgb.predict(test_tf)
score = []
f1_macro = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)

[[0.5468871920880969, 0.561, 0.5709124197031853, 0.5609999999999999]]


## 5.2 Hyperparameter Tunning

In [15]:
#Learning Rate
lr = [0.001,1,5,10,20]
result = []
for i in lr:
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=i, max_depth=1, random_state=0)
    clf.fit(X_one_ten, y_one_ten)
    y_pred = clf.predict(test_tf)
    f1_macro = f1_score(y_test, y_pred, average='macro')
    result.append(f1_macro)
print(result)

[0.1, 0.5468871920880969, 0.23466887744501796, 0.1128348906971933, 0.16604359824495227]


In [16]:
#Learning Rate
n_estimators = [200,500,100]
result = []
for i in n_estimators:
    clf = GradientBoostingClassifier(n_estimators=i, learning_rate=1, max_depth=1, random_state=0)
    clf.fit(X_one_ten, y_one_ten)
    y_pred = clf.predict(test_tf)
    f1_macro = f1_score(y_test, y_pred, average='macro')
    result.append(f1_macro)
print(result)

[0.575792082245718, 0.5947006450429947, 0.5468871920880969]


In [17]:
#Max Depth
max_depth = [3,5]
result = []
for i in max_depth:
    clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=1, max_depth=i, random_state=0)
    clf.fit(X_one_ten, y_one_ten)
    y_pred = clf.predict(test_tf)
    f1_macro = f1_score(y_test, y_pred, average='macro')
    result.append(f1_macro)
print(result)

[0.6056158475293896, 0.609257245227721]


## 5.3 Best Model

In [13]:
result = []
clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=1, max_depth=5, random_state=0)
clf.fit(train_tf, train_y)
y_pred = clf.predict(test_tf)
f1_macro = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
result.append([f1_macro,accuracy,precision_macro,recall_macro])
print(result)

## 5.4 Large and Small Dataset

In [14]:
train5 = train1.sample(n=20000, random_state=1)
test5 = test

In [15]:
pre7_train = preprocess_text(train5,replace=1, remove_punctuation=1, lower=1,stopword=None,noisy=None,frequency_words=None,scared_word=None,lemmatization=1)
pre7_test = preprocess_text(test5,replace=1, remove_punctuation=1, lower=1,stopword=None,noisy=None,frequency_words=None,scared_word=None,lemmatization=1)

In [16]:
# TF-IDF
train_tf_all_s = feature_engineering(pre7_train, tf_idf=1, train=pre7_train, word2vec=None, word_count=None)
test_tf_all_s = feature_engineering(pre7_test, tf_idf=1, train=pre7_train, word2vec=None, word_count=None)

In [17]:
train_y_s = pre7_train['Verdict']
y_test_s = pre7_test['Verdict']

In [18]:
lg = GradientBoostingClassifier(n_estimators=500, learning_rate=1, max_depth=i, random_state=0)
lg.fit(train_tf_all_s, train_y_s)
y_pred = lg.predict(test_tf_all_s)
score = []
f1_macro = f1_score(y_test_s, y_pred, average='macro')
accuracy = accuracy_score(y_test_s, y_pred)
precision_macro = precision_score(y_test_s, y_pred, average='macro')
recall_macro = recall_score(y_test_s, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)