# 1. Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
import scipy
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from textblob import Word
from sklearn.utils import resample
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score
import pickle
from sklearn import preprocessing
from scipy.sparse import csr_matrix,hstack
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
!pip3 install spacy
!python3 -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


# 2. Read Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Read files
train = pd.read_csv("/content/drive/MyDrive/CS4248_Project/raw_data/fulltrain.csv",header=None)
test = pd.read_csv("/content/drive/MyDrive/CS4248_Project/raw_data/balancedtest.csv",header=None)

In [None]:
train.columns = ['Verdict','Text']
test.columns = ['Verdict','Text']

In [None]:
#Unbalance training data
train['Verdict'].value_counts()

3    17870
1    14047
4     9995
2     6942
Name: Verdict, dtype: int64

In [None]:
train1=train.copy()
test1 = test.copy()

In [None]:
from sklearn.model_selection import train_test_split
y=train['Verdict']
X=train['Text']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1, random_state=42)

In [None]:
X_test_p = pd.DataFrame(X_test)
y_test_p = pd.DataFrame(y_test)

In [None]:
data_small = pd.concat([X_test_p,y_test_p],axis=1)
data_small

Unnamed: 0,Text,Verdict
14019,According to witnesses at your old high school...,1
44568,The National Immigration Agency (NIA) announce...,4
43613,The enforcement division of the Securities and...,4
2213,"In an effort to bolster its flagging ratings, ...",1
17804,Sarah Palin Makes Tragically Heartbreaking New...,2
...,...,...
3504,Back to story: Netflix Board Of Directors Meet...,1
28162,Germany Bans Fracking But Theres a HitchBy Bra...,3
43123,They are forever shocked to be visited in thei...,4
14471,WOW: Secret Hillary Video Leaks... She Just Lo...,2


In [None]:
data_small_X = data_small['Text']
data_small_y = data_small['Verdict']

In [None]:
test_X = test['Text']
test_y = test['Verdict']

# 3. Preprocessing for Text

In [None]:
def preprocess_text(s, replace=None, remove_punctuation = None, lower=None,stopword=None,frequency_words=None,scared_word=None, noisy=None, stemming=None,lemmatization=None):
    #Throw an error is both stemming and lemmatization are not None

    s1 = s.copy()
    if stemming is not None and lemmatization is not None:
        raise ValueError('Stemming and Lemmatization cannot both be not None!')


    if replace is not None:
        #Replace URLs with 'webaddress'
        s1['Text'] = s1['Text'].str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress',regex=True)
        #Replace email address with 'email'
        s1['Text'] = s1['Text'].str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress',regex=True)
        #Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
        s1['Text'] = s1['Text'].str.replace(r'£|\$', 'moneysymb',regex=True)

        #Replace percentage symbols with 'percentage'
        s1['Text'] = s1['Text'].str.replace(r'%', 'percentage',regex=True)

        #Replace 10 digit phone number
        s1['Text'] = s1['Text'].str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr',regex=True)
        # Replace numbers with 'numbr'
        s1['Text'] = s1['Text'].str.replace(r'\d+(\.\d+)?', 'numbr',regex=True)
    #Remove punctuation
    if remove_punctuation is not None:
        s1['Text'] = s1['Text'].apply(lambda x: re.sub(r'[^\w\s\d]', '', x))

    #Transform to lower letter
    if lower is not None:
        s1['Text'] = s1['Text'].apply(lambda x: x.lower())

    #Remove the stopwords
    if stopword is not None:
        stop=stopwords.words('english')
        s1['Text']=s1['Text'].apply(lambda sen:" ".join(x for x in sen.split() if x not in stop))

    #Remove the frequency words
    if frequency_words is not None:
        freq=pd.Series(' '.join(s).split()).value_counts()[:10]
        freq=list(freq.index)
        s1['Text']=s1['Text'].apply(lambda sen:' '.join(x for x in sen.split() if x not in freq))

    # Remove the scarce word
    if scared_word is not None:
        scared = pd.Series(' '.join(s).split()).value_counts()[-10:]
        scared = list(scared.index)
        s1['Text'] = s1['Text'].apply(lambda sen: " ".join(x for x in sen.split() if x not in scared))

    #Noisy Removal
    if noisy is not None:
        #remove non-ascii
        s1['Text']= s1['Text'].apply(lambda x: re.sub("(\\W)"," ",x))
        #remove whitespace
        s1['Text']=s1['Text'].apply(lambda x: x.strip())

    #Stemming
    if stemming is not None:
        ps = PorterStemmer()
        s1['Text']=s1['Text'].apply(lambda x:" ".join(ps.stem(word) for word in x.split()))

    #Lemmatization
    if lemmatization is not None:
        nltk.download('wordnet')
        s1['Text']= s1['Text'].apply(lambda x:" ".join([Word(word).lemmatize() for word in x.split()]))

    return s1

# 4. Feature Engineering

In [None]:
#Combine three feature engineering methods into one class
def feature_engineering(s, train=None,tf_idf=None, word2vec=None, word_count=None):
    #1. TF-IDF
    s1 = s.copy()
    if tf_idf is not None:
        tfv = TfidfVectorizer(min_df=3,  max_features=None,strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}', ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1, stop_words = 'english')
        # Fitting TF-IDF to both training and test sets (semi-supervised learning)
        tfv.fit(list(train['Text']))
        features =  tfv.transform(s1['Text'])
    #2. Word2Vec
    if word2vec is not None:
        nlp = spacy.load('en_core_web_sm')
        features = []
        for sentence in s1['Text']:
            doc = nlp(sentence)
            features.append(doc.vector)
    #3. Word-count document
    if word_count is not None:
        #Instantiate the vectorizer
        count_vectorizer = CountVectorizer()
        features = count_vectorizer.fit_transform(s1['Text'])

    return features

# 5. Logistic Regression

## 5.1 Data Preprocessing (no data preprocessing vs data preprocessing)

In [None]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [None]:
# Preproecssing
# pr1 = no data preprocessing
# pre2 = replace, remove_punctionation, lower
# pre3 = replace, remove_punctionation, lower, stopword
# pre4 = replace, remove_punctionation, lower, stopword, noisy
# pre5 = replace, remove_punctionation, lower, stopword, noisy, frequency_words, scared_word
# pre6 = replace, remove_punctionation, lower, stopword, noisy, frequency_words, scared_word, lemmatization
# pre7 = replace, remove_punctionation, lower, lemmatization

pre1_train = preprocess_text(data_small)
pre1_test = preprocess_text(test)
pre2_train = preprocess_text(data_small, replace=1, remove_punctuation=1, lower=1)
pre2_test = preprocess_text(test, replace=1, remove_punctuation=1, lower=1)
pre3_train = preprocess_text(data_small, replace=1, remove_punctuation=1, lower=1,stopword=1)
pre3_test = preprocess_text(test, replace=1, remove_punctuation=1, lower=1,stopword=1)
pre4_train = preprocess_text(data_small, replace=1, remove_punctuation=1, lower=1,stopword=1,noisy=1)
pre4_test = preprocess_text(test, replace=1, remove_punctuation=1, lower=1,stopword=1,noisy=1)
pre5_train = preprocess_text(data_small, replace=1, remove_punctuation=1, lower=1,stopword=1,noisy=1,frequency_words=1,scared_word=1)
pre5_test = preprocess_text(test, replace=1, remove_punctuation=1, lower=1,stopword=1,noisy=1,frequency_words=1,scared_word=1)
pre6_train = preprocess_text(data_small, replace=1, remove_punctuation=1, lower=1,stopword=1,noisy=1,frequency_words=1,scared_word=1,lemmatization=1)
pre6_test = preprocess_text(test, replace=1, remove_punctuation=1, lower=1,stopword=1,noisy=1,frequency_words=1,scared_word=1,lemmatization=1)
pre7_train = preprocess_text(data_small, replace=1, remove_punctuation=1, lower=1,stopword=None,noisy=None,frequency_words=None,scared_word=None,lemmatization=1)
pre7_test = preprocess_text(test, replace=1, remove_punctuation=1, lower=1,stopword=None,noisy=None,frequency_words=None,scared_word=None,lemmatization=1)

pre_train = [(pre1_train,pre1_test),(pre2_train,pre2_test), (pre3_train,pre3_test), (pre4_train,pre4_test), (pre5_train,pre5_test),(pre6_train,pre6_test),(pre7_train,pre7_test)]

for i in pre_train:
    train=i[0]
    test=i[1]
    train_tf = feature_engineering(train, tf_idf=1, train=train, word2vec=None, word_count=None)
    test_tf = feature_engineering(test, tf_idf=1, train=train, word2vec=None, word_count=None)
    clf = LogisticRegression(C=1.0, solver='lbfgs',max_iter=3000)
    clf.fit(train_tf, train['Verdict'])
    y_pred = clf.predict(test_tf)
    y_test=test['Verdict']
    score = []
    f1_macro = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    precision_macro = precision_score(y_test, y_pred, average='macro')
    recall_macro = recall_score(y_test, y_pred, average='macro')
    score.append([f1_macro,accuracy,precision_macro,recall_macro])
    print(score)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[[0.6564849626947871, 0.6763333333333333, 0.7131907443613872, 0.6763333333333333]]
[[0.6640057322935629, 0.6896666666666667, 0.7238802241381181, 0.6896666666666667]]
[[0.6613423276026961, 0.687, 0.7194724199670502, 0.687]]
[[0.6613423276026961, 0.687, 0.7194724199670502, 0.687]]
[[0.6613423276026961, 0.687, 0.7194724199670502, 0.687]]
[[0.6714395719827623, 0.695, 0.7302885119037538, 0.6950000000000001]]
[[0.6730502196339856, 0.697, 0.7336226717847157, 0.697]]


We can find that pre7 can get the best score. (pre7 = replace, remove_punctionation, lower, lemmatization)

## 5.2 Difference of feature engineering

In [None]:
pre7_alltrain = preprocess_text(train1, replace=1, remove_punctuation=1, lower=1,stopword=None,noisy=None,frequency_words=None,scared_word=None,lemmatization=1)
pre7_test = preprocess_text(test, replace=1, remove_punctuation=1, lower=1,stopword=None,noisy=None,frequency_words=None,scared_word=None,lemmatization=1)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# TF-IDF
train_tf = feature_engineering(pre7_alltrain, tf_idf=1, train=pre6_train, word2vec=None, word_count=None)
test_tf = feature_engineering(pre7_test, tf_idf=1, train=pre6_train, word2vec=None, word_count=None)
#Word2Vec
train_w2v = feature_engineering(pre7_alltrain, tf_idf=None, train=pre6_train, word2vec=1, word_count=None)
test_w2v = feature_engineering(pre7_test, tf_idf=None, train=pre6_train, word2vec=1, word_count=None)


### 5.2.1 TF-IDF

In [None]:
#TF-IDF
clf = LogisticRegression(C=1.0, solver='lbfgs',max_iter=3000)
clf.fit(train_tf, pre7_alltrain['Verdict'])
y_pred = clf.predict(test_tf)
y_test=pre7_test['Verdict']
score = []
f1_macro = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)

[[0.7455367922168222, 0.7553333333333333, 0.7728430610239194, 0.7553333333333333]]


### 5.2.2 Word2Vec

In [None]:
#Word2Vec
clf = LogisticRegression(C=1.0, solver='lbfgs',max_iter=3000)
clf.fit(train_w2v, pre7_alltrain['Verdict'])
y_pred = clf.predict(test_w2v)
y_test=pre7_test['Verdict']
score = []
f1_macro = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)

[[0.548621234271049, 0.5656666666666667, 0.569778908217901, 0.5656666666666667]]


### 5.2.3 LIWC

In [None]:
#LIWC
with open("/content/drive/MyDrive/CS4248 Project/liwc+glove/mlp_X_train.pickle", 'rb') as f1:
    LIWC_train = pickle.load(f1)
with open("/content/drive/MyDrive/CS4248 Project/liwc+glove/mlp_X_test.pickle", 'rb') as f2:
    LIWC_test = pickle.load(f2)

In [None]:
LIWC_train = pd.DataFrame(LIWC_train)
LIWC_test = pd.DataFrame(LIWC_test)

In [None]:
with open("/content/drive/MyDrive/CS4248 Project/liwc+glove/mlp_y_train.pickle", 'rb') as f3:
    LIWC_train_y = pickle.load(f3)
with open("/content/drive/MyDrive/CS4248 Project/liwc+glove/mlp_y_test.pickle", 'rb') as f4:
    LIWC_test_y = pickle.load(f4)

In [None]:
train_y = LIWC_train_y
test_y = LIWC_test_y
LIWC_train1 = LIWC_train.copy()
LIWC_test1 = LIWC_test.copy()

In [None]:
train_X = LIWC_train1.iloc[:,:112]
test_X = LIWC_test1.iloc[:,:112]

In [None]:
#Normalization
from sklearn import preprocessing

train_X_n = preprocessing.scale(train_X)
test_X_n = preprocessing.scale(test_X)

In [None]:
clf = LogisticRegression(C=1.0, solver='lbfgs',max_iter=3000)
clf.fit(train_X_n, train_y)
y_pred = clf.predict(test_X_n)
y_test=test_y
score = []
f1_macro = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)

[[0.476288217731061, 0.48933333333333334, 0.4738564991638031, 0.4893333333333333]]


#### 5.2.3.1 LIWC + Glove

In [None]:
#LIWC
with open("/content/drive/MyDrive/CS4248 Project/liwc+glove/mlp_X_train.pickle", 'rb') as f1:
    LIWC_train = pickle.load(f1)
with open("/content/drive/MyDrive/CS4248 Project/liwc+glove/mlp_X_test.pickle", 'rb') as f2:
    LIWC_test = pickle.load(f2)

In [None]:
LIWC_train = pd.DataFrame(LIWC_train)
LIWC_test = pd.DataFrame(LIWC_test)

In [None]:
with open("/content/drive/MyDrive/CS4248 Project/liwc+glove/mlp_y_train.pickle", 'rb') as f3:
    LIWC_train_y = pickle.load(f3)
with open("/content/drive/MyDrive/CS4248 Project/liwc+glove/mlp_y_test.pickle", 'rb') as f4:
    LIWC_test_y = pickle.load(f4)

In [None]:
train_y = LIWC_train_y
test_y = LIWC_test_y
LIWC_train1 = LIWC_train.copy()
LIWC_test1 = LIWC_test.copy()

In [None]:
train_X = LIWC_train1
test_X = LIWC_test1

In [None]:
#Normalization
from sklearn import preprocessing

train_X_n = preprocessing.scale(train_X)
test_X_n = preprocessing.scale(test_X)

In [None]:
clf = LogisticRegression(C=1.0, solver='lbfgs',max_iter=3000)
clf.fit(train_X_n, train_y)
y_pred = clf.predict(test_X_n)
y_test=test_y
score = []
f1_macro = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)

[[0.5239150358155775, 0.532, 0.5213275652189299, 0.532]]


####5.2.3.2 TF-IDF + LIWC

In [None]:
#TF-IDF + LIWC

pre7_alltrain = preprocess_text(train1, replace=1, remove_punctuation=1, lower=1,stopword=None,noisy=None,frequency_words=None,scared_word=None,lemmatization=1)
pre7_test = preprocess_text(test, replace=1, remove_punctuation=1, lower=1,stopword=None,noisy=None,frequency_words=None,scared_word=None,lemmatization=1)

# TF-IDF
train_tf_all = feature_engineering(pre7_alltrain, tf_idf=1, train=pre7_alltrain, word2vec=None, word_count=None)
test_tf_all = feature_engineering(pre7_test, tf_idf=1, train=pre7_alltrain, word2vec=None, word_count=None)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from scipy.sparse import csr_matrix,hstack
train_X = LIWC_train1.iloc[:,:112]
test_X = LIWC_test1.iloc[:,:112]
train_X_n = preprocessing.scale(train_X)
test_X_n = preprocessing.scale(test_X)

train_X_n_m = csr_matrix(train_X_n)
train_X_total = hstack([train_tf_all,train_X_n_m])
test_X_n_m = csr_matrix(test_X_n)
test_X_total = hstack([test_tf_all,test_X_n_m])

In [None]:
clf = LogisticRegression(C=1.0, solver='lbfgs',max_iter=3000)
clf.fit(train_X_total, train_y)
y_pred = clf.predict(test_X_total)
y_test=test_y
score = []
f1_macro = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)

[[0.5067317016117807, 0.5223333333333333, 0.5012040185318641, 0.5223333333333333]]


## 5.3 Hyperparameters Tuning

In [None]:
train_smalltf = feature_engineering(pre7_train, tf_idf=1, train=pre6_train, word2vec=None, word_count=None)

In [None]:
C = [45,60,70,80]
result = []
for i in C:
    clf = LogisticRegression(C=i,max_iter=1000)
    clf.fit(train_smalltf,pre7_train['Verdict'])
    y_pred = clf.predict(test_tf)
    y_test=pre7_test['Verdict']
    f1_macro = f1_score(y_test, y_pred, average='macro')
    result.append(f1_macro)
print(result)

[0.7217188724163944, 0.7225850096469508, 0.7229108796366671, 0.7225348534629683]


In [None]:
C = [0.0001,0.001,0.01,0.1]
result = []
for i in C:
    clf = LogisticRegression(C=i,max_iter=1000)
    clf.fit(train_smalltf,pre7_train['Verdict'])
    y_pred = clf.predict(test_tf)
    y_test=pre7_test['Verdict']
    f1_macro = f1_score(y_test, y_pred, average='macro')
    result.append(f1_macro)
print(result)

[0.1, 0.1, 0.10207209592375097, 0.4572874054654142]


In [None]:
C = [1,10,20,30]
result = []
for i in C:
    clf = LogisticRegression(C=i,max_iter=1000)
    clf.fit(train_smalltf,pre7_train['Verdict'])
    y_pred = clf.predict(test_tf)
    y_test=pre7_test['Verdict']
    f1_macro = f1_score(y_test, y_pred, average='macro')
    result.append(f1_macro)
print(result)

[0.6868295517497363, 0.7149693422660267, 0.7207270301536691, 0.7200622736546014]


In [None]:
max_iter = [100,500,1000,1500,2000,3000]
result = []
for i in max_iter:
    clf = LogisticRegression(C=70,max_iter=i)
    clf.fit(train_smalltf, pre7_train['Verdict'])
    y_pred = clf.predict(test_tf)
    y_test=pre6_test['Verdict']
    f1_macro = f1_score(y_test, y_pred, average='macro')
    result.append(f1_macro)
print(result)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[0.7208965316973053, 0.7229108796366671, 0.7229108796366671, 0.7229108796366671, 0.7229108796366671, 0.7229108796366671]


In [None]:
solver = ['newton-cg','sag','saga','lbfgs']
result = []
for i in solver:
    clf = LogisticRegression(C=70,max_iter=1000, solver=i)
    clf.fit(train_smalltf, pre7_train['Verdict'])
    y_pred = clf.predict(test_tf)
    y_test=pre6_test['Verdict']
    f1_macro = f1_score(y_test, y_pred, average='macro')
    result.append(f1_macro)
print(result)

[0.7229108796366671, 0.7223104254705561, 0.7212449677019634, 0.7229108796366671]


## 5.4 Final Training

### 5.4.1 TF-IDF

#### 5.4.1.1 Test Dataset

In [None]:
y_test=pre7_test['Verdict']
y_train=pre7_alltrain['Verdict']

In [None]:
# TF-IDF
train_tf = feature_engineering(pre7_alltrain, tf_idf=1, train=pre7_alltrain, word2vec=None, word_count=None)
test_tf = feature_engineering(pre7_test, tf_idf=1, train=pre7_alltrain, word2vec=None, word_count=None)

In [None]:
lg = LogisticRegression(C=70,max_iter=1000,solver='newton-cg')
lg.fit(train_tf, y_train)
y_pred = lg.predict(test_tf)
score = []
f1_macro = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)

[[0.7558780493242784, 0.7643333333333333, 0.7792575111598599, 0.7643333333333333]]


In [None]:
lg = LogisticRegression(C=70,max_iter=1500,solver='newton-cg')
lg.fit(train_tf, y_train)
y_pred = lg.predict(test_tf)
score = []
f1_macro = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)

[[0.7558780493242784, 0.7643333333333333, 0.7792575111598599, 0.7643333333333333]]


In [None]:
lg = LogisticRegression(C=70,max_iter=2000,solver='newton-cg')
lg.fit(train_tf, y_train)
y_pred = lg.predict(test_tf)
score = []
f1_macro = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)

[[0.7558780493242784, 0.7643333333333333, 0.7792575111598599, 0.7643333333333333]]


## 5.5 Analysis the result of TF-IDF + LIWC 

### 5.5.1 The original dataset

In [None]:
train2 = train1.copy()
test2 = test1.copy()

In [None]:
#Cross Validation
pre7_train = preprocess_text(train2, replace=1, remove_punctuation=1, lower=1,stopword=1,noisy=1,frequency_words=1,scared_word=1,lemmatization=1)
pre7_test = preprocess_text(test2, replace=1, remove_punctuation=1, lower=1,stopword=1,noisy=1,frequency_words=1,scared_word=1,lemmatization=1)
y_train = pre7_train['Verdict']
y_test = pre7_test['Verdict']

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# TF-IDF
train_tf = feature_engineering(pre7_train, tf_idf=1, train=pre7_train, word2vec=None, word_count=None)
test_tf = feature_engineering(pre7_test, tf_idf=1, train=pre7_train, word2vec=None, word_count=None)

In [None]:
#LIWC
LIWC_train = pd.read_csv("/content/drive/MyDrive/CS4248_Project/LIWC/df_train_spacy.csv")
LIWC_test = pd.read_csv("/content/drive/MyDrive/CS4248_Project/LIWC/df_test_spacy.csv")

train_y = LIWC_train['Verdict']
test_y = LIWC_test['Verdict']
LIWC_train1 = LIWC_train.copy()
LIWC_test1 = LIWC_test.copy()
LIWC_train1.drop(columns=['Verdict','Text'],inplace=True)
LIWC_test1.drop(columns=['Verdict','Text'],inplace=True)
train_X = LIWC_train1
test_X = LIWC_test1
#Normalization

train_X_n = preprocessing.scale(train_X)
test_X_n = preprocessing.scale(test_X)


In [None]:
from scipy.sparse import csr_matrix,hstack
train_X_n_m = csr_matrix(train_X_n)
train_X_total = hstack([train_tf,train_X_n_m])
test_X_n_m = csr_matrix(test_X_n)
test_X_total = hstack([test_tf,test_X_n_m])

In [None]:
# from sklearn.model_selection import GridSearchCV
lg = LogisticRegression(C=0.01,max_iter=1000,solver='newton-cg')
lg.fit(train_X_total, train_y)
y_pred = lg.predict(test_X_total)
score = []
f1_macro = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)

[[0.5438564372888904, 0.5466666666666666, 0.5425634864321488, 0.5466666666666666]]


### 5.5.2 Small dataset (45854 news)

In [None]:
train3 = train2.sample(frac=1.0, random_state=10)
train_tf2 = train3[0:45854]
test_tf = train3[45854:48854]

#Cross Validation
pre7_train = preprocess_text(train_tf2,replace=1, remove_punctuation=1, lower=1,stopword=None,noisy=None,frequency_words=None,scared_word=None,lemmatization=1)
pre7_test = preprocess_text(test_tf, replace=1, remove_punctuation=1, lower=1,stopword=None,noisy=None,frequency_words=None,scared_word=None,lemmatization=1)
y_train = pre7_train['Verdict']
y_test = pre7_test['Verdict']

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# TF-IDF
train_tf = feature_engineering(pre7_train, tf_idf=1, train=pre7_train, word2vec=None, word_count=None)
test_tf = feature_engineering(pre7_test, tf_idf=1, train=pre7_train, word2vec=None, word_count=None)

In [None]:
#LIWC
LIWC_all = pd.read_csv("/content/drive/MyDrive/CS4248_Project/LIWC/df_train_spacy.csv")
LIWC_all2 = LIWC_all.sample(frac=1.0, random_state=10)
LIWC_train = LIWC_all2[0:45854]
LIWC_test = LIWC_all2[45854:48854]
LIWC_train1 = LIWC_train.copy()
LIWC_test1 = LIWC_test.copy()
LIWC_y = LIWC_train['Verdict']
LIWC_y = LIWC_test['Verdict']
LIWC_train1.drop(columns=['Verdict','Text'],inplace=True)
LIWC_test1.drop(columns=['Verdict','Text'],inplace=True)
train_X = LIWC_train1
test_X = LIWC_test1

#Normalization

train_X_n = preprocessing.scale(train_X)
test_X_n = preprocessing.scale(test_X)


train_X_n_m = csr_matrix(train_X_n)
train_X_total = hstack([train_tf,train_X_n_m])
test_X_n_m = csr_matrix(test_X_n)
test_X_total = hstack([test_tf,test_X_n_m])

#The result in the training set
lg = LogisticRegression(C=0.01,max_iter=1000,solver='newton-cg')
lg.fit(train_X_total, y_train)
y_pred = lg.predict(train_X_total)
score = []
f1_macro = f1_score(y_train, y_pred, average='macro')
accuracy = accuracy_score(y_train, y_pred)
precision_macro = precision_score(y_train, y_pred, average='macro')
recall_macro = recall_score(y_train, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)



[[0.8776810966324724, 0.8871854145767, 0.8843099723863017, 0.8735467669220807]]


In [None]:
lg = LogisticRegression(C=0.01,max_iter=1000,solver='newton-cg')
lg.fit(train_X_total, y_train)
y_pred = lg.predict(test_X_total)
score = []
f1_macro = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)

[[0.8518225399944388, 0.8476666666666667, 0.8650446208834282, 0.8544400201185647]]


## 5.6 Analysis of TF-IDF

In [None]:
train4 = train2.sample(frac=1.0, random_state=10)
X_train = train4[0:45854]
X_test = train4[45854:48854]
pre7_train = preprocess_text(X_train, replace=1, remove_punctuation=1, lower=1,stopword=None,noisy=None,frequency_words=None,scared_word=None,lemmatization=1)
pre7_test = preprocess_text(X_test, replace=1, remove_punctuation=1, lower=1,stopword=None,noisy=None,frequency_words=None,scared_word=None,lemmatization=1)
y_train = X_train['Verdict']
y_test = X_test['Verdict']


# TF-IDF
train_tf = feature_engineering(pre7_train, tf_idf=1, train=X_train, word2vec=None, word_count=None)
test_tf = feature_engineering(pre7_test, tf_idf=1, train=X_train, word2vec=None, word_count=None)

#The result in the training set
lg = LogisticRegression(C=70,max_iter=1000,solver='newton-cg')
lg.fit(train_tf, y_train)
y_pred = lg.predict(train_tf)
score = []
f1_macro = f1_score(y_train, y_pred, average='macro')
accuracy = accuracy_score(y_train, y_pred)
precision_macro = precision_score(y_train, y_pred, average='macro')
recall_macro = recall_score(y_train, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[[0.9999734347469967, 0.9999781916517643, 0.9999850933158428, 0.9999617795444122]]


In [None]:
#The result in the test set
lg = LogisticRegression(C=70,max_iter=1000,solver='newton-cg')
lg.fit(train_tf, y_train)
y_pred = lg.predict(test_tf)
score = []
f1_macro = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)

[[0.9696553197571132, 0.9703333333333334, 0.9719263584310721, 0.9674837630168126]]


## 5.7 Larger and Smaller dataset

In [None]:
train5 = train1.sample(n=20000, random_state=1)
test5 = test1

In [None]:
# TF-IDF
train_tf_all_s = feature_engineering(pre7_train, tf_idf=1, train=pre7_train, word2vec=None, word_count=None)
test_tf_all_s = feature_engineering(pre7_test, tf_idf=1, train=pre7_train, word2vec=None, word_count=None)

In [None]:
train_y_s = pre7_train['Verdict']
y_test_s = pre7_test['Verdict']

In [None]:
# from sklearn.model_selection import GridSearchCV
lg = LogisticRegression(C=40,max_iter=1000,solver='newton-cg')
lg.fit(train_tf_all_s, train_y_s)
y_pred = lg.predict(test_tf_all_s)
score = []
f1_macro = f1_score(y_test_s, y_pred, average='macro')
accuracy = accuracy_score(y_test_s, y_pred)
precision_macro = precision_score(y_test_s, y_pred, average='macro')
recall_macro = recall_score(y_test_s, y_pred, average='macro')
score.append([f1_macro,accuracy,precision_macro,recall_macro])
print(score)

[[0.7469248824124232, 0.7556666666666667, 0.7691425162884407, 0.7556666666666667]]
