In [1]:
import pandas as pd
import csv
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

# import tensorflow as tf
# from tensorflow import keras
# from keras.preprocessing.sequence import pad_sequences
# from keras.preprocessing.text import one_hot , Tokenizer, hashing_trick, text_to_word_sequence
from simpletransformers.classification import ClassificationModel
import spacy
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser" , "ner"])

In [2]:
traindf = pd.read_csv("train.csv", encoding = 'UTF-8')
testdf_woLabel1 = pd.read_csv("test.csv", encoding = 'UTF-8')
testLabels = pd.read_csv("submit.csv", encoding = 'UTF-8')
testdf = pd.merge(testdf_woLabel1 , testLabels , on = ['id'])
traindf.dropna(how = 'any' , inplace = True)
testdf.dropna(how = 'any' , inplace = True) 
X_train = traindf.drop(['id' , 'label'] , axis = 1)
y_train = traindf["label"]
X_test = testdf.drop(['id' , 'label'] , axis = 1)
y_test = testdf["label"]

In [3]:
X_train.head()

Unnamed: 0,title,author,text
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [4]:
X_test.head()

Unnamed: 0,title,author,text
0,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
2,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
6,Pelosi Calls for FBI Investigation to Find Out...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori..."


In [5]:
print(f'Shape of train is : {X_train.shape}')
print(f'Shape of test is : {X_test.shape}')

Shape of train is : (18285, 3)
Shape of test is : (4575, 3)


### Preprocessing Data

In [6]:
def genRowVal(df):
    for row in df.iteritems():
        yield row[1]


def preprocess(data , NLP = nlp):
    corpus = []
    for row in genRowVal(data):
        doc = nlp(row)
        tokens = [token.lemma_ for token in doc if (token.is_stop == False and token.is_alpha == True)] 
        corpus.append(' '.join(tokens))
    return corpus

In [7]:
train_X = X_train.title + ' ' + X_train.author + ' ' + X_train.text
test_X = X_test.title + ' ' + X_test.author + ' ' + X_test.text

In [8]:
print(f'Shape of train is : {train_X.shape}')
print(f'Shape of test  is : {test_X.shape}')

Shape of train is : (18285,)
Shape of test  is : (4575,)


In [9]:
train_X_processed = preprocess(train_X)
test_X_processes  = preprocess(test_X)

In [10]:
print(f'Samples in train are : {len(train_X_processed)}')
print(f'Samples in test  are  : {len(test_X_processes)}')

Samples in train are : 18285
Samples in test  are  : 4575


### Logistic Regression based model for benchmarking

In [11]:
logistic_Pipeline = Pipeline([
                                ('vectorizer' , TfidfVectorizer(lowercase= True)),
                                ('logisticModel' , LogisticRegression(dual = False , random_state = 12345))  
                                     ])

hyperPara_Logistic = {                 
              'logisticModel__C': [0.01, 0.1, 1 , 5], 
              'logisticModel__max_iter' : [5000] ,   
              'logisticModel__solver' : ['newton-cg' , 'lbfgs'], 
             }

logistic_Model = GridSearchCV(logistic_Pipeline , param_grid = hyperPara_Logistic , cv = 5)

In [12]:
logistic_Model.fit(train_X_processed, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                                       ('logisticModel',
                                        LogisticRegression(random_state=12345))]),
             param_grid={'logisticModel__C': [0.01, 0.1, 1, 5],
                         'logisticModel__max_iter': [5000],
                         'logisticModel__solver': ['newton-cg', 'lbfgs']})

In [13]:
logistic_Model.best_params_

{'logisticModel__C': 5,
 'logisticModel__max_iter': 5000,
 'logisticModel__solver': 'newton-cg'}

In [14]:
logistic_Pred = logistic_Model.predict(test_X_processes)
print(classification_report(y_test , logistic_Pred))

              precision    recall  f1-score   support

           0       0.58      0.69      0.63      2213
           1       0.65      0.54      0.59      2362

    accuracy                           0.61      4575
   macro avg       0.62      0.61      0.61      4575
weighted avg       0.62      0.61      0.61      4575



Since extrem value of C is choosen during grid search, so checking for higher values in the network

In [15]:
hyperPara_Logistic2 = {                 
              'logisticModel__C': [10,15,25,30], 
              'logisticModel__max_iter' : [5000] ,   
              'logisticModel__solver' : ['newton-cg'], 
             }

logistic_Model2 = GridSearchCV(logistic_Pipeline , param_grid = hyperPara_Logistic2 , cv = 5)

In [16]:
logistic_Model2.fit(train_X_processed, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                                       ('logisticModel',
                                        LogisticRegression(random_state=12345))]),
             param_grid={'logisticModel__C': [10, 15, 25, 30],
                         'logisticModel__max_iter': [5000],
                         'logisticModel__solver': ['newton-cg']})

In [17]:
logistic_Model2.best_params_

{'logisticModel__C': 30,
 'logisticModel__max_iter': 5000,
 'logisticModel__solver': 'newton-cg'}

In [18]:
logistic_Pred2 = logistic_Model2.predict(test_X_processes)
print(classification_report(y_test , logistic_Pred2))

              precision    recall  f1-score   support

           0       0.59      0.69      0.63      2213
           1       0.65      0.54      0.59      2362

    accuracy                           0.61      4575
   macro avg       0.62      0.62      0.61      4575
weighted avg       0.62      0.61      0.61      4575



With change in C value, no furthur improvement is seen in model performance so selecting model named 'logistic_Model' as final model.

### Random Forest based Model

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
RF_Pipeline = Pipeline([
                        ('vectorizer' , TfidfVectorizer(lowercase=True)),
                        ('RandomForest' , RandomForestClassifier())  
                                     ])

hyperPara_RF = {                 
              'RandomForest__ccp_alpha' : [0.1,0,1], 
              'RandomForest__criterion' : ['gini', 'entropy'],
              'RandomForest__n_estimators': [100,200,300,400]
    
             }

rf_model = GridSearchCV(RF_Pipeline , param_grid = hyperPara_RF , cv = 5)

In [21]:
rf_model.fit(train_X_processed, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                                       ('RandomForest',
                                        RandomForestClassifier())]),
             param_grid={'RandomForest__ccp_alpha': [0.1, 0, 1],
                         'RandomForest__criterion': ['gini', 'entropy'],
                         'RandomForest__n_estimators': [100, 200, 300, 400]})

In [22]:
rf_model.best_params_

{'RandomForest__ccp_alpha': 0,
 'RandomForest__criterion': 'gini',
 'RandomForest__n_estimators': 300}

In [23]:
rf_pred = rf_model.predict(test_X_processes)
print(classification_report(y_test , rf_pred))

              precision    recall  f1-score   support

           0       0.59      0.76      0.67      2213
           1       0.70      0.51      0.59      2362

    accuracy                           0.63      4575
   macro avg       0.64      0.64      0.63      4575
weighted avg       0.65      0.63      0.63      4575



### Neural Networks

* BERT based model

This model is build using Simple Transform pipeline and input text is preprocessed to remove redundant text.

In [24]:
bert_model = ClassificationModel('bert', 'bert-base-cased', num_labels=2, 
                            args={'reprocess_input_data': True, 'overwrite_output_dir': True},use_cuda=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [25]:
df_processed = pd.DataFrame(train_X_processed , columns=['TrainData'])
y_train2 = y_train.copy(deep = True)
y_train2 = y_train2.reset_index()
y_train2 = y_train2.drop(['index'] , axis = 1)
training_data_joined = df_processed.join(y_train2)

In [26]:
bert_model.train_model(training_data_joined)



HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=2286.0, style=ProgressStyle(de…







(2286, 0.10436240554153987)

In [27]:
df_processed_test = pd.DataFrame(test_X_processes , columns=['TrainData'])
y_test2 = y_test.copy(deep = True)
y_test2 = y_test2.reset_index()
y_test2 = y_test2.drop(['index'] , axis = 1)
test_data_joined = df_processed_test.join(y_test2)

In [28]:
result, model_outputs, wrong_predictions = bert_model.eval_model(test_data_joined)



HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=572.0, style=ProgressStyle(descr…




In [31]:
predicted_y_test = [np.argmax(val) for val in model_outputs]

In [32]:
print(classification_report(y_test , predicted_y_test))

              precision    recall  f1-score   support

           0       0.58      0.69      0.63      2213
           1       0.65      0.54      0.59      2362

    accuracy                           0.61      4575
   macro avg       0.62      0.61      0.61      4575
weighted avg       0.62      0.61      0.61      4575



Applying sliding window inference with BERT to consider larger sequence of data

In [6]:
#new run with sliding window inference 
train_Sliding_Window = X_train.title + ' ' + X_train.author + ' ' + X_train.text
train_Sliding_Window_df = pd.DataFrame(train_Sliding_Window).join(y_train)
test_Sliding_Window = X_test.title + ' ' + X_test.author + ' ' + X_test.text
test_Sliding_Window_df = pd.DataFrame(test_Sliding_Window).join(y_test)

In [7]:
train_Sliding_Window_df = train_Sliding_Window_df.rename(columns={0: 'text'})

In [8]:
train_Sliding_Window_df.head()

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You Fired Consortiumne...,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1


In [9]:
test_Sliding_Window_df = test_Sliding_Window_df.rename(columns={0: 'text'})

In [10]:
test_Sliding_Window_df.head()

Unnamed: 0,text,label
0,"Specter of Trump Loosens Tongues, if Not Purse...",0
2,#NoDAPL: Native American Leaders Vow to Stay A...,0
3,"Tim Tebow Will Attempt Another Comeback, This ...",1
4,Keiser Report: Meme Wars (E995) Truth Broadcas...,1
6,Pelosi Calls for FBI Investigation to Find Out...,1


In [11]:
# for apply changing preprocess step
def preprocess2(data , NLP = nlp):
    #corpus = []    
    doc = nlp(data)
    tokens = [token.lemma_ for token in doc if (token.is_stop == False and token.is_alpha == True)] 
    return  ''.join(tokens)
    #return corpus

In [12]:
test_Sliding_Window_df.text = test_Sliding_Window_df.text.apply(lambda x : preprocess2(x))

In [13]:
test_Sliding_Window_df.head()

Unnamed: 0,text,label
0,SpecterTrumpLoosensTonguesPurseStringsSiliconV...,0
2,NoDAPLNativeAmericanLeadersVowStayWinterFileLa...,0
3,TimTebowAttemptComebackTimeBaseballNewYorkTime...,1
4,KeiserReportMemeWarsTruthBroadcastNetworkminag...,1
6,PelosiCallsFBIInvestigationFindRussianDonaldTr...,1


In [14]:
train_Sliding_Window_df.text = train_Sliding_Window_df.text.apply(lambda x : preprocess2(x))

In [15]:
train_Sliding_Window_df.head()

Unnamed: 0,text,label
0,HouseDemAideComeyLetterJasonChaffetzTweetedDar...,1
1,FLYNNHillaryClintonBigWomanCampusBreitbartDani...,0
2,TruthFiredTruthFiredOctobertensionintelligence...,1
3,CiviliansKilledSingleAirstrikeIdentifiedJessic...,1
4,Iranianwomanjailfictionalunpublishedstorywoman...,1


In [16]:
#bert specific tokenization is handeled by Simple Transform library 

In [17]:
model_Sliding_Window = ClassificationModel('bert', 'bert-base-cased', num_labels=2, 
                            args={'reprocess_input_data': True, 
                                  'overwrite_output_dir': True , 
                                  'sliding_window' : True 
                                 # , 'eval_batch_size' : 2
                                 },use_cuda=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [18]:
model_Sliding_Window.train_model(train_Sliding_Window_df)



HBox(children=(FloatProgress(value=0.0, max=18285.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=2286.0, style=ProgressStyle(de…







(2286, 0.6852637930387796)

In [19]:
result_SW, model_outputs_SW, wrong_predictions_SW = model_Sliding_Window.eval_model(test_Sliding_Window_df)



HBox(children=(FloatProgress(value=0.0, max=4575.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=572.0, style=ProgressStyle(descr…




In [None]:
predicted_class  = model.predict(list(test_Sliding_Window_df.text))

In [20]:
predicted_y_test_SW = [np.argmax(val) for val in model_outputs_SW]
print(classification_report(test_Sliding_Window_df.label , predicted_y_test_SW))

              precision    recall  f1-score   support

           0       0.49      0.99      0.66      2213
           1       0.86      0.03      0.06      2362

    accuracy                           0.50      4575
   macro avg       0.67      0.51      0.36      4575
weighted avg       0.68      0.50      0.35      4575



It can be seen that F1 Score for class 1 which is fake news is extremly low with sliding window inference and also accuracy of the model is low compared to other models built till now.

In [21]:
# passing input without tokenization 
#new run with sliding window inference 
train_Complete = X_train.title + ' ' + X_train.author + ' ' + X_train.text
train_Complete_df = pd.DataFrame(train_Complete).join(y_train)
test_Complete = X_test.title + ' ' + X_test.author + ' ' + X_test.text
test_Complete_df = pd.DataFrame(test_Complete).join(y_test)

In [22]:
train_Complete_df = train_Complete_df.rename(columns={0: 'text'})
test_Complete_df = test_Complete_df.rename(columns={0: 'text'})

In [27]:
train_Complete_df.head()

Unnamed: 0,text,label
0,House Dem Aide We Didn t Even See Comey s Let...,1
1,FLYNN Hillary Clinton Big Woman on Campus ...,0
2,Why the Truth Might Get You Fired Consortiumne...,1
3,Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1


In [23]:
import re

In [24]:
train_Complete_df.text = train_Complete_df.text.apply(lambda x : re.sub("[^a-zA-Z]", " ", x))

In [25]:
test_Complete_df.text = test_Complete_df.text.apply(lambda x : re.sub("[^a-zA-Z]", " ", x))

In [26]:
model_complete_data = ClassificationModel('bert', 'bert-base-cased', num_labels=2, 
                            args={'reprocess_input_data': True, 
                                  'overwrite_output_dir': True , 
                                  'sliding_window' : False 
                                 # , 'eval_batch_size' : 2
                                 },use_cuda=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [28]:
model_complete_data.train_model(train_Complete_df)



HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=2286.0, style=ProgressStyle(de…







(2286, 0.08490984376799957)

In [29]:
result_Comp, model_outputs_Comp, wrong_predictions_Comp = model_complete_data.eval_model(test_Complete_df)



HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=572.0, style=ProgressStyle(descr…




In [31]:
predicted_class  = model_complete_data.predict(list(test_Complete_df.text))

HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))




In [37]:
print(classification_report(test_Complete_df.label , predicted_class[0]))

              precision    recall  f1-score   support

           0       0.58      0.69      0.63      2213
           1       0.65      0.54      0.59      2362

    accuracy                           0.61      4575
   macro avg       0.62      0.61      0.61      4575
weighted avg       0.62      0.61      0.61      4575



In [39]:
# sliding window with complete data 
model_complete_data_SS = ClassificationModel('bert', 'bert-base-cased', num_labels=2, 
                            args={'reprocess_input_data': True, 
                                  'overwrite_output_dir': True , 
                                  'sliding_window' : True 
                                 # , 'eval_batch_size' : 2
                                 },use_cuda=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [41]:
model_complete_data_SS.train_model(train_Complete_df)



HBox(children=(FloatProgress(value=0.0, max=18285.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=21745.0, style=ProgressStyle(d…







(21745, 0.24151869044288485)

In [42]:
predicted_class_SS  = model_complete_data_SS.predict(list(test_Complete_df.text))

HBox(children=(FloatProgress(value=0.0, max=4575.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5544.0), HTML(value='')))




In [43]:
print(classification_report(test_Complete_df.label , predicted_class_SS[0]))

              precision    recall  f1-score   support

           0       0.57      0.69      0.63      2213
           1       0.64      0.52      0.57      2362

    accuracy                           0.60      4575
   macro avg       0.61      0.60      0.60      4575
weighted avg       0.61      0.60      0.60      4575

