## **Import Packages**

In [1]:
import pandas as pd
from DataVisualization import DataVisualization
from TextCleaning import TextCleaning
from CredibilityScoreExtraction import CredibilityScoreExtraction
from SentimentFeatureExtraction import SentimentFeatureExtraction
from NERFeatureExtraction import NERFeatureExtraction
from POSTagFeatureExtraction import POSTagFeatureExtraction
from DependencyFeatureExtraction import DependencyFeatureExtraction
from MetaFeatureExtraction import MetaFeatureExtraction
from Utilities import PCAPercentage, FeatureScaling
from sklearn.ensemble import  RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, classification_report, make_scorer
from sklearn.model_selection import GridSearchCV

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prateek_bhardwaj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prateek_bhardwaj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prateek_bhardwaj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## **Loading Dataset**

In [2]:
## Path of Dataset
trainFile = './Dataset/LIAR/train.tsv'
testFile = './Dataset/LIAR/test.tsv'
valFile = './Dataset/LIAR/valid.tsv'

## Name of headers
headers = ["json ID", "label", "statement", "subject", "speaker", "job title", "state", "party", 
           "barely true", "false", "half true", "mostly true", "pants on fire", "los"]

## Loading Data
traindata = pd.read_csv(trainFile, delimiter='\t', encoding='utf-8', names=headers)
testdata = pd.read_csv(testFile, delimiter='\t', encoding='utf-8', names=headers)
valdata = pd.read_csv(valFile, delimiter='\t', encoding='utf-8', names=headers)

## concatenate 
traindata = pd.concat([traindata,valdata])
traindata.shape, testdata.shape

((11524, 14), (1267, 14))

In [3]:
traindata.head()

Unnamed: 0,json ID,label,statement,subject,speaker,job title,state,party,barely true,false,half true,mostly true,pants on fire,los
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


## **Data Cleaning**

In [4]:
## Removing Unncessary information
removed_col = ['json ID', 'barely true', 'false', 'half true', 'mostly true', 'pants on fire', 'los', 
               'party', 'state', 'subject', 'job title']
traindata.drop(removed_col, axis='columns', inplace=True)
testdata.drop(removed_col, axis='columns', inplace=True)

In [5]:
## Checking for nan
traindata.isna().sum()

label        0
statement    0
speaker      2
dtype: int64

In [6]:
## fillling speaker nan values
traindata.fillna({'speaker':"None"}, inplace=True)

Text Cleaning

In [7]:
cleaned_train = TextCleaning(traindata, 'statement').GetDataFrame()
cleaned_test = TextCleaning(testdata, 'statement').GetDataFrame()

Text Cleaning Starts
		Text Lowered
		Punctuation Removed
		Stopwords Removed
		Urls Removed
		HashTags Removed
		Numbers Removed
		Lemmatization Done
Text Cleaning Done

Text Cleaning Starts
		Text Lowered
		Punctuation Removed
		Stopwords Removed
		Urls Removed
		HashTags Removed
		Numbers Removed
		Lemmatization Done
Text Cleaning Done



## **Label Class**

In [8]:
y_train = cleaned_train['label']
y_test = cleaned_test['label']

# Feature Extraction And Individual Results

### **Vectorization**

In [9]:
x_string_train = cleaned_train['speaker'].map(str) + " " + cleaned_train['statement'].map(str)
x_string_test = cleaned_test['speaker'].map(str) + " " + cleaned_test['statement'].map(str)

In [10]:
vec = CountVectorizer(max_features=4500, ngram_range=(1,1))
x_vec_train = vec.fit_transform(x_string_train)
x_vec_test = vec.transform(x_string_test)

Grid Search CV

In [11]:
# parameters = {'max_depth':[100,250,500],
#               'n_estimators':[100,250,500],
#               'min_samples_leaf':[1,5,10],
#               'min_samples_split':[5,10,15]
#               }
# rf = RandomForestClassifier()

# scorer = make_scorer(f1_score, average = 'weighted')
# clf = GridSearchCV(rf, parameters, scoring=scorer, cv=10)
# clf.fit(x_vec_train, y_train)

# clf.best_params_

# # {'max_depth': 250,
# #  'min_samples_leaf': 1,
# #  'min_samples_split': 5,
# #  'n_estimators': 100}

Results

In [12]:
rf_vec = RandomForestClassifier(max_depth=250, min_samples_leaf=1, min_samples_split=5, n_estimators=100, random_state=42)
rf_vec.fit(x_vec_train, y_train)

RandomForestClassifier(max_depth=250, min_samples_split=5, random_state=42)

In [13]:
print("Training Accuracy: ", rf_vec.score(x_vec_train, y_train))
print("Testing Accuracy: ", rf_vec.score(x_vec_test, y_test))

Training Accuracy:  0.9927976397084346
Testing Accuracy:  0.2920284135753749


In [14]:
predict_vec_train = pd.DataFrame(rf_vec.predict_proba(x_vec_train), columns=["vec-1","vec-2","vec-3","vec-4","vec-5","vec-6"])
predict_vec_test = pd.DataFrame(rf_vec.predict_proba(x_vec_test), columns=["vec-1","vec-2","vec-3","vec-4","vec-5","vec-6"])

### **NER Feature Extraction**

In [15]:
# ## Extraction takes time, Run only if have time

# x_ner_train = NERFeatureExtraction(traindata, 'statement').GetDataFrame()
# x_ner_test = NERFeatureExtraction(testdata, 'statement').GetDataFrame()
# x_ner_train.to_csv('./Files/x_ner_train.csv', index = False)
# x_ner_test.to_csv('./Files/x_ner_test.csv', index = False)

In [16]:
## Load Saved NER Features
x_ner_train = pd.read_csv("./Files/x_ner_train.csv")
x_ner_test = pd.read_csv("./Files/x_ner_test.csv")

In [17]:
x_ner_train.head()

Unnamed: 0,PERSON,ORG,FAC,GPE,NORP,LOC,PRODUCT,EVENT,WORK_OF_ART,LAW,LANGUAGE,DATE,TIME,PERCENT,MONEY,CARDINAL,QUANTITY,ORDINAL
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Grid Search CV

In [18]:
# parameters = {'max_depth':[100,250,500],
#               'n_estimators':[100,250,500],
#               'min_samples_leaf':[1,5,10],
#               'min_samples_split':[5,10,15]
#               }
# rf = RandomForestClassifier()

# scorer = make_scorer(f1_score, average = 'weighted')
# clf = GridSearchCV(rf, parameters, scoring=scorer, cv=10)
# clf.fit(x_meta_train, y_train)

# clf.best_params_

# # {'max_depth': 250,
# #  'min_samples_leaf': 5,
# #  'min_samples_split': 5,
# #  'n_estimators': 250}

Results

In [19]:
rf_ner = RandomForestClassifier(max_depth=250, min_samples_leaf=5, min_samples_split=5, n_estimators=250, random_state=42)
rf_ner.fit(x_ner_train, y_train)

RandomForestClassifier(max_depth=250, min_samples_leaf=5, min_samples_split=5,
                       n_estimators=250, random_state=42)

In [20]:
print("Training Accuracy: ", rf_ner.score(x_ner_train, y_train))
print("Testing Accuracy: ", rf_ner.score(x_ner_test, y_test))

Training Accuracy:  0.29130510239500174
Testing Accuracy:  0.24861878453038674


### **POS Tag Feature Extraction**

In [21]:
# ## Extraction takes time, Run only if have time

# x_pos_train = POSTagFeatureExtraction(traindata, 'statement').GetDataFrame()
# x_pos_test = POSTagFeatureExtraction(testdata, 'statement').GetDataFrame()
# x_pos_train.to_csv('./Files/x_pos_train.csv', index = False)
# x_pos_test.to_csv('./Files/x_pos_test.csv', index = False)

In [22]:
## Load Saved POS Features
x_pos_train = pd.read_csv("./Files/x_pos_train.csv")
x_pos_test = pd.read_csv("./Files/x_pos_test.csv")

In [23]:
x_pos_train.head()

Unnamed: 0,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,X,PROPN,PUNCT,SCONJ,SYM,VERB,SPACE,CONJ
0,2,1,0,0,0,1,0,4,0,0,0,0,2,2,0,0,2,0,0
1,1,3,2,1,0,2,0,5,0,1,1,0,4,4,0,0,4,0,0
2,0,4,0,0,0,2,0,2,0,1,0,0,7,3,0,0,3,0,0
3,2,0,0,1,0,0,0,7,0,1,0,0,0,1,0,0,1,0,0
4,1,2,0,0,0,3,0,3,0,0,0,0,0,1,0,0,1,0,0


Grid Search CV

In [24]:
# parameters = {'max_depth':[100,250,500],
#               'n_estimators':[100,250,500],
#               'min_samples_leaf':[1,5,10],
#               'min_samples_split':[5,10,15]
#               }
# rf = RandomForestClassifier()

# scorer = make_scorer(f1_score, average = 'weighted')
# clf = GridSearchCV(rf, parameters, scoring=scorer, cv=10)
# clf.fit(x_pos_train, y_train)

# clf.best_params_

# # {'max_depth': 250,
# #  'min_samples_leaf': 1,
# #  'min_samples_split': 10,
# #  'n_estimators': 250}

Results

In [25]:
rf_pos = RandomForestClassifier(max_depth=250, min_samples_leaf=1, min_samples_split=10, n_estimators=250, random_state=42)
rf_pos.fit(x_pos_train, y_train)

RandomForestClassifier(max_depth=250, min_samples_split=10, n_estimators=250,
                       random_state=42)

In [26]:
print("Training Accuracy: ", rf_pos.score(x_pos_train, y_train))
print("Testing Accuracy: ", rf_pos.score(x_pos_test, y_test))

Training Accuracy:  0.9331829225963207
Testing Accuracy:  0.23520126282557222


### **Dependency Feature Extraction**

In [27]:
# ## Extraction takes time, Run only if have time

# x_dep_train = DependencyFeatureExtraction(traindata, 'statement').GetDataFrame()
# x_dep_test = DependencyFeatureExtraction(testdata, 'statement').GetDataFrame()
# x_dep_train.to_csv('./Files/x_dep_train.csv', index = False)
# x_dep_test.to_csv('./Files/x_dep_test.csv', index = False)

In [28]:
## Load Saved DEP Features
x_dep_train = pd.read_csv("./Files/x_dep_train.csv")
x_dep_test = pd.read_csv("./Files/x_dep_test.csv")

In [29]:
x_dep_train.head()

Unnamed: 0,ROOT,acl,acomp,advcl,advmod,agent,amod,appos,attr,aux,...,pobj,poss,preconj,predet,prep,prt,punct,quantmod,relcl,xcomp
0,1,0,0,0,0,0,2,0,0,0,...,1,0,0,0,1,0,2,0,0,0
1,2,0,0,1,2,0,1,0,0,1,...,2,0,0,0,2,1,4,0,0,1
2,1,0,0,0,0,0,0,0,0,1,...,3,0,0,0,4,0,3,0,0,1
3,1,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,1
4,1,0,0,0,0,0,1,0,0,0,...,2,1,0,0,2,0,1,0,0,0


Grid Search CV

In [30]:
# parameters = {'max_depth':[100,250,500],
#               'n_estimators':[100,250,500],
#               'min_samples_leaf':[1,5,10],
#               'min_samples_split':[5,10,15]
#               }
# rf = RandomForestClassifier()

# scorer = make_scorer(f1_score, average = 'weighted')
# clf = GridSearchCV(rf, parameters, scoring=scorer, cv=10)
# clf.fit(x_dep_train, y_train)

# clf.best_params_

# # {'max_depth': 250,
# #  'min_samples_leaf': 5,
# #  'min_samples_split': 10,
# #  'n_estimators': 250}

Results

In [31]:
rf_dep = RandomForestClassifier(max_depth=250, min_samples_leaf=5, min_samples_split=10, n_estimators=250, random_state=42)
rf_dep.fit(x_dep_train, y_train)

RandomForestClassifier(max_depth=250, min_samples_leaf=5, min_samples_split=10,
                       n_estimators=250, random_state=42)

In [32]:
print("Training Accuracy: ", rf_dep.score(x_dep_train, y_train))
print("Testing Accuracy: ", rf_dep.score(x_dep_test, y_test))

Training Accuracy:  0.7628427629295383
Testing Accuracy:  0.23362273086029992


### **Speaker Credibility**

In [33]:
x_speaker_cred_train = CredibilityScoreExtraction(cleaned_train,'speaker').GetDataFrame()
x_speaker_cred_test = CredibilityScoreExtraction(cleaned_test,'speaker').GetDataFrame()

Credibility Starts
Credibility Done

Credibility Starts
Credibility Done



In [34]:
x_speaker_cred_train.head()

Unnamed: 0,true,mostly-true,half-true,barely-true,false,pants-fire
0,0.0,0.0625,0.125,0.1875,0.3125,0.3125
1,0.0,0.0625,0.1875,0.1875,0.25,0.3125
2,0.196809,0.257092,0.264184,0.120567,0.136525,0.024823
3,0.023256,0.05814,0.05814,0.116279,0.244186,0.5
4,0.134021,0.195876,0.237113,0.164948,0.195876,0.072165


Grid Search CV

In [35]:
# parameters = {'max_depth':[100,250,500],
#               'n_estimators':[100,250,400],
#               'min_samples_leaf':[1,5,10],
#               'min_samples_split':[5,10,15]
#               }
# rf = RandomForestClassifier()

# scorer = make_scorer(f1_score, average = 'weighted')
# clf = GridSearchCV(rf, parameters, scoring=scorer, cv=10)
# clf.fit(x_speaker_cred_train, y_train)

# clf.best_params_

# # {'max_depth': 500,
# #  'min_samples_leaf': 10,
# #  'min_samples_split': 5,
# #  'n_estimators': 100}

Results

In [36]:
rf_speaker_cred = RandomForestClassifier(max_depth=500, min_samples_leaf=10, min_samples_split=5, n_estimators=100, random_state=42)
rf_speaker_cred.fit(x_speaker_cred_train, y_train)

RandomForestClassifier(max_depth=500, min_samples_leaf=10, min_samples_split=5,
                       random_state=42)

In [37]:
print("Training Accuracy: ", rf_speaker_cred.score(x_speaker_cred_train, y_train))
print("Testing Accuracy: ", rf_speaker_cred.score(x_speaker_cred_test, y_test))

Training Accuracy:  0.48307879208608123
Testing Accuracy:  0.6606156274664562


### **Sentiment Extraction**

In [38]:
x_sentiment_train = SentimentFeatureExtraction(traindata, 'statement').GetDataFrame()
x_sentiment_test = SentimentFeatureExtraction(testdata, 'statement').GetDataFrame()

Sentiment Feature Extraction Starts
Sentiment Feature Extraction Done

Sentiment Feature Extraction Starts
Sentiment Feature Extraction Done



In [39]:
x_sentiment_train.head()

Unnamed: 0,neg,neu,pos
0,0.115,0.692,0.192
1,0.0,0.902,0.098
2,0.107,0.687,0.206
3,0.0,0.606,0.394
4,0.0,1.0,0.0


## **Final**

In [40]:
x_train = pd.concat([x_speaker_cred_train,x_pos_train,x_ner_train,x_dep_train,x_sentiment_train], axis = 1)
x_test = pd.concat([x_speaker_cred_test,x_pos_test,x_ner_test,x_dep_test,x_sentiment_test], axis = 1)

In [41]:
x_train.head()

Unnamed: 0,true,mostly-true,half-true,barely-true,false,pants-fire,ADJ,ADP,ADV,AUX,...,predet,prep,prt,punct,quantmod,relcl,xcomp,neg,neu,pos
0,0.0,0.0625,0.125,0.1875,0.3125,0.3125,2,1,0,0,...,0,1,0,2,0,0,0,0.115,0.692,0.192
1,0.0,0.0625,0.1875,0.1875,0.25,0.3125,1,3,2,1,...,0,2,1,4,0,0,1,0.0,0.902,0.098
2,0.196809,0.257092,0.264184,0.120567,0.136525,0.024823,0,4,0,0,...,0,4,0,3,0,0,1,0.107,0.687,0.206
3,0.023256,0.05814,0.05814,0.116279,0.244186,0.5,2,0,0,1,...,0,0,0,1,0,0,1,0.0,0.606,0.394
4,0.134021,0.195876,0.237113,0.164948,0.195876,0.072165,1,2,0,0,...,0,2,0,1,0,0,0,0.0,1.0,0.0


### **Random Forest Classifier**

Grid Search CV

In [42]:
# parameters = {'max_depth':[100,250,500],
#               'n_estimators':[100,250,400],
#               'min_samples_leaf':[2,5,10],
#               'min_samples_split':[5,10,15]
#               }
# rf = RandomForestClassifier()

# scorer = make_scorer(f1_score, average = 'weighted')
# clf = GridSearchCV(rf, parameters, scoring=scorer, cv=10)
# clf.fit(x_train, y_train)

# clf.best_params_

# # {'max_depth': 500,
# #  'min_samples_leaf': 2,
# #  'min_samples_split': 10,
# #  'n_estimators': 250}

Results

In [43]:
rf = RandomForestClassifier(max_depth=500, min_samples_leaf=2, min_samples_split=10, n_estimators=250, random_state=42)
rf.fit(x_train, y_train)

RandomForestClassifier(max_depth=500, min_samples_leaf=2, min_samples_split=10,
                       n_estimators=250, random_state=42)

In [44]:
print("Training Accuracy: ", rf.score(x_train, y_train))
print("Testing Accuracy: ", rf.score(x_test, y_test))

Training Accuracy:  0.9507983339118362
Testing Accuracy:  0.6724546172059984


In [45]:
y_predict = rf.predict(x_test)
report = classification_report(y_predict, y_test)
print(report)

              precision    recall  f1-score   support

 barely-true       0.64      0.67      0.65       201
       false       0.73      0.72      0.73       253
   half-true       0.70      0.59      0.64       311
 mostly-true       0.65      0.64      0.64       245
  pants-fire       0.68      0.77      0.72        82
        true       0.63      0.75      0.68       175

    accuracy                           0.67      1267
   macro avg       0.67      0.69      0.68      1267
weighted avg       0.67      0.67      0.67      1267



### **Extra Tree Classifier**

Results

In [46]:
et = ExtraTreesClassifier(max_depth=500, min_samples_leaf=2, min_samples_split=10, n_estimators=250, bootstrap=True, random_state=42)
et.fit(x_train, y_train)

ExtraTreesClassifier(bootstrap=True, max_depth=500, min_samples_leaf=2,
                     min_samples_split=10, n_estimators=250, random_state=42)

In [47]:
print("Training Accuracy: ", et.score(x_train, y_train))
print("Testing Accuracy: ", et.score(x_test, y_test))

Training Accuracy:  0.9448976049982645
Testing Accuracy:  0.6637726913970008


In [48]:
y_predict = et.predict(x_test)
report = classification_report(y_predict, y_test)
print(report)

              precision    recall  f1-score   support

 barely-true       0.60      0.69      0.64       185
       false       0.76      0.69      0.72       272
   half-true       0.70      0.57      0.63       324
 mostly-true       0.63      0.61      0.62       249
  pants-fire       0.60      0.86      0.71        64
        true       0.63      0.76      0.69       173

    accuracy                           0.66      1267
   macro avg       0.65      0.70      0.67      1267
weighted avg       0.67      0.66      0.66      1267



### **Bagging Decision Tree Classifier**

Results

In [49]:
bag_dt = BaggingClassifier(
    base_estimator = DecisionTreeClassifier(max_depth=500, min_samples_split=10, min_samples_leaf=2),
    n_estimators = 250,
    random_state=42
)
bag_dt.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=500,
                                                        min_samples_leaf=2,
                                                        min_samples_split=10),
                  n_estimators=250, random_state=42)

In [50]:
print("Training Accuracy: ", bag_dt.score(x_train, y_train))
print("Testing Accuracy: ", bag_dt.score(x_test, y_test))

Training Accuracy:  0.9723186393613329
Testing Accuracy:  0.6550907655880032


In [51]:
y_predict = bag_dt.predict(x_test)
report = classification_report(y_predict, y_test)
print(report)

              precision    recall  f1-score   support

 barely-true       0.63      0.63      0.63       213
       false       0.69      0.70      0.70       244
   half-true       0.67      0.59      0.63       303
 mostly-true       0.60      0.63      0.62       229
  pants-fire       0.76      0.72      0.74        97
        true       0.63      0.72      0.67       181

    accuracy                           0.66      1267
   macro avg       0.66      0.67      0.66      1267
weighted avg       0.66      0.66      0.65      1267

