## **Mounting Google Drive**

In [None]:
from google.colab import drive

## Mount Google Drive into the Colab filesystem; the file paths used by the
## cells in this notebook all start with this directory.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Import Packages**

In [None]:
import numpy as np
import pandas as pd

### *LIAR*

In [None]:
## Path of Dataset
trainFile = '/content/drive/MyDrive/BTP/Dataset/LIAR/train.tsv'
testFile = '/content/drive/MyDrive/BTP/Dataset/LIAR/test.tsv'
valFile = '/content/drive/MyDrive/BTP/Dataset/LIAR/valid.tsv'

## Name of headers
## These names are passed to read_csv, so the first line of every file is read
## as data rather than as a header row.
headers = ["json ID", "label", "statement", "subject", "speaker", "job title", "state", "party", 
           "barely true", "false", "half true", "mostly true", "pants on fire", "los"]

## Loading Data (all three splits share the same tab-separated layout)
read_options = dict(delimiter='\t', encoding='utf-8', names=headers)
traindata = pd.read_csv(trainFile, **read_options)
testdata = pd.read_csv(testFile, **read_options)
valdata = pd.read_csv(valFile, **read_options)

## Concatenation
## ignore_index=True gives the combined frame one unique 0..n-1 index; without
## it every split keeps its own row numbers and the same label appears up to
## three times, which breaks any later index-based selection or drop.
dataset1 = pd.concat([traindata, testdata, valdata], ignore_index=True)

## Labels encoding: the six LIAR verdicts are collapsed into 1 (true side)
## and 0 (false side)
label_dic = {
            "true" : 1,
            "mostly-true" : 1,
            "half-true" : 1,
            "barely-true" : 0,
            "false" : 0,
            "pants-fire" : 0
        }

## Binary conversion
## Series.map turns every value missing from label_dic into NaN, which would
## silently make the column float and write empty labels to the CSV. Fail
## loudly instead so an unexpected verdict is noticed here.
binary_label = dataset1['label'].map(label_dic)
if binary_label.isna().any():
    unmapped = sorted(dataset1.loc[binary_label.isna(), 'label'].astype(str).unique())
    raise ValueError(f"LIAR: labels without a binary mapping: {unmapped}")
dataset1['label'] = binary_label.astype('int64')

## Removing Unnecessary Columns (only 'label' and 'statement' are kept)
removed_col = ['json ID', 'barely true', 'false', 'half true', 'mostly true', 'pants on fire', 'los', 
               'party', 'state', 'subject', 'job title', 'speaker']
dataset1 = dataset1.drop(columns=removed_col)

## Saving 
dataset1.to_csv("/content/drive/MyDrive/BTP/Files/LIAR_Binary.csv", encoding='utf-8', index=False)

dataset1.head()

Unnamed: 0,label,statement
0,0,Says the Annies List political group supports ...
1,1,When did the decline of coal start? It started...
2,1,"Hillary Clinton agrees with John McCain ""by vo..."
3,0,Health care reform legislation is likely to ma...
4,1,The economic turnaround started at the end of ...


In [None]:
## Summary statistics of dataset1; with describe()'s defaults only the numeric
## 'label' column is summarised, and since it is 0/1 its mean is the share of
## rows labelled 1.
dataset1.describe()

Unnamed: 0,label
count,12791.0
mean,0.557736
std,0.496675
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


### *ISOT*

In [None]:
## Path of Dataset
fakeFile = '/content/drive/MyDrive/BTP/Dataset/ISOT/Fake.csv'
trueFile = '/content/drive/MyDrive/BTP/Dataset/ISOT/True.csv'

## Loading Data
fakedata = pd.read_csv(fakeFile, encoding='utf-8')
truedata = pd.read_csv(trueFile, encoding='utf-8')

## Adding Labels
## The class is determined by the source file, so the binary label is assigned
## directly (0 = fake, 1 = true) instead of going through string labels and a
## lookup dictionary.
fakedata['label'] = 0
truedata['label'] = 1

## Concatenation, renaming and column selection
## - ignore_index=True gives the combined frame a unique 0..n-1 index; without
##   it the row numbers of Fake.csv and True.csv overlap.
## - 'text' is renamed to 'statement' to match the other datasets.
## - Selecting ['label', 'statement'] both discards every other column and
##   fixes the column order, so no separate drop of the unused columns is needed.
dataset2 = (
    pd.concat([fakedata, truedata], ignore_index=True)
    .rename(columns={'text': 'statement'})
    .loc[:, ['label', 'statement']]
)

## Saving
dataset2.to_csv("/content/drive/MyDrive/BTP/Files/ISOT_Binary.csv", encoding='utf-8', index=False)

dataset2.head()

Unnamed: 0,label,statement
0,0,Donald Trump just couldn t wish all Americans ...
1,0,House Intelligence Committee Chairman Devin Nu...
2,0,"On Friday, it was revealed that former Milwauk..."
3,0,"On Christmas day, Donald Trump announced that ..."
4,0,Pope Francis used his annual Christmas Day mes...


In [None]:
## Summary statistics of dataset2; with describe()'s defaults only the numeric
## 'label' column is summarised, and since it is 0/1 its mean is the share of
## rows labelled 1.
dataset2.describe()

Unnamed: 0,label
count,44898.0
mean,0.477015
std,0.499477
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


### *POLITIFACT*

In [None]:
## Path of Dataset
File = '/content/drive/MyDrive/BTP/Dataset/POLITIFACT/politifact.csv'

## Loading Data
dataset3 = pd.read_csv(File, encoding='utf-8')

## Dropping Unnecessary Values
## Rows with these three 'fact' values are removed; they have no entry in the
## binary label mapping below. One boolean mask replaces three successive
## in-place index drops.
excluded_facts = ['half-flip', 'no-flip', 'full-flop']
dataset3 = dataset3[~dataset3['fact'].isin(excluded_facts)]

## Renaming Columns to the names shared with the other datasets
dataset3 = dataset3.rename(columns={
    'fact': 'label',
    'curators_article_title': 'title',
    'curator_complete_article': 'statement',
})

## Labels encoding: the six verdicts are collapsed into 1 (true side) and
## 0 (false side)
label_dic = {
            "true" : 1,
            "mostly-true" : 1,
            "half-true" : 1,
            "barely-true" : 0,
            "false" : 0,
            "pants-fire" : 0
        }

## Removing Unnecessary Columns
removed_col = ['sources', 'sources_dates', 'sources_post_location', 'sources_quote', 'curator_name', 'curated_date', 
               'sources_url', 'curator_tags', 'Unnamed: 0', 'title']
dataset3 = dataset3.drop(columns=removed_col)

## Removing Nan Values
dataset3 = dataset3[dataset3['statement'].notna()]

## Binary conversion
## Series.map turns every value missing from label_dic into NaN, which would
## silently make the column float and write empty labels to the CSV. The check
## runs after the NaN-statement filter so that only rows that are actually kept
## can trigger it.
binary_label = dataset3['label'].map(label_dic)
if binary_label.isna().any():
    unmapped = sorted(dataset3.loc[binary_label.isna(), 'label'].astype(str).unique())
    raise ValueError(f"POLITIFACT: labels without a binary mapping: {unmapped}")

## reset_index leaves a gap-free 0..n-1 index after the row filtering above
dataset3 = dataset3.assign(label=binary_label.astype('int64')).reset_index(drop=True)

## Saving
dataset3.to_csv("/content/drive/MyDrive/BTP/Files/POLITIFACT_Binary.csv", encoding='utf-8', index=False)

dataset3.head()

Unnamed: 0,label,statement
0,0,\nBack in July a Japanese amusement park drew ...
1,0,\nA recent Facebook post uses a video clip of ...
2,1,"\n""If you look at the average teacher pay comp..."
3,0,\nA popular TikTok video said that if you want...
4,1,\nAfter failing to stop the $1.9 trillion COVI...


In [None]:
## Summary statistics of dataset3; with describe()'s defaults only the numeric
## 'label' column is summarised, and since it is 0/1 its mean is the share of
## rows labelled 1.
dataset3.describe()

Unnamed: 0,label
count,19105.0
mean,0.47307
std,0.499287
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [None]:
## Stack the three binary datasets into a single frame. ignore_index=True
## renumbers the rows 0..n-1 so the index of the combined frame is unique.
frames = [dataset1, dataset2, dataset3]
dataset = pd.concat(frames, ignore_index=True)

## Saving (the index is not written to the file)
dataset.to_csv(
    "/content/drive/MyDrive/BTP/Files/dataset.csv",
    encoding='utf-8',
    index=False,
)

dataset.head()

Unnamed: 0,label,statement
0,0,Says the Annies List political group supports ...
1,1,When did the decline of coal start? It started...
2,1,"Hillary Clinton agrees with John McCain ""by vo..."
3,0,Health care reform legislation is likely to ma...
4,1,The economic turnaround started at the end of ...


In [None]:
## Summary statistics of the combined dataset; with describe()'s defaults only
## the numeric 'label' column is summarised, and since it is 0/1 its mean is
## the share of rows labelled 1.
dataset.describe()

Unnamed: 0,label
count,76794.0
mean,0.489478
std,0.499893
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [None]:
## Sanity check: number of missing values per column of the combined dataset
dataset.isna().sum()

label        0
statement    0
dtype: int64