In [None]:
import numpy as np
import pandas as pd
import scipy
from sklearn.model_selection import train_test_split
import os
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet') 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
!pip install autocorrect
from autocorrect import Speller
!pip install ipython-autotime
%load_ext autotime

# Pre-processing criterion

* Remove header or tail of article if needed (**Ex**. *WASHINGTON (Reuters) -* , **Ex**. *For entire article: Chicago Tribune*) 

> **Kaggle1 True** specific header & tail
> * **WASHINGTON (Reuters) -** ...
> * ... **-- Source link: (bit.ly/2jBh4LU)**

> **Kaggle1 Fake** specific header & tail
> * **The following statements were posted to the verified Twitter accounts** ...
> * ... **For entire article: Chicago Tribune**
> * ... **Featured image via David McNew/Stringer/Getty Images**
> * ... **Photo by Christopher Furlong/Getty Images**

> **Kaggle3** specific header & tail
> * **LONDON (Reuters) -** ...
> * **Image copyright Getty Images** ...
> * ... **-- Source link: (bit.ly/2jBh4LU)**


* Remove Twitter ID (**Ex**. *@Gonzalez*), probabily one or two word infront of it (name) and 3 words after it (date) (**Ex**. *John McCain (@SenJohnMcCain) December 4, 2017*)
* Remove url (**Ex**. (*bit.ly/2jBh4LU*), (Graphic: tmsnrt.rs/@Bgq29K), (*pic.twitter.com/4FPAe2KypA*), https//t.co/zcbyc4wp5b), 
* Remove words in square bracket (**Ex**. *\[nL1N1FIOK0]*), hashtags (**Ex**. *\#NOBAnNoWall*)
* Replace none alphabetic characters (i.e. characters other than A\~Z, a\~z) by space
* Transform letters into lowercase
* Remove the stopwords (frequently used but meaningless words, **Ex**. *me, my, you, do, a, the*)
* Remove one letter words (possibly generated by error)
* Transform words into the base form (`nltk.wordnet.WordNetLemmatizer()`)
* Spell correction (`autocorrect.Speller`)
* Exclude pre-processed article with less than 5 words



In [2]:
lemma = nltk.wordnet.WordNetLemmatizer()
spell = Speller(lang='en')

def preprocess(text):
  text=str(text)
  
  # Header exclusion
  header = re.search("\(Reuters\) -|Image copyright Getty Images|The following statements[ ]{0,1}[^ ]*[ ]{0,1}were posted to the verified Twitter accounts", text)
  if header: text = text[header.end():]

  # Tail exclusion
  tail = re.search("For entire article:|Featured image via|Photo by", text)
  if tail: text = text[:tail.start()]

  # Remove Twitter ID
  text = re.sub("\@[^ ]*" ,"", text)

  # Remove url
  text = re.sub("bit.[^ ]*|Graphic: [^ ]*|pic.twitter.[^ ]*|https[^ ]*", "", text)

  # Remove words in square bracket and hashtags
  text = re.sub("\[[^ ]*\]|\#[^ ]*", "", text)

  # Replace none alphabetic characters by space
  text = re.sub("[^A-Za-z]", " ", text)

  # Transform letters into lowercase
  text = text.lower()

  # Remove the stopwords and one-letter word
  pattern = re.compile(r'\b(' + r'|'.join(stopwords.words("english") + list(string.ascii_lowercase)) + r')\b\s*')
  text = pattern.sub('', text)

  # A string to a word list for future pre-processing
  tokenized_text = word_tokenize(text)

  # Transform words into the base form and Spell correction
  tokenized_text_final = [spell(lemma.lemmatize(word)) for word in tokenized_text]

  # Exclude pre-process article with less than 5 words
  if len(tokenized_text_final) < 5: return ""
  else: return " ".join(tokenized_text_final)


### Example ###
teststr = "WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress. For entire article: Chicago Tribune"
print(teststr + "\n" + preprocess(teststr) + "\n")

teststr = "Image copyright Getty Images. The head of a conservative Republican faction in the U.S. Congress. Photo by Christopher Furlong/Getty Images"
print(teststr + "\n" + preprocess(teststr) + "\n")

teststr = "The head @Gonzalez of a conservative Republican faction in the U.S. Congress. Photo by Christopher Furlong/Getty Images"
print(teststr + "\n" + preprocess(teststr) + "\n")

teststr = "The head of a conservative Republican faction in the U.S. Congress. (bit.ly/2jBh4LU), (Graphic: tmsnrt.rs/@Bgq29K), (pic.twitter.com/4FPAe2KypA), https//t.co/zcbyc4wp5b)"
print(teststr + "\n" + preprocess(teststr) + "\n")

teststr = "Remove words in square bracket (Ex. [nL1N1FIOK0]), hashtags (Ex. #NOBAnNoWall)"
print(teststr + "\n" + preprocess(teststr) + "\n")

teststr = "Remove words in square bracket"
print(teststr + "\n" + preprocess(teststr) + "\n")


WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress. For entire article: Chicago Tribune
head conservative republican faction congress

Image copyright Getty Images. The head of a conservative Republican faction in the U.S. Congress. Photo by Christopher Furlong/Getty Images
head conservative republican faction congress

The head @Gonzalez of a conservative Republican faction in the U.S. Congress. Photo by Christopher Furlong/Getty Images
head conservative republican faction congress

The head of a conservative Republican faction in the U.S. Congress. (bit.ly/2jBh4LU), (Graphic: tmsnrt.rs/@Bgq29K), (pic.twitter.com/4FPAe2KypA), https//t.co/zcbyc4wp5b)
head conservative republican faction congress

Remove words in square bracket (Ex. [nL1N1FIOK0]), hashtags (Ex. #NOBAnNoWall)
remove word square bracket ex hashtag ex

Remove words in square bracket


time: 5.02 s (started: 2022-04-08 19:05:28 +00:00)


# Kaggle 1

https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset

In [None]:
Kaggle1_fake = pd.read_csv('K1_Fake.csv')
Kaggle1_true = pd.read_csv('K1_True.csv')

time: 933 ms (started: 2021-10-14 02:53:07 +00:00)


In [None]:
Kaggle1_fake_df = Kaggle1_fake.assign(label = 0)
Kaggle1_fake_df = Kaggle1_fake_df[['text', 'label']]
# Kaggle1_fake_df = Kaggle1_fake_df.assign(text = lambda dataframe: dataframe['text'].map(preprocess))
Kaggle1_fake_df

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
23476,21st Century Wire says As 21WIRE reported earl...,0
23477,21st Century Wire says It s a familiar theme. ...,0
23478,Patrick Henningsen 21st Century WireRemember ...,0
23479,21st Century Wire says Al Jazeera America will...,0


time: 27.5 ms (started: 2021-10-14 02:53:21 +00:00)


In [None]:
Kaggle1_true_df = Kaggle1_true.assign(label = 1)
Kaggle1_true_df = Kaggle1_true_df[['text', 'label']]
# Kaggle1_true_df = Kaggle1_true_df.assign(text = lambda dataframe: dataframe['text'].map(preprocess))
Kaggle1_true_df

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1
...,...,...
21412,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
21413,"LONDON (Reuters) - LexisNexis, a provider of l...",1
21414,MINSK (Reuters) - In the shadow of disused Sov...,1
21415,MOSCOW (Reuters) - Vatican Secretary of State ...,1


time: 24.7 ms (started: 2021-10-14 02:53:47 +00:00)


In [None]:
# Check with just 10 articles

Kaggle1_true_df = Kaggle1_true.assign(label = 1)
Kaggle1_true_df = Kaggle1_true_df[['text', 'label']]
Kaggle1_true_df1 = Kaggle1_true_df.iloc[0:10,]
Kaggle1_true_df1 = Kaggle1_true_df1.assign(text = lambda dataframe: dataframe['text'].map(preprocess))
Kaggle1_true_df1

Unnamed: 0,text,label
0,head conservative republican faction congress ...,1
1,transgender people allowed first time enlist m...,1
2,special counsel investigation link russia pres...,1
3,trump campaign adviser george papadopoulos tol...,1
4,president donald trump called postal service f...,1
5,white house said friday set kick talk next wee...,1
6,president donald trump said thursday belief fa...,1
7,president donald trump opinion expressed reute...,1
8,president donald trump opinion expressed reute...,1
9,alabama secretary state john merriam said cert...,1


time: 4.56 s (started: 2021-10-14 03:15:30 +00:00)


In [None]:
print("Raw: " + Kaggle1_true_df.iloc[8,0] + "\n" + "Processed: " + Kaggle1_true_df1.iloc[8,0])

Raw: The following statements were posted to the verified Twitter accounts of U.S. President Donald Trump, @realDonaldTrump and @POTUS.  The opinions expressed are his own. Reuters has not edited the statements or confirmed their accuracy.  @realDonaldTrump : - Together, we are MAKING AMERICA GREAT AGAIN! bit.ly/2lnpKaq [1814 EST] - In the East, it could be the COLDEST New Year’s Eve on record. Perhaps we could use a little bit of that good old Global Warming that our Country, but not other countries, was going to pay TRILLIONS OF DOLLARS to protect against. Bundle up! [1901 EST] -- Source link: (bit.ly/2jBh4LU) (bit.ly/2jpEXYR) 
Processed: president donald trump opinion expressed reuters edited statement confirmed accuracy together making america great est east could coldest new year eve record perhaps could use little good old global warming country country going pay trillion dollar protect bundle est source link


In [None]:
Kaggle1_true_df.iloc[9,0]

'WASHINGTON (Reuters) - Alabama Secretary of State John Merrill said he will certify Democratic Senator-elect Doug Jones as winner on Thursday despite opponent Roy Moore’s challenge, in a phone call on CNN. Moore, a conservative who had faced allegations of groping teenage girls when he was in his 30s, filed a court challenge late on Wednesday to the outcome of a U.S. Senate election he unexpectedly lost. '

time: 8.08 ms (started: 2021-10-14 03:19:59 +00:00)


In [None]:
Kaggle1_true_df1.iloc[9,0]

'alabama secretary state john merriam said certify democratic senator elect doug jones winner thursday despite opponent roy moore challenge phone call cnn moore conservative faced allegation growing teenage girl filed court challenge late wednesday outcome senate election unexpectedly lost'

time: 5.2 ms (started: 2021-10-14 03:20:06 +00:00)


In [None]:
Example_article = '''
WASHINGTON (Reuters) - Alabama Secretary of State John Merrill (@johnmerrill) 
saids he will certify Democratic Senator-elect Doug Jones as winner on Thursday despite 
opponent Roy Moore’s challenge [EST 1872], in a phone call on CNN. Moore, a conservative 
who had faced allegations of groping teenage girls when he was in his 30s, filed 
a court challenge late on Wednesday to the outcome of a U.S. Senate election 
he unexpectedly lost (bit.ly/2jBh4LU). For entire article: Chicago Tribune
'''

time: 1.8 ms (started: 2021-10-14 03:21:59 +00:00)


In [None]:
Example_article


WASHINGTON (Reuters) - Alabama Secretary of State John Merrill (@johnmerrill) 
saids he will certify Democratic Senator-elect Doug Jones as winner on Thursday despite 
opponent Roy Moore’s challenge [EST 1872], in a phone call on CNN. Moore, a conservative 
who had faced allegations of groping teenage girls when he was in his 30s, filed 
a court challenge late on Wednesday to the outcome of a U.S. Senate election 
he unexpectedly lost (bit.ly/2jBh4LU). For entire article: Chicago Tribune

time: 1.64 ms (started: 2021-10-14 03:22:41 +00:00)


In [None]:
print(preprocess(Example_article))

alabama secretary state john merriam said certify democratic senator elect doug jones winner thursday despite opponent roy moore challenge est phone call cnn moore conservative faced allegation growing teenage girl filed court challenge late wednesday outcome senate election unexpectedly lost
time: 80.7 ms (started: 2021-10-14 03:22:51 +00:00)


# Kaggle 2

https://www.kaggle.com/ksaivenketpatro/fake-news-detection-dataset

In [None]:
Kaggle2 = pd.read_csv('K2.csv')
Kaggle2

Unnamed: 0,Statement,Label
0,Says the Annies List political group supports ...,False
1,When did the decline of coal start? It started...,True
2,"Hillary Clinton agrees with John McCain ""by vo...",True
3,Health care reform legislation is likely to ma...,False
4,The economic turnaround started at the end of ...,True
...,...,...
10235,There are a larger number of shark attacks in ...,True
10236,Democrats have now become the party of the [At...,True
10237,Says an alternative to Social Security that op...,True
10238,On lifting the U.S. Cuban embargo and allowing...,False


time: 35.4 ms (started: 2021-10-14 02:57:11 +00:00)


In [None]:
Kaggle2_df = Kaggle2.rename({'Statement': 'text', 'Label': 'label'}, axis=1)
Kaggle2_df

Unnamed: 0,text,label
0,Says the Annies List political group supports ...,False
1,When did the decline of coal start? It started...,True
2,"Hillary Clinton agrees with John McCain ""by vo...",True
3,Health care reform legislation is likely to ma...,False
4,The economic turnaround started at the end of ...,True
...,...,...
10235,There are a larger number of shark attacks in ...,True
10236,Democrats have now become the party of the [At...,True
10237,Says an alternative to Social Security that op...,True
10238,On lifting the U.S. Cuban embargo and allowing...,False


time: 23.7 ms (started: 2021-10-14 02:57:16 +00:00)


In [None]:
def truefalse_to_01(label):
  if label is True:
    return 1
  if label is False:
    return 0

Kaggle2_df = Kaggle2.rename({'Statement': 'text', 'Label': 'label'}, axis=1)
Kaggle2_df = Kaggle2_df.assign(label = lambda dataframe: dataframe['label'].map(truefalse_to_01))
Kaggle2_df

Unnamed: 0,text,label
0,Says the Annies List political group supports ...,0
1,When did the decline of coal start? It started...,1
2,"Hillary Clinton agrees with John McCain ""by vo...",1
3,Health care reform legislation is likely to ma...,0
4,The economic turnaround started at the end of ...,1
...,...,...
10235,There are a larger number of shark attacks in ...,1
10236,Democrats have now become the party of the [At...,1
10237,Says an alternative to Social Security that op...,1
10238,On lifting the U.S. Cuban embargo and allowing...,0


time: 33.1 ms (started: 2021-10-14 02:57:17 +00:00)


In [None]:
Kaggle2_df.to_csv('K2.csv', sep='\t', index = False)

# Kaggle 3

https://www.kaggle.com/jruvika/fake-news-detection?select=data.csv

In [None]:
Kaggle3 = pd.read_csv('K3.csv')
Kaggle3

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1
...,...,...,...,...
4004,http://beforeitsnews.com/sports/2017/09/trends...,Trends to Watch,Trends to Watch\n% of readers think this story...,0
4005,http://beforeitsnews.com/u-s-politics/2017/10/...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,0
4006,https://www.activistpost.com/2017/09/ron-paul-...,"Ron Paul on Trump, Anarchism & the AltRight",,0
4007,https://www.reuters.com/article/us-china-pharm...,China to accept overseas trial data in bid to ...,SHANGHAI (Reuters) - China said it plans to ac...,1


time: 118 ms (started: 2021-10-14 03:02:34 +00:00)


In [None]:
Kaggle3_df = Kaggle3[['Body', 'Label']]
Kaggle3_df = Kaggle3_df.rename({'Body': 'text', 'Label': 'label'}, axis=1)
Kaggle3_df = Kaggle3_df.assign(text = lambda dataframe: dataframe['text'].map(deleteheader))

Kaggle3_df

Unnamed: 0,text,label
0,Image copyright Getty Images\nOn Sunday mornin...,1
1,"“Last Flag Flying”, a comedy-drama about Vietn...",1
2,The feud broke into public view last week when...,1
3,Egypt’s Cheiron Holdings Limited won the right...,1
4,"Country singer Jason Aldean, who was performin...",1
...,...,...
4004,Trends to Watch\n% of readers think this story...,0
4005,Trump Jr. Is Soon To Give A 30-Minute Speech F...,0
4006,,0
4007,China said it plans to accept data from overse...,1


In [None]:
Kaggle3_df.to_csv('Kaggle3.csv', sep='\t', index = False)

# LIAR 

In [None]:
uri_train  = 'https://raw.githubusercontent.com/thiagorainmaker77/liar_dataset/master/train.tsv'
uri_valid  = 'https://raw.githubusercontent.com/thiagorainmaker77/liar_dataset/master/valid.tsv'
uri_test  = 'https://raw.githubusercontent.com/thiagorainmaker77/liar_dataset/master/test.tsv'

    
df_train = pd.read_table(uri_train,
                             names = ['id',	'label'	,'statement',	'subject',	'speaker', 	'job', 	'state',	'party',	'barely_true_c',	'false_c',	'half_true_c',	'mostly_true_c',	'pants_on_fire_c',	'venue'])

    
df_valid = pd.read_table(uri_valid,
                             names =['id',	'label'	,'statement',	'subject',	'speaker', 	'job', 	'state',	'party',	'barely_true_c',	'false_c',	'half_true_c',	'mostly_true_c',	'pants_on_fire_c',	'venue'])


df_test = pd.read_csv(uri_test, sep='\t', 
                            names =['id',	'label'	,'statement',	'subject',	'speaker', 	'job', 	'state',	'party',	'barely_true_c',	'false_c',	'half_true_c',	'mostly_true_c',	'pants_on_fire_c',	'venue']) 


df = pd.concat([df_train, df_valid, df_test])

time: 744 ms (started: 2021-10-14 03:03:51 +00:00)


In [None]:
def switchfunc(label):
  if label in ["true", "mostly-true"]:
    return 1
  if label in ["pants-fire", "false"]:
    return 0
  return -1

time: 2.78 ms (started: 2021-10-14 03:03:54 +00:00)


In [None]:
df_LIAR = df[['statement', 'label']]
df_LIAR = df_LIAR.assign(label = lambda dataframe: dataframe['label'].map(switchfunc))
df_LIAR = df_LIAR[df_LIAR['label'] != -1]
df_LIAR = df_LIAR.rename({'statement': 'text'}, axis=1)
df_LIAR

Unnamed: 0,text,label
0,Says the Annies List political group supports ...,0
2,"Hillary Clinton agrees with John McCain ""by vo...",1
3,Health care reform legislation is likely to ma...,0
5,The Chicago Bears have had more starting quart...,1
9,Says GOP primary opponents Glenn Grothman and ...,1
...,...,...
1256,Says Chris Christies plan to kick-start our ec...,0
1257,Obama used $20 million in federal money to emm...,0
1260,I think its seven or eight of the California s...,0
1261,Sen. Bob Menendez voted to enact a new tax on ...,0


time: 53.5 ms (started: 2021-10-14 03:03:56 +00:00)


In [None]:
df_LIAR[df_LIAR['label'] == 0]

Unnamed: 0,text,label
0,Says the Annies List political group supports ...,0
3,Health care reform legislation is likely to ma...,0
12,When Mitt Romney was governor of Massachusetts...,0
20,Women and men both are making less when you ad...,0
25,I dont know who (Jonathan Gruber) is.,0
...,...,...
1256,Says Chris Christies plan to kick-start our ec...,0
1257,Obama used $20 million in federal money to emm...,0
1260,I think its seven or eight of the California s...,0
1261,Sen. Bob Menendez voted to enact a new tax on ...,0


time: 24.9 ms (started: 2021-10-14 03:06:17 +00:00)


###Complete Pre-processing using `preprocess` function.  
### Due to time and memorey issues, it was done on the Longleaf computing cluster. 

# ==================== After pre-processing ===================
## Done in Longleaf cluster

In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from numpy import array
from scipy.sparse import csr_matrix
import random

# Load preprocessed dataset
with open('Kaggle1_fake_processed.p', 'rb') as Kaggle1_fake: 
    Kaggle1_fake_processed = pickle.load(Kaggle1_fake)

with open('Kaggle1_true_processed.p', 'rb') as Kaggle1_true: 
    Kaggle1_true_processed = pickle.load(Kaggle1_true)

with open('Kaggle2_processed.p', 'rb') as Kaggle2: 
    Kaggle2_processed = pickle.load(Kaggle2)

with open('Kaggle3_processed.p', 'rb') as Kaggle3: 
    Kaggle3_processed = pickle.load(Kaggle3)

with open('LIAR_processed.p', 'rb') as liar: 
    LIAR_processed = pickle.load(liar)

# Dimensions of dataset
K1_fake_dim = Kaggle1_fake_processed.shape[0]
K1_true_dim = Kaggle1_true_processed.shape[0]
K2_dim = Kaggle2_processed.shape[0]
K3_dim = Kaggle3_processed.shape[0]
L_dim = LIAR_processed.shape[0]

# Generate training and testing index for Kaggle1_fake and Kaggle1_true
random.seed(1)
train_ratio = 0.6

indices_train_fake = random.sample(range(K1_fake_dim), round(K1_fake_dim * train_ratio))
indices_train_fake.sort()
indices_test_fake = list(set(range(K1_fake_dim)) - set(indices_train_fake))
indices_test_fake.sort()
print(K1_fake_dim, len(indices_train_fake), len(indices_test_fake), len(indices_train_fake) + len(indices_test_fake))

indices_train_true = random.sample(range(K1_true_dim), round(K1_true_dim * train_ratio))
indices_train_true.sort()
indices_test_true = list(set(range(K1_true_dim)) - set(indices_train_true))
indices_test_true.sort()
print(K1_true_dim, len(indices_train_true), len(indices_test_true), len(indices_train_true) + len(indices_test_true))

# ======================================= #
#    1. Document features (RNN, BERT)     #
# ======================================= #

# Training-Testing split on Kaggle1 data
train_fake = Kaggle1_fake_processed.iloc[indices_train_fake,:]
test_fake  = Kaggle1_fake_processed.iloc[indices_test_fake, :]

train_true = Kaggle1_true_processed.iloc[indices_train_true,:]
test_true  = Kaggle1_true_processed.iloc[indices_test_true, :]

Kaggle1_processed_train = pd.concat((train_fake, train_true))
Kaggle1_processed_test  = pd.concat((test_fake,  test_true ))

# Save as pickle object
with open('./document/train_document.p', 'wb') as file: 
    pickle.dump(Kaggle1_processed_train['text'], file)
    pickle.dump(Kaggle1_processed_train['label'], file)

with open('./document/test_K1_document.p',  'wb') as file: 
    pickle.dump(Kaggle1_processed_test['text'],  file)
    pickle.dump(Kaggle1_processed_test['label'],  file)

with open('./document/test_K2_document.p', 'wb') as file: 
    pickle.dump(Kaggle2_processed['text'], file)
    pickle.dump(Kaggle2_processed['label'], file)

with open('./document/test_K3_document.p', 'wb') as file: 
    pickle.dump(Kaggle3_processed['text'], file)
    pickle.dump(Kaggle3_processed['label'], file)

with open('./document/test_L_document.p', 'wb') as file: 
    pickle.dump(LIAR_processed['text'], file)
    pickle.dump(LIAR_processed['label'], file)

'''
# To load pickle object
with open('train_document.p', 'rb') as file: 
    X_train = pickle.load(file)
    y_train = pickle.load(file)

with open('test_K1_document.p',  'rb') as file: 
    X_test_K1 = pickle.load(file)
    y_test_K1 = pickle.load(file)

with open('test_K2_document.p', 'rb') as file: 
    X_test_K2 = pickle.load(file)
    y_test_K2 = pickle.load(file)

with open('test_K3_document.p', 'rb') as file: 
    X_test_K3 = pickle.load(file)
    y_test_K3 = pickle.load(file)

with open('test_L_document.p', 'rb') as file: 
    X_test_L = pickle.load(file)
    y_test_L = pickle.load(file)
'''


# ======================================= #
#    2. BOW features (NB, LR, SVM, RF)    #
# ======================================= #

# Combining pre-processed data frame
combined_df = pd.concat([Kaggle1_processed_train, Kaggle1_processed_test, Kaggle2_processed, Kaggle3_processed, LIAR_processed], ignore_index=True)

# Combined data frame to BOW matrix 
vectorizer = CountVectorizer()
combined_BOW_matrix = vectorizer.fit_transform(combined_df.iloc[:,0]).toarray()

# convert to sparse matrix (CSR method)
combined_BOW_matrix = csr_matrix(combined_BOW_matrix)

# words list (column of BOW matrix)
combined_vocab = vectorizer.vocabulary_
worddict = dict(sorted(combined_vocab.items(), key=lambda item: item[1]))
wordlist = np.asarray(list(worddict.keys()))

# Include top 1000 frequent words only
colsum = np.ravel(np.asarray(np.sum(combined_BOW_matrix, axis = 0)))
top1000index = np.argsort(colsum)[-1000:]
BOW_matrix  = combined_BOW_matrix[:,top1000index]
wordlist = wordlist[top1000index]

# Divide BOW matrix into dataset specific matrices
# Dimensions of dataset (including Kaggle1 train and test data)
K1_train_dim = Kaggle1_processed_train.shape[0]
K1_test_dim = Kaggle1_processed_test.shape[0]
K2_dim = Kaggle2_processed.shape[0]
K3_dim = Kaggle3_processed.shape[0]
L_dim = LIAR_processed.shape[0]

index = np.cumsum([K1_train_dim, K1_test_dim, K2_dim, K3_dim, L_dim])

K1_train_BOW_mat = BOW_matrix[:index[0],]
K1_test_BOW_mat = BOW_matrix[index[0]:index[1],]
K2_BOW_mat = BOW_matrix[index[1]:index[2],]
K3_BOW_mat = BOW_matrix[index[2]:index[3],]
L_BOW_mat = BOW_matrix[index[3]:index[4],]

# Save as pickle object
with open('./bow/train_BOW.p', 'wb') as file:
    pickle.dump(K1_train_BOW_mat, file)
    pickle.dump(Kaggle1_processed_train['label'], file)
    pickle.dump(wordlist, file)

with open('./bow/test_K1_BOW.p', 'wb') as file:
    pickle.dump(K1_test_BOW_mat, file)
    pickle.dump(Kaggle1_processed_test['label'], file)
    pickle.dump(wordlist, file)

with open('./bow/test_K2_BOW.p', 'wb') as file:
    pickle.dump(K2_BOW_mat, file)
    pickle.dump(Kaggle2_processed['label'], file)
    pickle.dump(wordlist, file)

with open('./bow/test_K3_BOW.p', 'wb') as file:
    pickle.dump(K3_BOW_mat, file)
    pickle.dump(Kaggle3_processed['label'], file)
    pickle.dump(wordlist, file)

with open('./bow/test_L_BOW.p', 'wb') as file:
    pickle.dump(L_BOW_mat, file)
    pickle.dump(LIAR_processed['label'], file)
    pickle.dump(wordlist, file)
    
'''
# To load objects

with open('train_BOW.p', 'rb') as file:
    X_train = pickle.load(file)
    y_train = pickle.load(file)
    wordlist = pickle.load(file)

with open('test_K1_BOW.p', 'rb') as file:
    X_test_K1 = pickle.load(file)
    y_test_K1 = pickle.load(file)
    wordlist  = pickle.load(file)

with open('test_K2_BOW.p', 'rb') as file:
    X_test_K2 = pickle.load(file)
    y_test_K2 = pickle.load(file)
    wordlist  = pickle.load(file)

with open('test_K3_BOW.p', 'rb') as file:
    X_test_K3 = pickle.load(file)
    y_test_K3 = pickle.load(file)
    wordlist  = pickle.load(file)

with open('test_L_BOW.p', 'rb') as file:
    X_test_L = pickle.load(file)
    y_test_L = pickle.load(file)
    wordlist = pickle.load(file)
'''

# ======================================= #
#    3. TF-IDF features (NB, LR, SVM, RF) #
# ======================================= #

# TF_IDF matrix from BOW matrix
TF_IDF_matrix = BOW_matrix.todense()
TF_IDF_matrix = TF_IDF_matrix / TF_IDF_matrix.sum(axis = 1)

# convert to sparse matrix (CSR method)
TF_IDF_matrix = csr_matrix(TF_IDF_matrix)

# Divide TF-IDF matrix into dataset specific matrices
# Dimensions of dataset (including Kaggle1 train and test data)
K1_train_dim = Kaggle1_processed_train.shape[0]
K1_test_dim = Kaggle1_processed_test.shape[0]
K2_dim = Kaggle2_processed.shape[0]
K3_dim = Kaggle3_processed.shape[0]
L_dim = LIAR_processed.shape[0]

index = np.cumsum([K1_train_dim, K1_test_dim, K2_dim, K3_dim, L_dim])

K1_train_TF_IDF_mat = TF_IDF_matrix[:index[0],]
K1_test_TF_IDF_mat = TF_IDF_matrix[index[0]:index[1],]
K2_TF_IDF_mat = TF_IDF_matrix[index[1]:index[2],]
K3_TF_IDF_mat = TF_IDF_matrix[index[2]:index[3],]
L_TF_IDF_mat = TF_IDF_matrix[index[3]:index[4],]

# Save as pickle object
with open('./tfidf/train_TF_IDF.p', 'wb') as file:
    pickle.dump(K1_train_TF_IDF_mat, file)
    pickle.dump(Kaggle1_processed_train['label'], file)
    pickle.dump(wordlist, file)

with open('./tfidf/test_K1_TF_IDF.p', 'wb') as file:
    pickle.dump(K1_test_TF_IDF_mat, file)
    pickle.dump(Kaggle1_processed_test['label'], file)
    pickle.dump(wordlist, file)

with open('./tfidf/test_K2_TF_IDF.p', 'wb') as file:
    pickle.dump(K2_TF_IDF_mat, file)
    pickle.dump(Kaggle2_processed['label'], file)
    pickle.dump(wordlist, file)

with open('./tfidf/test_K3_TF_IDF.p', 'wb') as file:
    pickle.dump(K3_TF_IDF_mat, file)
    pickle.dump(Kaggle3_processed['label'], file)
    pickle.dump(wordlist, file)

with open('./tfidf/test_L_TF_IDF.p', 'wb') as file:
    pickle.dump(L_TF_IDF_mat, file)
    pickle.dump(LIAR_processed['label'], file)
    pickle.dump(wordlist, file)
    
'''
# To load objects

with open('train_TF_IDF.p', 'rb') as file:
    X_train = pickle.load(file)
    y_train = pickle.load(file)
    wordlist = pickle.load(file)

with open('test_K1_TF_IDF.p', 'rb') as file:
    X_test_K1 = pickle.load(file)
    y_test_K1 = pickle.load(file)
    wordlist  = pickle.load(file)

with open('test_K2_TF_IDF.p', 'rb') as file:
    X_test_K2 = pickle.load(file)
    y_test_K2 = pickle.load(file)
    wordlist  = pickle.load(file)

with open('test_K3_TF_IDF.p', 'rb') as file:
    X_test_K3 = pickle.load(file)
    y_test_K3 = pickle.load(file)
    wordlist  = pickle.load(file)

with open('test_L_TF_IDF.p', 'rb') as file:
    X_test_L = pickle.load(file)
    y_test_L = pickle.load(file)
    wordlist = pickle.load(file)
'''



# ======================================= #
#    4. Bigram features (NB, LR, SVM, RF) #
# ======================================= #

# Combining pre-processed data frame
combined_df = pd.concat([Kaggle1_processed_train, Kaggle1_processed_test, Kaggle2_processed, Kaggle3_processed, LIAR_processed], ignore_index=True)

# Combined data frame to BOW matrix 
vectorizer = CountVectorizer(ngram_range=(2, 2), min_df = 0.001)
combined_BIGRAM_matrix = csr_matrix(vectorizer.fit_transform(combined_df.iloc[:,0]).toarray())
# convert to sparse matrix (CSR method)

# words list (column of BOW matrix)
combined_vocab = vectorizer.vocabulary_
worddict = dict(sorted(combined_vocab.items(), key=lambda item: item[1]))
wordlist = np.asarray(list(worddict.keys()))

# Include top 1000 frequent words only
colsum = np.ravel(np.asarray(np.sum(combined_BIGRAM_matrix, axis = 0)))
top1000index = np.argsort(colsum)[-1000:]
BIGRAM_matrix  = combined_BIGRAM_matrix[:,top1000index]
wordlist = wordlist[top1000index]

# Divide BOW matrix into dataset specific matrices
# Dimensions of dataset (including Kaggle1 train and test data)
K1_train_dim = Kaggle1_processed_train.shape[0]
K1_test_dim = Kaggle1_processed_test.shape[0]
K2_dim = Kaggle2_processed.shape[0]
K3_dim = Kaggle3_processed.shape[0]
L_dim = LIAR_processed.shape[0]

index = np.cumsum([K1_train_dim, K1_test_dim, K2_dim, K3_dim, L_dim])

K1_train_BIGRAM_mat = BIGRAM_matrix[:index[0],]
K1_test_BIGRAM_mat = BIGRAM_matrix[index[0]:index[1],]
K2_BIGRAM_mat = BIGRAM_matrix[index[1]:index[2],]
K3_BIGRAM_mat = BIGRAM_matrix[index[2]:index[3],]
L_BIGRAM_mat = BIGRAM_matrix[index[3]:index[4],]

# Save as pickle object
with open('./bigram/train_BIGRAM.p', 'wb') as file:
    pickle.dump(K1_train_BIGRAM_mat, file)
    pickle.dump(Kaggle1_processed_train['label'], file)
    pickle.dump(wordlist, file)

with open('./bigram/test_K1_BIGRAM.p', 'wb') as file:
    pickle.dump(K1_test_BIGRAM_mat, file)
    pickle.dump(Kaggle1_processed_test['label'], file)
    pickle.dump(wordlist, file)

with open('./bigram/test_K2_BIGRAM.p', 'wb') as file:
    pickle.dump(K2_BIGRAM_mat, file)
    pickle.dump(Kaggle2_processed['label'], file)
    pickle.dump(wordlist, file)

with open('./bigram/test_K3_BIGRAM.p', 'wb') as file:
    pickle.dump(K3_BIGRAM_mat, file)
    pickle.dump(Kaggle3_processed['label'], file)
    pickle.dump(wordlist, file)

with open('./bigram/test_L_BIGRAM.p', 'wb') as file:
    pickle.dump(L_BIGRAM_mat, file)
    pickle.dump(LIAR_processed['label'], file)
    pickle.dump(wordlist, file)
    
'''
# To load objects

with open('train_BIGRAM.p', 'rb') as file:
    X_train = pickle.load(file)
    y_train = pickle.load(file)
    wordlist = pickle.load(file)

with open('test_K1_BIGRAM.p', 'rb') as file:
    X_test_K1 = pickle.load(file)
    y_test_K1 = pickle.load(file)
    wordlist  = pickle.load(file)

with open('test_K2_BIGRAM.p', 'rb') as file:
    X_test_K2 = pickle.load(file)
    y_test_K2 = pickle.load(file)
    wordlist  = pickle.load(file)

with open('test_K3_BIGRAM.p', 'rb') as file:
    X_test_K3 = pickle.load(file)
    y_test_K3 = pickle.load(file)
    wordlist  = pickle.load(file)

with open('test_L_BIGRAM.p', 'rb') as file:
    X_test_L = pickle.load(file)
    y_test_L = pickle.load(file)
    wordlist = pickle.load(file)
'''