In [1]:
#Libraries 
import pandas as pd
from sklearn.model_selection import train_test_split
import string 
import re
import nltk
from nltk.stem import WordNetLemmatizer,PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import Word
# Fetch the NLTK resources used by this notebook. The downloader reports
# packages that are already up to date instead of fetching them again.
# NOTE(review): only lemmatization is visible in the cells below (presumably
# WordNet-backed) -- confirm that 'punkt' and 'stopwords' are still required.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#Reading the dataset
# Each file holds a single class, so the label is attached while loading:
# 0 = fake news, 1 = true news.
fake_news = pd.read_csv("Fake.csv").assign(label=0)
true_news = pd.read_csv("True.csv").assign(label=1)

In [3]:
#Combining the fake and true data
# ignore_index=True gives the stacked frame a unique 0..n-1 index instead of
# repeating each source file's own row numbers.
data = pd.concat([fake_news, true_news], ignore_index=True)
#Make label into categorical data
data['label'] = data['label'].astype('category')


#Combine the text data into one column
data['combined_text'] = data['title'] + '. ' + data['text']


# Keep the two modelling columns by name. Dropping positions 0-3 only works
# while the CSVs contain exactly four columns in the expected order and would
# silently keep/drop the wrong columns otherwise.
data = data[['label', 'combined_text']].copy()

In [4]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of
# keeping it as an extra column.
data = data.reset_index(drop=True)

In [5]:
# Sanity check: (rows, columns) of the combined frame.
data.shape

(44898, 2)

In [19]:
# Features: the combined title + body text, kept as a one-column DataFrame so
# the cleaned text can be added next to it as a second column.
X = data['combined_text'].to_frame()

In [20]:
# Target: the label column as a one-column DataFrame (later cells index it
# with y_train['label'] / y_test['label']).
y = data['label'].to_frame()

In [21]:
#######################################
############TEXT CLEANING##############
#####APPLIED TO BOTH TRAIN AND TEST####
#######################################
# Steps, applied in this order:
#   1. remove digits
#   2. remove words less than 3 characters
#   3. remove punctuation
#   4. lemmatize every remaining token
#
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would turn all three
# substitutions into silent no-ops. The patterns are raw strings so that
# \d, \w, \s and \b are not read as (invalid) Python string escapes.

X['clean_text'] = (
    X['combined_text']
    .str.replace(r'\d+', ' ', regex=True)             # for digits
    .str.replace(r'(\b\w{1,2}\b)', ' ', regex=True)   # for words less than 3 characters
    .str.replace(r'[^\w\s]', ' ', regex=True)         # for punctuation
)

#lemmatization
# Split on whitespace, lemmatize token by token, and re-join with single spaces.
X['clean_text'] = X['clean_text'].apply(
    lambda text: " ".join(Word(token).lemmatize() for token in text.split())
)


In [22]:
#Split to train and test dataset
# 80/20 split with a fixed seed. Every returned piece is renumbered from 0 so
# that features and labels of each split share a clean positional index.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.20, random_state=1000)
)

In [23]:
# Preview the first rows: raw combined text next to its cleaned version.
X_train.head()

Unnamed: 0,combined_text,clean_text
0,REPORT: Trump Blocked HIS OWN CAMPAIGN From V...,REPORT Trump Blocked HIS OWN CAMPAIGN From Vet...
1,Sean Spicer Just Got Into A Fight With Report...,Sean Spicer Just Got Into Fight With Reporters...
2,"Putin, Trump agree to defeat IS in Syria: Krem...",Putin Trump agree defeat Syria Kremlin DANANG ...
3,"In first remarks since retweet feud, UAE diplo...",first remark since retweet feud UAE diplomat s...
4,ICYMI: Seth Meyers Rips Apart Email from Paul...,ICYMI Seth Meyers Rips Apart Email from Paul R...


In [24]:
# Class balance of the training labels (0 = fake, 1 = true).
y_train['label'].value_counts()

0    18754
1    17164
Name: label, dtype: int64

In [25]:
# Class balance of the test labels (0 = fake, 1 = true).
y_test['label'].value_counts()

0    4727
1    4253
Name: label, dtype: int64

In [26]:
def count_vectorizer(train_data, test_data, ngram):
    """Build bag-of-words count features for the train and test splits.

    The vocabulary is fitted on the training text only; the test text is then
    transformed with that same vocabulary, so both returned frames have
    identical columns.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing a 'clean_text' column.
    ngram : tuple
        Forwarded as CountVectorizer's ``ngram_range``, e.g. ``(1, 1)``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(vectorized_train, vectorized_test)``: dense frames with one column
        per vocabulary token.
    """
    #train data
    countVec = CountVectorizer(lowercase=True, stop_words='english', min_df=0.01,
                               ngram_range=ngram, strip_accents='ascii')
    # astype('U') casts the column values to a NumPy unicode array before fitting.
    vector_train = countVec.fit_transform(train_data['clean_text'].values.astype('U'))

    # get_feature_names() was removed in scikit-learn 1.2. Prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(countVec, 'get_feature_names_out'):
        tokens = countVec.get_feature_names_out()
    else:
        tokens = countVec.get_feature_names()
    vectorized_train = pd.DataFrame(vector_train.toarray(), columns=tokens)

    #test data - only transform
    vector_test = countVec.transform(test_data['clean_text'].values.astype('U'))
    vectorized_test = pd.DataFrame(vector_test.toarray(), columns=tokens)
    return vectorized_train, vectorized_test

def tfidf_vectorizer(train_data, test_data, ngram):
    """Build TF-IDF features for the train and test splits.

    The vocabulary and IDF weights are fitted on the training text only; the
    test text is then transformed with that fitted vectorizer, so both
    returned frames have identical columns.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing a 'clean_text' column.
    ngram : tuple
        Forwarded as TfidfVectorizer's ``ngram_range``, e.g. ``(1, 1)``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(vectorized_train, vectorized_test)``: dense frames with one column
        per vocabulary token.
    """
    #train data
    tfidfVec = TfidfVectorizer(lowercase=True, stop_words='english', min_df=0.01,
                               ngram_range=ngram, strip_accents='ascii')
    # astype('U') casts the column values to a NumPy unicode array before fitting.
    vector_train = tfidfVec.fit_transform(train_data['clean_text'].values.astype('U'))

    # get_feature_names() was removed in scikit-learn 1.2. Prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(tfidfVec, 'get_feature_names_out'):
        tokens = tfidfVec.get_feature_names_out()
    else:
        tokens = tfidfVec.get_feature_names()
    vectorized_train = pd.DataFrame(vector_train.toarray(), columns=tokens)

    #test data - only transform
    vector_test = tfidfVec.transform(test_data['clean_text'].values.astype('U'))
    vectorized_test = pd.DataFrame(vector_test.toarray(), columns=tokens)
    return vectorized_train, vectorized_test


In [27]:
#Count Vectorize
# ngram is forwarded to the vectorizer as ngram_range; (1, 1) selects unigrams only.
# The vocabulary is fitted on X_train and reused unchanged for X_test.
ngram = (1,1)
train_data, test_data = count_vectorizer(X_train, X_test, ngram)

In [28]:
# Inspect the count-vectorized training matrix (pandas truncates the display).
train_data

Unnamed: 0,abc,ability,able,abortion,abroad,absolute,absolutely,abuse,accept,accepted,...,year,yemen,yes,yesterday,york,young,youth,youtube,zero,zone
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35913,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35914,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35915,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35916,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [29]:
# Inspect the count-vectorized test matrix; its columns match the training vocabulary.
test_data

Unnamed: 0,abc,ability,able,abortion,abroad,absolute,absolutely,abuse,accept,accepted,...,year,yemen,yes,yesterday,york,young,youth,youtube,zero,zone
0,0,0,1,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8975,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8976,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8977,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
8978,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0


In [30]:
#Tfidf Vectorize
# ngram is forwarded to the vectorizer as ngram_range; (1, 1) selects unigrams only.
# The TF-IDF vectorizer is fitted on X_train and reused unchanged for X_test.
ngram = (1,1)
train_data_tfidf, test_data_tfidf = tfidf_vectorizer(X_train, X_test, ngram)

In [31]:
# Inspect the TF-IDF training matrix (pandas truncates the display).
train_data_tfidf

Unnamed: 0,abc,ability,able,abortion,abroad,absolute,absolutely,abuse,accept,accepted,...,year,yemen,yes,yesterday,york,young,youth,youtube,zero,zone
0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.030859,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.034245,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
4,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35913,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
35914,0.0,0.050943,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
35915,0.0,0.000000,0.038665,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
35916,0.0,0.000000,0.000000,0.0,0.0,0.072493,0.0,0.0,0.0,0.0,...,0.023892,0.0,0.0,0.0,0.000000,0.0,0.0,0.055885,0.0,0.0


In [32]:
# Inspect the TF-IDF test matrix; its columns match the training vocabulary.
test_data_tfidf 

Unnamed: 0,abc,ability,able,abortion,abroad,absolute,absolutely,abuse,accept,accepted,...,year,yemen,yes,yesterday,york,young,youth,youtube,zero,zone
0,0.0,0.000000,0.034094,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.035655,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
1,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.070739,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
2,0.0,0.000000,0.068933,0.0,0.0,0.10937,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
3,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
4,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.181421,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8975,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
8976,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.030979,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
8977,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.077848,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
8978,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,...,0.048854,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
