In [1]:
# Adapted from: https://github.com/Shubha23/Fake-News-Detection-Text-Preprocessing-and-Classification/blob/master/fake-news-detection-text-pre-processing-using-nltk.ipynb
# and: https://github.com/manthan89-py/Fake_News_detection/blob/master/Fake%20News.ipynb

# Import necessary Python libraries, modules, etc. 
import re # for regular expressions
import string as st # for removing punctuation
import numpy as np # for linear algebra
import pandas as pd # for frame processing
import matplotlib.pyplot as plt # for data visualization
import nltk # for natural language processing
from nltk.corpus import stopwords # for removing english stopwords
from nltk.stem import WordNetLemmatizer # for term stemming
import sklearn # for predictive data analysis
from sklearn import preprocessing # for data preprocessing
from sklearn.model_selection import train_test_split # for splitting data into test/train sets
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.core.interactiveshell import InteractiveShell # to modify Jupyter notebook configuration
InteractiveShell.ast_node_interactivity = "all" # so that all outputs in a cell are returned (instead of last instance)

In [2]:
# Load the fake and true news CSVs into Pandas dataframes
true_news = pd.read_csv('True.csv') 
fake_news = pd.read_csv('Fake.csv')

# Add column for fake/true label
true_news['label'] = 'true'
fake_news['label'] = 'fake'

# Preview first 5 rows in datasets to ensure they imported properly
true_news.head()
fake_news.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",fake


In [3]:
# Combine true_news and fake_news data frames into one
dfs = [true_news, fake_news]
news_data = pd.concat(dfs)

# Initialize instance of LabelEncoder
le = preprocessing.LabelEncoder()

# Assign numerical values to column of target values (true = 1, fake = 0)
news_data['target'] = le.fit_transform(news_data['label'])

# Concatenate text columns and isolate only relevant columns for analysis (i.e., text and target)
news_data['text'] = news_data['title'] + news_data['text']
news_data = news_data[['text', 'target']]

# Check that binary values were assigned correctly
news_data.shape # data frame dimensions
news_data.head(-5) # first 5 and last 5 rows
news_data.isnull().sum() # check for null values

(44898, 2)

Unnamed: 0,text,target
0,"As U.S. budget fight looms, Republicans flip t...",1
1,U.S. military to accept transgender recruits o...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,1
3,FBI Russia probe helped by Australian diplomat...,1
4,Trump wants Postal Service to charge 'much mor...,1
...,...,...
23471,Seven Iranians freed in the prisoner swap have...,0
23472,#Hashtag Hell & The Fake Left By Dady Chery an...,0
23473,Astroturfing: Journalist Reveals Brainwashing ...,0
23474,The New American Century: An Era of FraudPaul ...,0


text      0
target    0
dtype: int64

In [4]:
# Initialize instance of WordNetLemmatizer() -- reduce words to their roots
wnl = WordNetLemmatizer()

# Download multilingual Wordnet data from OMW
nltk.download('omw-1.4')

# List of english stopwords
nltk.download('stopwords') 
stop_words = set(stopwords.words('english'))

# Download english dictionary ('wordnet')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ZLoken\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ZLoken\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ZLoken\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Define function for cleaning data
def data_cleaning(row):
    row = row.lower() # convert text into lowercase
    row = re.sub('[^a-zA-Z]', ' ', row) # remove number and special characters using regex (keep words only)
    token = row.split() # split the data and tokenize it
    news = [wnl.lemmatize(word) for word in token if not word in stop_words] # lemmatize the words and remove any stopwords (e.g., a, an, the, etc.)
    cleaned_news = ' '.join(news) # join all tokenized words with space in between 
    
    return cleaned_news

# Clean the data - might take a couple minutes to run.
news_data['text'] = news_data['text'].apply(lambda x : data_cleaning(x)) # 'text' column

# Check for null values
news_data.isnull().sum()

text      0
target    0
dtype: int64

In [6]:
# Separate news_data into predictor and response variables
X = news_data.iloc[:40000, 0] # features used to predict if news is fake or true
y = news_data.iloc[:40000, 1] # what we're trying to predict: (whether is fake(0) or true(1))

# Split the data into training and test subsets
train_data, test_data, train_target, test_target = train_test_split(X, y, random_state = 0, train_size = 0.80)

# View first 5 rows to ensure data split worked correctly
train_data.head(-5)
test_data.head(-5)
train_target.head(-5)

21370    finnish police release one knife attack suspec...
2470     federal reserve nominee quarles pledge transpa...
13767    pope meet myanmar military chief shadow rohing...
13316    bosnian croat war criminal praljak killed cyan...
4957     gop senator compare obama drug dealer republic...
                               ...                        
17089    catalan foreign affair chief say planning regi...
14650    turkey detains suspected tie coup plotter agen...
18095    watch robin williams call hypocrisy audience p...
15430    eu eye tough brexit transition termsbrussels r...
14935    green hold climate german coalition talksberli...
Name: text, Length: 31995, dtype: object

12836    german voter punish fdp leader coalition walko...
10913    obama budget envisions trillion year deficit r...
4214     penny say south korea u free trade agreement r...
8198     obama host italy renzi state visit oct washing...
9986     mainstream medium spread huge lie gop official...
                               ...                        
9201     fcc vote repeal obama net neutrality rule bomb...
10850    obama block north korea sanction bill white ho...
9461     clinton call trump unsteady presidentwashingto...
5417     queen declares rnc use music unauthorizedyeste...
4276     elizabeth warren slapped hell trump busy losin...
Name: text, Length: 7995, dtype: object

21370    1
2470     1
13767    1
13316    1
4957     0
        ..
17089    1
14650    1
18095    0
15430    1
14935    1
Name: target, Length: 31995, dtype: int32

In [7]:
# Initialize vectorizer
vectorizer = TfidfVectorizer(max_features = 50000, lowercase = False, ngram_range = (1,2))

# Vectorize training data
vec_train_data = vectorizer.fit_transform(train_data)
vec_train_data = vec_train_data.toarray()

# Vectorize test data
vec_test_data = vectorizer.transform(test_data).toarray()

# Double check dimensions
train_data.shape , test_data.shape # pre-vectorization
vec_train_data.shape , vec_test_data.shape #post-vectorization

# Double check data partitioning after split
train_target.value_counts() # balanced partition of train data
test_target.value_counts() # balanced partition of test data

((32000,), (8000,))

((32000, 50000), (8000, 50000))

1    17169
0    14831
Name: target, dtype: int64

1    4248
0    3752
Name: target, dtype: int64

In [8]:
# Store vectorized training and test data into respective dfs
training_data = pd.DataFrame(vec_train_data , columns=vectorizer.get_feature_names())
testing_data = pd.DataFrame(vec_test_data , columns= vectorizer.get_feature_names())