In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import spacy

In [2]:
# Directory holding Fake.csv and True.csv. It can be overridden through the
# FAKE_NEWS_DATA_DIR environment variable so the notebook runs on machines
# other than the author's; the default is the original location.
DATA_DIR = Path(
    os.environ.get(
        "FAKE_NEWS_DATA_DIR",
        "/Users/baobach/ML-Projects/ml-fake-news/data",
    )
)

# One frame per source file; the label column is added in the next cell.
fake_df = pd.read_csv(DATA_DIR / "Fake.csv")
true_df = pd.read_csv(DATA_DIR / "True.csv")

In [3]:
# Attach the binary target column 'True':
# 0 for rows loaded from Fake.csv, 1 for rows loaded from True.csv.
for frame, label in ((fake_df, 0), (true_df, 1)):
    frame['True'] = label

In [4]:
# Stack the two labelled frames into a single dataset (fake rows first).
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each half keeps its own 0-based labels, so low labels occur twice and a
# label-based lookup such as df.loc[0] would return two rows.
df = pd.concat([fake_df, true_df], axis=0, ignore_index=True)
df.head()

Unnamed: 0,title,text,subject,date,True
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [5]:
# Sanity check: the combined frame should have as many rows as both inputs
# together and the same number of columns.
for caption, frame in (
    ("Shape of fake_df: ", fake_df),
    ("Shape of true_df: ", true_df),
    ("Shape of df: ", df),
):
    print(caption, frame.shape)

Shape of fake_df:  (23481, 5)
Shape of true_df:  (21417, 5)
Shape of df:  (44898, 5)


In [6]:
# Only the article body ('text') and the label ('True') are used below, so the
# headline, subject tag and publication date are removed.
unused_columns = ['title', 'subject', 'date']
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,text,True
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [7]:
# Class balance: number of rows per label value (0 = Fake.csv, 1 = True.csv).
label_counts = df['True'].value_counts()
label_counts

0    23481
1    21417
Name: True, dtype: int64

In [8]:
# Load spaCy's small trained English pipeline, "en_core_web_sm". Its
# Doc.vector output is what the next cell stores as the feature vector.
# NOTE(review): this is the small model, not en_core_web_lg. Per spaCy's model
# documentation the *_sm packages ship without static word vectors, so confirm
# whether en_core_web_md / en_core_web_lg was intended here.
nlp = spacy.load("en_core_web_sm")

In [9]:
# Represent every article by a single dense vector (spaCy's Doc.vector).
# nlp.pipe streams the texts through the pipeline in batches, which spaCy
# documents as much more efficient than calling nlp(text) separately for each
# of the ~45k rows.
# The vectors are assigned as a plain list, i.e. by position, so the result
# does not depend on df's index labels.
df['text_vector'] = [doc.vector for doc in nlp.pipe(df['text'], batch_size=64)]
df.head()

Unnamed: 0,text,True,text_vector
0,Donald Trump just couldn t wish all Americans ...,0,"[0.19154578, 0.1168541, 0.05546496, -0.1568120..."
1,House Intelligence Committee Chairman Devin Nu...,0,"[0.29478067, -0.0013799303, 0.14750189, -0.076..."
2,"On Friday, it was revealed that former Milwauk...",0,"[0.2153904, 0.056878168, 0.068169765, -0.20057..."
3,"On Christmas day, Donald Trump announced that ...",0,"[0.20874538, 0.059322342, 0.07611682, -0.20267..."
4,Pope Francis used his annual Christmas Day mes...,0,"[0.19050713, 0.08093774, 0.23777202, -0.152757..."


In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
features = df['text_vector']
labels = df['True']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Report the size of each split. The X splits are still 1-D Series whose
# elements are vectors, so their shape has a single dimension here.
for caption, split in (
    ("Shape of X_train: ", X_train),
    ("Shape of X_test: ", X_test),
    ("Shape of y_train: ", y_train),
    ("Shape of y_test: ", y_test),
):
    print(caption, split.shape)

Shape of X_train:  (31428,)
Shape of X_test:  (13470,)
Shape of y_train:  (31428,)
Shape of y_test:  (13470,)


In [12]:
# Each element of X_train / X_test is one vector; stacking them along a new
# first axis yields one 2-D array per split (one row per article), the layout
# the estimators below are fitted on.
X_train_convert, X_test_convert = (
    np.stack(split) for split in (X_train, X_test)
)

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, then apply the
# same rescaling to both splits so no statistics leak from the test data.
# NOTE(review): presumably the scaling exists because MultinomialNB does not
# accept negative feature values — confirm.
scaler = MinMaxScaler().fit(X_train_convert)
X_train_convert_scaled = scaler.transform(X_train_convert)
X_test_convert_scaled = scaler.transform(X_test_convert)

# Baseline classifier: multinomial Naive Bayes on the rescaled vectors.
clf = MultinomialNB()
clf.fit(X_train_convert_scaled, y_train)

In [14]:
from sklearn.metrics import classification_report

# Score the Naive Bayes model on the held-out split and print per-class
# precision / recall / F1.
y_prediction = clf.predict(X_test_convert_scaled)
nb_report = classification_report(y_test, y_prediction)
print(nb_report)

              precision    recall  f1-score   support

           0       0.56      0.99      0.72      7091
           1       0.94      0.14      0.25      6379

    accuracy                           0.59     13470
   macro avg       0.75      0.57      0.48     13470
weighted avg       0.74      0.59      0.50     13470



In [15]:
from sklearn.tree import DecisionTreeClassifier

# Second model: a decision tree fitted on the same rescaled training vectors.
# random_state is pinned so repeated runs build the same tree.
tree_params = {"random_state": 0}
clf_tree = DecisionTreeClassifier(**tree_params)
clf_tree.fit(X_train_convert_scaled, y_train)

In [16]:
# Score the decision tree on the same held-out split, so its report is directly
# comparable with the Naive Bayes one.
y_prediction_tree = clf_tree.predict(X_test_convert_scaled)
tree_report = classification_report(y_test, y_prediction_tree)
print(tree_report)

              precision    recall  f1-score   support

           0       0.88      0.91      0.90      7091
           1       0.89      0.87      0.88      6379

    accuracy                           0.89     13470
   macro avg       0.89      0.89      0.89     13470
weighted avg       0.89      0.89      0.89     13470

