In [2]:
import numpy as np 
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader API; `wv` is what the later cells use to embed tokens.
# NOTE(review): presumably fetched on first use and read from the local
# gensim cache afterwards - confirm before running on a fresh machine.
import gensim.downloader as api 
wv = api.load('word2vec-google-news-300')

In [4]:
# Read the labelled text dataset (path is relative to the working directory)
# and preview its first rows.
df = pd.read_csv('Fake_Real_Data.csv')
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [5]:
# Count the rows per class to see how balanced the two labels are.
df['label'].value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [6]:
# Encode the string labels as integers: Fake -> 0, Real -> 1.
# The dtype guard makes the cell safe to re-run: mapping an already-encoded
# column a second time would turn every label into NaN, because 0 and 1 are
# not keys of the mapping.
if not pd.api.types.is_numeric_dtype(df['label']):
    encoded = df['label'].map({'Fake' : 0, 'Real' : 1})
    # Series.map yields NaN for any value missing from the dict; fail loudly
    # here instead of letting NaN labels reach the train/test split.
    if encoded.isna().any():
        raise ValueError("'label' contains values other than 'Fake' and 'Real'")
    df['label'] = encoded

In [7]:
# Show the frame again to inspect the 'label' column after the mapping.
df

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,0
1,U.S. conservative leader optimistic of common ...,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",1
3,Court Forces Ohio To Allow Millions Of Illega...,0
4,Democrats say Trump agrees to work on immigrat...,1
...,...,...
9895,Wikileaks Admits To Screwing Up IMMENSELY Wit...,0
9896,Trump consults Republican senators on Fed chie...,1
9897,Trump lawyers say judge lacks jurisdiction for...,1
9898,WATCH: Right-Wing Pastor Falsely Credits Trum...,0


In [8]:
# Load the spaCy pipeline named 'en_core_web_lg'; `nlp` is used by
# preprocessVector for tokenisation, lemmas and stop-word/punctuation flags.
import spacy 
nlp = spacy.load('en_core_web_lg')

In [15]:
def preprocessVector(text):
    """Convert raw text into one fixed-length embedding vector.

    The text is processed with the spaCy pipeline ``nlp``. Tokens flagged as
    punctuation or stop words are dropped, and the lemmas of the remaining
    tokens are averaged with ``wv.get_mean_vector``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    numpy.ndarray
        Mean embedding of the kept lemmas, of length ``wv.vector_size``.
        An all-zero vector is returned when no token survives the filtering
        (empty, punctuation-only or stop-word-only text).
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_punct or token.is_stop)
    ]
    if not lemmas:
        # get_mean_vector raises ValueError when given no keys, which would
        # abort a whole DataFrame.apply over one degenerate row. Return a
        # zero vector of the model's size and dtype instead so every row
        # still gets a vector of the same shape.
        return np.zeros(wv.vector_size, dtype=wv.vectors.dtype)
    return wv.get_mean_vector(lemmas)

In [16]:
# Embed every document; the function is passed directly, no lambda wrapper needed.
df['vector'] = df['Text'].apply(preprocessVector)

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both partitions; the seed fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    df['vector'].values,
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

In [18]:
## The estimators need a single 2-D array, but each split is a 1-D array
## whose elements are vectors, so stack the per-row vectors along axis 0.
x_train2d, x_test2d = (np.stack(part, axis=0) for part in (x_train, x_test))

In [19]:
## Rescale the features with min/max statistics learned from the training
## split only; the same fitted scaler is then applied to both splits.
from sklearn.preprocessing import MinMaxScaler

minMax = MinMaxScaler().fit(x_train2d)
x_train2d, x_test2d = minMax.transform(x_train2d), minMax.transform(x_test2d)

In [20]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes model on the scaled training vectors,
# then predict labels for the held-out vectors.
nb = MultinomialNB().fit(x_train2d, y_train)
nbPred = nb.predict(x_test2d)

In [21]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the Naive Bayes predictions against the held-out labels.
print(
    "The classification report is: \n",
    classification_report(y_test, nbPred),
)
print(
    "The confusion matrix is: \n",
    confusion_matrix(y_test, nbPred),
)

The classification report is: 
               precision    recall  f1-score   support

           0       0.94      0.94      0.94      1000
           1       0.94      0.94      0.94       980

    accuracy                           0.94      1980
   macro avg       0.94      0.94      0.94      1980
weighted avg       0.94      0.94      0.94      1980

The confusion matrix is: 
 [[939  61]
 [ 61 919]]


In [22]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbour classifier using Euclidean distance on the scaled vectors.
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=5).fit(x_train2d, y_train)
knnPred = knn.predict(x_test2d)

In [23]:
from sklearn.metrics import accuracy_score

# Evaluate the k-NN predictions against the held-out labels.
print(
    "The classification report is: \n",
    classification_report(y_test, knnPred),
)
print(
    "The confusion matrix is: \n",
    confusion_matrix(y_test, knnPred),
)
print(
    "The accuracy is : ",
    accuracy_score(y_test, knnPred),
    "\n",
)

The classification report is: 
               precision    recall  f1-score   support

           0       0.99      0.95      0.97      1000
           1       0.95      0.99      0.97       980

    accuracy                           0.97      1980
   macro avg       0.97      0.97      0.97      1980
weighted avg       0.97      0.97      0.97      1980

The confusion matrix is: 
 [[953  47]
 [ 14 966]]
The accuracy is :  0.9691919191919192 



In [24]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is a stochastic estimator, so without a seed the fitted
# model and the metrics printed below change from run to run. Seed it with
# the same value used for the train/test split so that Restart & Run All
# reproduces the reported numbers.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train2d, y_train)
rfPred = rf.predict(x_test2d)

# Evaluate the random-forest predictions against the held-out labels.
print("The accuracy is : ", accuracy_score(y_test, rfPred), "\n")
print("The classification report is: \n", classification_report(y_test, rfPred))
print("The confusion matrix is: \n", confusion_matrix(y_test, rfPred))

The accuracy is :  0.9666666666666667 

The classification report is: 
               precision    recall  f1-score   support

           0       0.97      0.96      0.97      1000
           1       0.96      0.97      0.97       980

    accuracy                           0.97      1980
   macro avg       0.97      0.97      0.97      1980
weighted avg       0.97      0.97      0.97      1980

The confusion matrix is: 
 [[963  37]
 [ 29 951]]


In [25]:
from sklearn.ensemble import GradientBoostingClassifier

# Fix the estimator's random_state (same value as the train/test split) so
# the fitted model, and therefore the metrics printed below, are
# reproducible across kernel restarts.
clf = GradientBoostingClassifier(random_state=42)
clf.fit(x_train2d, y_train)

y_pred = clf.predict(x_test2d)

# Evaluate the gradient-boosting predictions against the held-out labels.
print(classification_report(y_test, y_pred))
print("The accuracy is : ", accuracy_score(y_test, y_pred))
print("The confusion matrix is : ", confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1000
           1       0.98      0.98      0.98       980

    accuracy                           0.98      1980
   macro avg       0.98      0.98      0.98      1980
weighted avg       0.98      0.98      0.98      1980

The accuracy is :  0.9813131313131314
The confusion matrix is :  [[979  21]
 [ 16 964]]
