In [1]:
import pandas as pd

df = pd.read_csv("Fake_Real_Data.csv")
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [2]:
df.shape

(9900, 2)

In [3]:
df.label.value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [4]:
df['label_num'] = df['label'].map({'Fake':0, 'Real': 1})
df.head()

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [5]:
import spacy
nlp = spacy.load("en_core_web_md")

In [6]:
doc = nlp("Top Trump Surrogate BRUTALLY Stabs Hi")
doc.vector.shape

(300,)

In [7]:
df['vector'] = df['Text'].apply(lambda text: nlp(text).vector)

In [8]:
df.head()

Unnamed: 0,Text,label,label_num,vector
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,"[-0.71928114, 0.16101022, -0.15489662, -0.0466..."
1,U.S. conservative leader optimistic of common ...,Real,1,"[-0.69856507, 0.14338098, -0.08352064, -0.0492..."
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,"[-0.7301719, 0.22488753, -0.030448029, -0.0744..."
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,"[-0.70796, 0.20194161, -0.053331256, -0.048489..."
4,Democrats say Trump agrees to work on immigrat...,Real,1,"[-0.71223575, 0.16067797, -0.08119477, -0.0351..."


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.vector.values,
    df.label_num,
    test_size = 0.2,
    random_state = 2022
)

In [10]:
X_train.shape

(7920,)

In [11]:
X_train

array([array([-6.81998253e-01,  2.16981739e-01, -9.03688371e-02, -1.55177694e-02,
              -7.80234486e-02, -2.71199793e-02, -7.35975010e-03, -1.13568194e-01,
               1.37790265e-02,  1.92417097e+00, -2.44862616e-01, -5.68522215e-02,
               2.03121640e-02,  5.78129925e-02, -1.54516980e-01, -2.08884850e-02,
              -6.85569420e-02,  8.78252149e-01, -1.31838828e-01,  1.94048379e-02,
              -5.56235500e-02,  5.72940521e-02,  1.45094758e-02, -1.94860280e-01,
              -1.27984853e-02, -2.65243696e-03, -7.03872368e-02, -7.05652870e-03,
              -3.37544642e-02, -1.02806285e-01, -8.01965669e-02,  4.61184718e-02,
              -1.27025638e-02, -7.39830136e-02,  8.98446441e-02,  1.40729979e-01,
              -9.99258608e-02, -4.85069072e-03, -4.24495488e-02, -7.02416599e-02,
               6.91987202e-02, -4.20149155e-02,  1.26300812e-01, -1.93088278e-02,
               4.34144661e-02, -5.75309955e-02,  6.81409705e-03, -4.52972390e-02,
              -1

In [12]:
import numpy as np
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)
clf = MultinomialNB()
clf.fit(scaled_train_embed, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [20]:
from sklearn.metrics import classification_report

y_pred = clf.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.92      0.91      1024
           1       0.91      0.90      0.91       956

    accuracy                           0.91      1980
   macro avg       0.91      0.91      0.91      1980
weighted avg       0.91      0.91      0.91      1980



In [22]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean')

clf.fit(X_train_2d, y_train)
y_pred = clf.predict(X_test_2d)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      1024
           1       0.96      0.97      0.97       956

    accuracy                           0.97      1980
   macro avg       0.97      0.97      0.97      1980
weighted avg       0.97      0.97      0.97      1980

