In [1]:
import pandas as pd
# Read the labelled news dataset from disk and preview the first rows.
csv_path = './data/Fake_Real_Data.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [2]:
# Load the 'word2vec-google-news-300' vectors through gensim's downloader API.
# `wv` is the lookup object used by the later cells for word similarity,
# single-word vectors, and mean document vectors.
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [3]:
# Sanity check: similarity score between two related words.
wv.similarity('great', 'good')

0.729151

In [5]:
# Look up the embedding of a single word and inspect its shape to confirm
# the dimensionality of the vectors.
wv_great = wv['great']
wv_great.shape

(300,)

In [6]:
# Count the rows per class to check how balanced the dataset is.
df['label'].value_counts()

Fake    5000
Real    4900
Name: label, dtype: int64

In [7]:
# Encode the string labels as integers for the classifier: Fake -> 0, Real -> 1.
label_to_num = {'Fake': 0, 'Real': 1}
df['label_num'] = df['label'].map(label_to_num)
df.head()

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [8]:
# Load spaCy's 'en_core_web_lg' pipeline. `nlp` is called on raw text in
# preprocess_and_vectorize, which reads each token's lemma_, is_stop and
# is_punct attributes.
# NOTE(review): only those three token attributes are consumed in this
# notebook; components such as the parser/NER could probably be disabled at
# load time for speed -- confirm lemmas are unaffected before changing.
import spacy
nlp = spacy.load('en_core_web_lg')

In [20]:
def preprocess_and_vectorize(text):
    """Turn a raw text into a single document vector.

    The text is run through the module-level spaCy pipeline ``nlp``;
    punctuation and stop-word tokens are dropped and the remaining tokens
    are replaced by their lemmas. The lemmas are then averaged with
    ``wv.get_mean_vector`` (module-level gensim vectors ``wv``).

    Parameters
    ----------
    text : str
        Raw text to vectorize.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``wv.vector_size``. If no token survives the
        filtering (empty text, or only stop words / punctuation), a zero
        vector is returned instead of letting ``get_mean_vector`` raise
        ``ValueError`` on an empty key list.
    """
    import numpy as np

    doc = nlp(text)

    # Keep only the lemmas of content-bearing tokens.
    filtered_tokens = [
        token.lemma_
        for token in doc
        if not (token.is_punct or token.is_stop)
    ]

    # get_mean_vector refuses an empty list; a zero vector keeps a bulk
    # `.apply` over a whole column from aborting on one degenerate row.
    if not filtered_tokens:
        return np.zeros(wv.vector_size, dtype=wv.vectors.dtype)

    return wv.get_mean_vector(filtered_tokens)

In [21]:
# Smoke-test the vectorizer on a short sentence and inspect the output shape.
sample_vec = preprocess_and_vectorize("don't worry if you don't understand")
sample_vec.shape

(300,)

In [23]:
# Vectorize every article; each row's text goes through the spaCy pipeline
# and is stored as one document vector per row.
df['text_vec'] = df['Text'].apply(preprocess_and_vectorize)

In [24]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing, stratified on the numeric label so
# both splits keep the same class proportions; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    df['text_vec'].values,
    df['label_num'],
    test_size=0.2,
    stratify=df['label_num'],
    random_state=2023,
)

In [27]:
import numpy as np
# x_train / x_test are arrays whose elements are per-document vectors;
# stack them along a new first axis to get one 2-D feature matrix each.
x_train_2d = np.stack(x_train, axis=0)
x_test_2d = np.stack(x_test, axis=0)

In [30]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Scale the features to [0, 1] and fit a gradient-boosted tree classifier.
# The model is seeded with the same value as the train/test split so the
# reported metrics are reproducible on a fresh "Restart & Run All".
# NOTE(review): tree-based models are generally insensitive to feature
# scaling, so the MinMaxScaler step is likely redundant -- kept as-is to
# preserve the original pipeline; confirm before removing.
clf = Pipeline([
    ('scaler', MinMaxScaler()),
    ('model', GradientBoostingClassifier(random_state=2023)),
])

clf.fit(x_train_2d, y_train)

# Evaluate on the held-out split.
y_pred = clf.predict(x_test_2d)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1000
           1       0.97      0.98      0.98       980

    accuracy                           0.98      1980
   macro avg       0.98      0.98      0.98      1980
weighted avg       0.98      0.98      0.98      1980

