In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
import warnings
warnings.simplefilter('ignore')

In [2]:
df=pd.read_csv('fake_and_real_news.csv')
print(df.shape)
df.head()

(9900, 2)


Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [3]:
df.label.value_counts()

Fake    5000
Real    4900
Name: label, dtype: int64

In [4]:
df['label_num']=df.label.map({'Fake':0,'Real':1})

In [5]:
df.head(5)

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [6]:
import spacy
nlp=spacy.load('en_core_web_lg')

In [7]:
def preprocess(text):
    doc=nlp(text)
    filtered_tokens=[]
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
        
    return " ".join(filtered_tokens)

In [8]:
import gensim.downloader as api
wv=api.load("word2vec-google-news-300")

In [9]:
def preprocess_vector(text):
    doc=nlp(text)
    filtered_tokens=[]
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
        
    return wv.get_mean_vector(filtered_tokens)

In [13]:
text="Don't worry if you don't understand"

In [14]:
preprocess(text)

'worry understand'

In [15]:
preprocess_vector(text)

array([ 0.00235079, -0.00284596, -0.03638233,  0.00413919, -0.10635224,
        0.05758579,  0.13348952, -0.00689176,  0.05995331, -0.02875906,
       -0.02279907, -0.06939262, -0.01549996,  0.03538099, -0.0873947 ,
        0.10044542,  0.02490648,  0.11501945,  0.00442711, -0.07228819,
       -0.04447255,  0.02785169,  0.03338735,  0.02222995,  0.07053161,
        0.06723307,  0.03483712, -0.0873695 ,  0.04152397, -0.0969665 ,
       -0.00914938,  0.00555944, -0.0277799 , -0.00701522,  0.05281431,
       -0.01066206,  0.02466576,  0.02378148,  0.0279402 ,  0.05527755,
        0.01359304, -0.01062085,  0.06734448, -0.03492254, -0.08421434,
       -0.04324378, -0.03578918, -0.00817786, -0.02093195,  0.01856531,
       -0.07064191,  0.05023994, -0.03036207, -0.0412168 , -0.00684169,
        0.05189689, -0.04097777, -0.05789134,  0.044417  , -0.0470418 ,
       -0.01475445,  0.01373999, -0.00459672,  0.00419459,  0.04307397,
       -0.01500929, -0.08037488,  0.05358911, -0.0211829 , -0.00

In [23]:
wv.get_mean_vector(['worry','undertsand'],pre_normalize=False)[:3]

array([0.0409246 , 0.01449795, 0.0072031 ], dtype=float32)

In [18]:
v1=wv['worry']
v2=wv['understand']

In [22]:
np.mean([v1,v2],axis=0)[:3]

array([ 0.00976562, -0.00561523, -0.08905029], dtype=float32)

In [25]:
df['vector']=df['Text'].apply(lambda text: nlp(text).vector)

#by nlp text preprocessing into vector it will take 10 15 min

In [27]:
#by gensim text preprocessing into vector it will taker 10 15 min
df['gensim_vector']=df.Text.apply(lambda text: preprocess_vector(text))

In [30]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df.Text,df.label_num,test_size=.2,random_state=2022,stratify=df.label_num)

In [31]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [32]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [33]:
from sklearn.metrics import classification_report,confusion_matrix

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [35]:
clf=Pipeline([
    ('v',CountVectorizer()),
    ('m',MultinomialNB())
])
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1000
           1       0.97      0.98      0.98       980

    accuracy                           0.98      1980
   macro avg       0.98      0.98      0.98      1980
weighted avg       0.98      0.98      0.98      1980

[[975  25]
 [ 24 956]]


In [36]:
clf=Pipeline([
    ('v',CountVectorizer()),
    ('m',DecisionTreeClassifier())
])
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1000
           1       1.00      1.00      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980

[[1000    0]
 [   2  978]]


In [37]:
clf=Pipeline([
    ('v',CountVectorizer()),
    ('m',KNeighborsClassifier(n_neighbors=5,metric='euclidean'))
])
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.92      0.94      1000
           1       0.92      0.97      0.95       980

    accuracy                           0.94      1980
   macro avg       0.95      0.94      0.94      1980
weighted avg       0.95      0.94      0.94      1980

[[923  77]
 [ 33 947]]


In [38]:
clf=Pipeline([
    ('v',CountVectorizer()),
    ('m',RandomForestClassifier())
])
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1000
           1       1.00      1.00      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980

[[998   2]
 [  1 979]]


In [39]:
clf=Pipeline([
    ('v',CountVectorizer()),
    ('m',GradientBoostingClassifier())
])
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1000
           1       1.00      1.00      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980

[[1000    0]
 [   1  979]]


# Now we used TFidfvectorizer for text classification

In [41]:
clf=Pipeline([
    ('v',TfidfVectorizer()),
    ('m',MultinomialNB())
])
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      1000
           1       0.97      0.96      0.97       980

    accuracy                           0.97      1980
   macro avg       0.97      0.97      0.97      1980
weighted avg       0.97      0.97      0.97      1980

[[970  30]
 [ 37 943]]


In [42]:
clf=Pipeline([
    ('v',TfidfVectorizer()),
    ('m',DecisionTreeClassifier())
])
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1000
           1       1.00      1.00      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980

[[999   1]
 [  1 979]]


In [43]:
clf=Pipeline([
    ('v',TfidfVectorizer()),
    ('m',GradientBoostingClassifier())
])
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1000
           1       1.00      1.00      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980

[[999   1]
 [  1 979]]


In [44]:
clf=Pipeline([
    ('v',TfidfVectorizer()),
    ('m',RandomForestClassifier())
])
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1000
           1       1.00      0.99      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980

[[1000    0]
 [   5  975]]


In [45]:
df.head(5)

Unnamed: 0,Text,label,label_num,vector,gensim_vector
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,"[-0.6759837, 1.4263071, -2.318466, -0.451093, ...","[0.008657642, 0.019024342, -0.011917442, 0.032..."
1,U.S. conservative leader optimistic of common ...,Real,1,"[-1.8355803, 1.3101058, -2.4919677, 1.0268308,...","[0.010864096, 0.007960429, 0.0011915653, 0.014..."
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,"[-1.9851209, 0.14389805, -2.4221718, 0.9133005...","[0.018134918, 0.0062743523, -0.005872244, 0.03..."
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,"[-2.7812982, -0.16120885, -1.609772, 1.3624227...","[0.01255197, 0.012613623, 5.9780963e-05, 0.021..."
4,Democrats say Trump agrees to work on immigrat...,Real,1,"[-2.2010763, 0.9961637, -2.4088492, 1.128273, ...","[-0.0019059887, 0.011889367, 0.0035395357, 0.0..."


In [46]:
train_x,test_x,train_y,test_y=train_test_split(df.vector.values,df.label_num,test_size=.2,random_state=2022,stratify=df.label_num)

In [50]:
x_train_2d=np.stack(train_x)
x_test_2d=np.stack(test_x)

In [52]:
x_train_2d.shape

(7920, 300)

In [54]:
S=MinMaxScaler()
# It used to remove negative number into the array

In [55]:
x_train_scaled=S.fit_transform(x_train_2d)
x_test_scaled=S.fit_transform(x_test_2d)

In [56]:
x_train_scaled

array([[0.694047  , 0.55079013, 0.37440637, ..., 0.63519925, 0.21256268,
        0.550342  ],
       [0.63887703, 0.6380881 , 0.339745  , ..., 0.45948255, 0.17975926,
        0.54529274],
       [0.86002624, 0.7407522 , 0.33848292, ..., 0.56603384, 0.23786712,
        0.6054081 ],
       ...,
       [0.42227048, 0.5526547 , 0.44092816, ..., 0.20199436, 0.5265158 ,
        0.44908392],
       [0.554439  , 0.591344  , 0.29670918, ..., 0.3465815 , 0.4828024 ,
        0.5625617 ],
       [0.5523455 , 0.47177774, 0.46222222, ..., 0.56748337, 0.8158294 ,
        0.20865215]], dtype=float32)

In [57]:
R=RandomForestClassifier()
R.fit(x_train_scaled,train_y)

In [58]:
y_pred=R.predict(x_test_scaled)

In [59]:
print(classification_report(test_y,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      1000
           1       0.99      1.00      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980



In [60]:
train_x,test_x,train_y,test_y=train_test_split(df.gensim_vector.values,df.label_num,test_size=.2,random_state=2022,stratify=df.label_num)

In [61]:
x_train_2d=np.stack(train_x)
x_test_2d=np.stack(test_x)

In [62]:
x_train_scaled=S.fit_transform(x_train_2d)
x_test_scaled=S.fit_transform(x_test_2d)

In [63]:
R=RandomForestClassifier()
R.fit(x_train_scaled,train_y)

In [64]:
y_pred=R.predict(x_test_scaled)
print(classification_report(test_y,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.94      0.97      1000
           1       0.94      0.99      0.97       980

    accuracy                           0.97      1980
   macro avg       0.97      0.97      0.97      1980
weighted avg       0.97      0.97      0.97      1980



In [65]:
df['preprocessed_text']=df['Text'].apply(lambda text: preprocess(text))

In [68]:
x_train,x_test,y_train,y_test=train_test_split(df.preprocessed_text,df.label_num,test_size=.2,random_state=2022,stratify=df.label_num)

In [69]:
x_train

5454      Trump Brags Beautiful Chocolate Cake eat att...
2881      Backstreet Boys send Harsh message Trump use...
5948      Fox Host give Trump PATHETICALLY desperate a...
7576    Trump announce push speed desperately need inf...
539       WATCH Jon Voight give divisive Speech Trump ...
                              ...                        
4192      Hillary completely Blew away woman receive t...
4531      elderly California Man Savaged GOP congressm...
688     Trump administration defend travel ban Supreme...
9262    obamacare whiplash leave state insurer duel pr...
9145    Trump say suspect gunman shoot lawmaker dead W...
Name: preprocessed_text, Length: 7920, dtype: object

In [71]:
clf=Pipeline([
    ('v',TfidfVectorizer()),
    ('m',GradientBoostingClassifier())
])
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1000
           1       1.00      1.00      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980

[[998   2]
 [  1 979]]


In [72]:
clf=Pipeline([
    ('v',CountVectorizer()),
    ('m',GradientBoostingClassifier())
])
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1000
           1       1.00      1.00      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980

[[998   2]
 [  1 979]]


In [73]:
df['Text'][0]

' Top Trump Surrogate BRUTALLY Stabs Him In The Back: ‘He’s Pathetic’ (VIDEO) It s looking as though Republican presidential candidate Donald Trump is losing support even from within his own ranks. You know things are getting bad when even your top surrogates start turning against you, which is exactly what just happened on Fox News when Newt Gingrich called Trump  pathetic. Gingrich knows that Trump needs to keep his focus on Hillary Clinton if he even remotely wants to have a chance at defeating her. However, Trump has hurt feelings because many Republicans don t support his sexual assault against women have turned against him, including House Speaker Paul Ryan (R-WI). So, that has made Trump lash out as his own party.Gingrich said on Fox News: Look, first of all, let me just say about Trump, who I admire and I ve tried to help as much as I can. There s a big Trump and a little Trump. The little Trump is frankly pathetic. I mean, he s mad over not getting a phone call? Trump s referr

In [75]:
df['preprocessed_text'][0]
#here remove all stop word and punctiation in statement

'  Trump Surrogate BRUTALLY Stabs Pathetic video s look republican presidential candidate Donald Trump lose support rank know thing get bad surrogate start turn exactly happen Fox News Newt Gingrich call Trump   pathetic Gingrich know Trump need focus Hillary Clinton remotely want chance defeat Trump hurt feeling Republicans don t support sexual assault woman turn include House Speaker Paul Ryan R WI Trump lash party Gingrich say Fox News look let Trump admire ve try help s big trump little Trump little Trump frankly pathetic mean s mad get phone Trump s refer fact Paul Ryan didn t congratulate debate probably didn t win despite Trump s ego tell Gingrich add Donald Trump opponent Hillary Clinton Paul Ryan s anybody Trump doesn t realize person mad truly bad enemy ultimately lead defeat blame watch Politico Featured Photo Joe Raedle Getty Images'