In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(r"F:\ML projects\Fake News\news\news.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
df.corr

<bound method DataFrame.corr of       Unnamed: 0                                              title  \
0           8476                       You Can Smell Hillary’s Fear   
1          10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2           3608        Kerry to go to Paris in gesture of sympathy   
3          10142  Bernie supporters on Twitter erupt in anger ag...   
4            875   The Battle of New York: Why This Primary Matters   
...          ...                                                ...   
6330        4490  State Department says it can't find emails fro...   
6331        8062  The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...   
6332        8622  Anti-Trump Protesters Are Tools of the Oligarc...   
6333        4021  In Ethiopia, Obama seeks progress on peace, se...   
6334        4330  Jeb Bush Is Suddenly Attacking Trump. Here's W...   

                                                   text label  
0     Daniel Greenfield, a Shillman Journalism Fell

In [5]:
df.isna()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
6330,False,False,False,False
6331,False,False,False,False
6332,False,False,False,False
6333,False,False,False,False


In [6]:
#no null values in df

In [7]:
#lets clean the text by removing stopwords

In [10]:
from nltk.corpus import stopwords
import re

In [31]:
stop_words = set(stopwords.words("english"))

def cleanTxt(tx):
    tx = tx.lower()
    ts = re.findall(r"\b\w+\b", tx)
    tl = [word for word in ts if word not in stop_words]
    ctx = " ".join(tl)
    return ctx

df["no_stopwords"] = df["text"].apply(cleanTxt)

In [32]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,no_stopwords
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,daniel greenfield shillman journalism fellow f...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,google pinterest digg linkedin reddit stumbleu...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,u secretary state john f kerry said monday sto...
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,kaydee king kaydeeking november 9 2016 lesson ...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,primary day new york front runners hillary cli...


In [41]:
"""
now we create a pipeline with features Word2Vector and our classifier model
Word2Vector is a complex algorithm that learns vector representations (embeddings) of words based on their context in a large corpus of text. 
These embeddings show how words are related semantically and grammatically. 
"""

from gensim.models import Word2Vec
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

In [49]:
#create a custom Word2Vector transformer
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vector_size=100, window=5, min_count=1, workers=4):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.model = None

    def fit(self, X, y=None, **fit_params):
        sentences = [sentence.split() for sentence in X]
        self.model = Word2Vec(sentences, vector_size=self.vector_size, window=self.window, min_count=self.min_count, workers=self.workers)
        return self

    def transform(self, X):
        return np.array([self._get_feature_vector(sentence) for sentence in X])

    def _get_feature_vector(self, sentence):
        words = sentence.split()
        words = [word for word in words if word in self.model.wv]
        if not words:
            return np.zeros(self.vector_size)
        return np.mean([self.model.wv[word] for word in words], axis=0)
        

In [50]:
pipe = Pipeline(
    [
        ('Word2Vec', Word2VecTransformer(vector_size=100, window=5, min_count=1, workers=4)),
        ('classifier', LogisticRegression(max_iter=1000))
    ]
)

In [None]:
#lets train the data

In [51]:
X = df["no_stopwords"]
y = df["label"]

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
pipe.fit(X_train, y_train)

In [54]:
prediction = pipe.predict(X_test)

In [56]:
accuracy = accuracy_score(y_test, prediction)
print(accuracy)

0.8902920284135754


In [60]:
cm = confusion_matrix(y_test, prediction)
cr = classification_report(y_test, prediction)
print("Confusion Matrix: ")
print(cm)
print("\n")
print('Classification Report')
print(cr)

Confusion Matrix: 
[[554  74]
 [ 65 574]]


Classification Report
              precision    recall  f1-score   support

        FAKE       0.89      0.88      0.89       628
        REAL       0.89      0.90      0.89       639

    accuracy                           0.89      1267
   macro avg       0.89      0.89      0.89      1267
weighted avg       0.89      0.89      0.89      1267



In [64]:
"""
test our model with custom data
"""
test1 = ["This news is fake"]

pred2 = pipe.predict(test1)
print(pred2)

['FAKE']
