In [3]:
#import pandas library
import pandas as pd

#read the dataset with name "Fake_Real_Data.csv" and store it in a variable df
df = pd.read_csv('fake_or_real_news.csv')

#print the shape of dataframe
df.shape

#print top 5 rows
df.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
#check the distribution of labels
df.label.value_counts()

label
REAL    3171
FAKE    3164
Name: count, dtype: int64

In [5]:
# Since it is imbalanced, select any balancing technique to make it balanced. 
#I've choosen undersampling. Anyhow it's almost equal. You can skip balancing technique also

In [6]:
min_sampling= 3164
df_real= df[df.label == 'REAL'].sample(min_sampling, random_state= 2022)
df_fake= df[df.label == 'FAKE'].sample(min_sampling, random_state= 2022)

In [7]:
df_balanced= pd.concat([df_fake, df_real], axis=0)
df_fake.shape

(3164, 4)

In [8]:
df_real.shape

(3164, 4)

In [9]:
df_balanced.label.value_counts()

label
FAKE    3164
REAL    3164
Name: count, dtype: int64

In [51]:
#Add the new column "label_num" which gives a unique number to each of these labels 
df_balanced['label_num'] = df_balanced['label'].apply(lambda x:1 if x== 'REAL' else 0)

#check the results with top 5 rows
df_balanced.head(5)
df_balanced.tail(5)

Unnamed: 0.1,Unnamed: 0,title,text,label,label_num
1055,2806,"A nuclear deal has been reached, but Iran must...",JUST AS negotiators were completing an agreeme...,REAL,1
4664,2146,Bad idea: Shell’s gearing up to start drilling...,After suspending its Arctic program for years ...,REAL,1
6062,1307,5 takeaways from New Hampshire,"Killing Obama administration rules, dismantlin...",REAL,1
2010,7255,Russia WW3 Weapon: Nikola Tesla’s Death Ray In...,New details are emerging about Nikola Tesla’s ...,FAKE,0
1996,6101,“Beware of the Shadow Government”: Ron Paul Ad...,\nThis article was written and originally publ...,FAKE,0


# Modelling without Pre-processing Text data

In [11]:
# import train-test-split from sklearn
import sklearn
from sklearn.model_selection import train_test_split

# Do the 'train-test' splitting with test size of 20% with random state of 2022 and stratify sampling too
X_train, X_test, y_train, y_test= train_test_split(df_balanced.label, df_balanced.label_num, 
                                                   test_size=0.2, random_state= 20, 
                                                   stratify= df_balanced.label_num)

In [13]:
# print the shapes of X_train and X_test
X_train.shape

(5062,)

In [14]:
X_test.shape

(1266,)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# create a pipeline object
clf = Pipeline([['Vectorization', CountVectorizer()], 
               ['nb', MultinomialNB()]])

# fit with X_train and y_train
clf.fit(X_test, y_test)


In [19]:
# get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)

In [21]:
# print the classfication report
print(classification_report(y_test,
    y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       633
           1       1.00      1.00      1.00       633

    accuracy                           1.00      1266
   macro avg       1.00      1.00      1.00      1266
weighted avg       1.00      1.00      1.00      1266



In [22]:
# Attempt 2 :

# using the sklearn pipeline module create a classification pipeline to classify the Data.
# Note:

# using CountVectorizer with unigram, bigram, and trigrams.
# use KNN as the classifier with n_neighbors of 10 and metric as 'cosine' distance.
# print the classification report.

In [52]:
# Unigram - KNeighborsClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report


# create a pipeline object
clf = Pipeline([['Vectorization', CountVectorizer(ngram_range=(1, 1))], 
               ['KNN', KNeighborsClassifier(n_neighbors=10, metric = 'euclidean')]])

# fit with X_train and y_train
clf.fit(X_test, y_test)

y_pred = clf.predict(X_test)

# print the classfication report
print(classification_report(y_test,
    y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       633
           1       1.00      1.00      1.00       633

    accuracy                           1.00      1266
   macro avg       1.00      1.00      1.00      1266
weighted avg       1.00      1.00      1.00      1266



In [26]:
# bigram - KNeighborsClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report


# create a pipeline object
clf = Pipeline([['Vectorization', CountVectorizer(ngram_range=(1,2))], 
               ['KNN', KNeighborsClassifier()]])

# fit with X_train and y_train
clf.fit(X_test, y_test)

y_pred = clf.predict(X_test)

# print the classfication report
print(classification_report(y_test,
    y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       633
           1       1.00      1.00      1.00       633

    accuracy                           1.00      1266
   macro avg       1.00      1.00      1.00      1266
weighted avg       1.00      1.00      1.00      1266



In [53]:
# trigram - KNeighborsClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report


# create a pipeline object
clf = Pipeline([['Vectorization', CountVectorizer(ngram_range=(1,3))], 
               ['KNN', KNeighborsClassifier(n_neighbors = 10, metric = 'cosine')]])

# fit with X_train and y_train
clf.fit(X_test, y_test)

y_pred = clf.predict(X_test)

# print the classfication report
print(classification_report(y_test,
    y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       633
           1       1.00      1.00      1.00       633

    accuracy                           1.00      1266
   macro avg       1.00      1.00      1.00      1266
weighted avg       1.00      1.00      1.00      1266



In [29]:
# Attempt 3 :

# using the sklearn pipeline module create a classification pipeline to classify the Data.
# Note:

# using CountVectorizer with only trigrams.
# use RandomForest as the classifier.
# print the classification report.

In [33]:
# trigram - RandomForestClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# create a pipeline object
clf = Pipeline([['Vectorization', CountVectorizer(ngram_range=(1,3))], 
               ['RF', RandomForestClassifier()]])

# fit with X_train and y_train
clf.fit(X_test, y_test)

y_pred = clf.predict(X_test)

# print the classfication report
print(classification_report(y_test,
    y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       633
           1       1.00      1.00      1.00       633

    accuracy                           1.00      1266
   macro avg       1.00      1.00      1.00      1266
weighted avg       1.00      1.00      1.00      1266



In [34]:
# Bigram - naive_bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# create a pipeline object
clf = Pipeline([['Vectorization', CountVectorizer(ngram_range=(1,2))], 
               ['nb', MultinomialNB()]])

# fit with X_train and y_train
clf.fit(X_test, y_test)

y_pred = clf.predict(X_test)

# print the classfication report
print(classification_report(y_test,
    y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       633
           1       1.00      1.00      1.00       633

    accuracy                           1.00      1266
   macro avg       1.00      1.00      1.00      1266
weighted avg       1.00      1.00      1.00      1266



In [37]:
# Unigram - naive_bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# create a pipeline object
clf = Pipeline([['Vectorization', CountVectorizer(ngram_range=(1,1))], 
               ['nb', MultinomialNB(alpha=0.75)]])

# fit with X_train and y_train
clf.fit(X_test, y_test)

y_pred = clf.predict(X_test)

# print the classfication report
print(classification_report(y_test,
    y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       633
           1       1.00      1.00      1.00       633

    accuracy                           1.00      1266
   macro avg       1.00      1.00      1.00      1266
weighted avg       1.00      1.00      1.00      1266



# Modelling after Pre-processing Text data

In [42]:
import spacy

nlp= spacy.load("en_core_web_sm")
filtered_token= []
def preprocess(text):
    doc= nlp(text)
    for token in doc:
        if token.is_punct or token.is_stop:
            continue
        filtered_token.append(token.lemma_)
    return " ".join(filtered_token)

In [48]:
# create a new column "preprocessed_txt" and use the utility function above to get the clean data
# this will take some time, please be patient

In [50]:
df_balanced['processed_text'] = df_balanced.text.apply(preprocess)

MemoryError: Unable to allocate 3.24 MiB for an array with shape (830, 1024) and data type float32

In [None]:
df_balanced.head(5)

In [None]:
# Build a model with pre processed text
# import train-test-split from sklearn
import sklearn
from sklearn.model_selection import train_test_split


# Do the 'train-test' splitting with test size of 20% with random state of 2022 and stratify sampling too
X_train, X_test, y_train, y_test= train_test_split(df_balanced.label, df_balanced.label_num, 
                                                   test_size=0.2, random_state= 20, 
                                                   stratify= df_balanced.label_num)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# create a pipeline object
clf = Pipeline([['Vectorization', CountVectorizer()], 
               ['nb', MultinomialNB()]])

# fit with X_train and y_train
clf.fit(X_test, y_test)

# predict using X_test
y_pred = clf.predict(X_test)

# print the classfication report
print(classification_report(y_test,
    y_pred))
