In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as pl

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.preprocessing import StandardScaler

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [4]:
df = pd.read_csv('../Data/Kaggle Dataset/fake_news.csv')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [6]:
len(df['text'][0])

7518

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [8]:
X = df.drop('label', axis=1)
X['content'] = X['title'] + X['text']
X.head()

Unnamed: 0.1,Unnamed: 0,title,text,content
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...","You Can Smell Hillary’s FearDaniel Greenfield,..."
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,Watch The Exact Moment Paul Ryan Committed Pol...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,Kerry to go to Paris in gesture of sympathyU.S...
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",Bernie supporters on Twitter erupt in anger ag...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,The Battle of New York: Why This Primary Matte...


In [9]:
y = df['label'].apply(lambda x: 1 if x=='FAKE' else 0)
y.head()

0    1
1    1
2    0
3    1
4    0
Name: label, dtype: int64

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X['content'], y, test_size=0.2, random_state=101)

In [11]:
# Let us now try to use count vectorizer as a feature extraction method to do the classification

In [12]:
vect= CountVectorizer(stop_words='english',max_features=100)
x_train_count = vect.fit_transform(X_train)
x_test_count = vect.fit_transform(X_test)

In [13]:
tfidf_vect = TfidfVectorizer(stop_words='english',max_features=10000)
x_train_tf = tfidf_vect.fit_transform(X_train)
x_test_tf = tfidf_vect.fit_transform(X_test)

In [14]:
import scipy.sparse as sp
train_data = sp.hstack((x_train_count,x_train_tf))
test_data = sp.hstack((x_test_count,x_test_tf))

In [18]:
# Let us apply now different classification algorithms

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from datetime import datetime

In [19]:
models = {'Logistic Regression':LogisticRegression(), 
         'Support Vector Classifier':SVC(),'Decision Tree':DecisionTreeClassifier(),
         'Random Forest':RandomForestClassifier(),'Neural Network':MLPClassifier(),
         'Stochastic Gradient Descent':SGDClassifier(), 'KNN':KNeighborsClassifier(),
         'Naive Bayes':GaussianNB()}

for model, algorithm in models.items():
    start_time = datetime.now() 
    pipe = Pipeline([('model', algorithm)])
    pipe.fit(train_data ,y_train)
    end_time = datetime.now() 
    prediction = pipe.predict(test_data)
    print("======= For {} ============".format(model))
    print('Accuracy Score : {} '.format(accuracy_score(prediction,y_test)))
    print('Confusion Matrix \n\n  ',confusion_matrix(prediction,y_test))
    print('\n Classification Report \n ')
    print(classification_report(prediction,y_test))
    time_difference = (end_time - start_time).total_seconds() * 10**3
    print("Execution time of program is: ", time_difference, "ms")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy Score : 0.5753749013417522 
Confusion Matrix 

   [[144  29]
 [509 585]]

 Classification Report 
 
              precision    recall  f1-score   support

           0       0.22      0.83      0.35       173
           1       0.95      0.53      0.69      1094

    accuracy                           0.58      1267
   macro avg       0.59      0.68      0.52      1267
weighted avg       0.85      0.58      0.64      1267

Execution time of program is:  866.058 ms
Accuracy Score : 0.6835043409629045 
Confusion Matrix 

   [[322  70]
 [331 544]]

 Classification Report 
 
              precision    recall  f1-score   support

           0       0.49      0.82      0.62       392
           1       0.89      0.62      0.73       875

    accuracy                           0.68      1267
   macro avg       0.69      0.72      0.67      1267
weighted avg       0.76      0.68      0.70      1267

Execution time of program is:  30814.726 ms
Accuracy Score : 0.6266771902131019 
Confu

KeyboardInterrupt: 