# Import Libraries

In [57]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, auc, roc_curve
from sklearn.naive_bayes import MultinomialNB

# Load Data

In [58]:
#Read the data
df=pd.read_csv("news.csv", engine="python", error_bad_lines=False)
#Get shape and head
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


# Analyzing Dataset

## max, min, avg number of characters in text instances

In [None]:
max_char_count = 0
min_char_count = 99999999
total = 0
for i in range(len(df)):
  length = len(df.get("text")[i])
  total += length
  if min_char_count > length and length > 1:
    min_wchar_count = length
  elif max_char_count < length:
    max_char_count = length
    
print("minimum char count is " + str(min_char_count) + "\n",
      "maximum char count is " + str(max_char_count) + "\n",
      "avg char count is " + str(int(total / len(df))))

minimum char count is 99999999
 maximum char count is 1
 avg char count is 5062


## max, min, avg number of words in text instances

In [None]:
max_word_count = 0
min_word_count = 99999999
total = 0
count = df['text'].str.split().str.len()
# count.index = count.index.astype(str) + ' words:'
# count.sort_index(inplace=True)
for index in count:
  total += index
  if index > max_word_count:
    max_word_count = index
  elif index < min_word_count and index > 0:
    min_word_count = index

print("minimum word count is " + str(min_word_count) + "\n",
      "maximum word count is " + str(max_word_count) + "\n",
      "avg word count is " + str(int(total / len(df))))

minimum word count is 6
 maximum word count is 7503
 avg word count is 835


# Data Preprocessing

## get the labels

In [59]:
labels=df.label
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

## Split Dataset Into Training and Test Set

In [60]:
#Split the dataset
x_train,x_test,y_train,y_test=train_test_split(df['text'], labels, test_size=0.2, random_state=7)

## Feature Selection

### Tfidf vectorizer

In [61]:
#Initialize a TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7, max_features=500)

#Fit and transform train set, transform test set
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

### count vectorizer

In [62]:
count_vec = CountVectorizer(binary=True, stop_words="english", max_features=1000)
count_vec_train = count_vec.fit_transform(x_train)
count_vec_test = count_vec.fit_transform(x_test)

# Train and Test

## Passive Aggresive Classifier

In [66]:
#Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)
#Predict on the test set and calculate accuracy
y_pred=pac.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 88.87%


In [67]:
#Confusion Matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])


array([[576,  62],
       [ 79, 550]])

In [69]:
f1_score(y_test,y_pred, average="macro")

0.8886666130711891

In [70]:
f1_score(y_test,y_pred, average="micro")

0.8887134964483031

In [71]:
#Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(count_vec_train,y_train)
#Predict on the test set and calculate accuracy
y_pred=pac.predict(count_vec_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 54.7%




In [72]:
#Confusion Matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

array([[322, 316],
       [258, 371]])

In [73]:
f1_score(y_test,y_pred, average="macro")

0.5462827097089753

In [74]:
f1_score(y_test,y_pred, average="micro")

0.5469613259668509

## Multinominal

In [75]:
mn_clf = MultinomialNB(alpha=0.1)
mn_clf.fit(tfidf_train,y_train)
y_pred = mn_clf.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 84.29%


In [76]:
#Confusion Matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

array([[552,  86],
       [113, 516]])

In [77]:
f1_score(y_test,y_pred, average="macro")

0.8428091643791462

In [78]:
f1_score(y_test,y_pred, average="micro")

0.8429360694554064

In [80]:
mn_clf = MultinomialNB(alpha=0.1)
mn_clf.fit(count_vec_train,y_train)
y_pred = mn_clf.predict(count_vec_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 59.51%


In [81]:
#Confusion Matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

array([[250, 388],
       [125, 504]])

In [82]:
f1_score(y_test,y_pred, average="macro")

0.5781526545441801

In [83]:
f1_score(y_test,y_pred, average="micro")

0.5951065509076559

## SGD classifier

In [84]:
sgd_tfidf_clf = SGDClassifier()
sgd_tfidf_clf.fit(tfidf_train, y_train)
y_pred = sgd_tfidf_clf.predict(tfidf_test)
score = accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 89.74%


In [85]:
#Confusion Matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

array([[567,  71],
       [ 59, 570]])

In [86]:
f1_score(y_test,y_pred, average="macro")

0.8973948470048839

In [87]:
f1_score(y_test,y_pred, average="micro")

0.8973954222573007

In [88]:
sgd_tfidf_clf = SGDClassifier()
sgd_tfidf_clf.fit(count_vec_train, y_train)
y_pred = sgd_tfidf_clf.predict(count_vec_test)
score = accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 52.41%


In [89]:
#Confusion Matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

array([[314, 324],
       [279, 350]])

In [90]:
f1_score(y_test,y_pred, average="macro")

0.5236880709579157

In [91]:
f1_score(y_test,y_pred, average="micro")

0.5240726124704025

In [None]:
sk