In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the first 500 rows of WELFake_Dataset.csv and drop every row that has a
# missing value in any column.  `nrows=500` makes the parser stop after 500
# rows instead of reading the whole file and discarding the rest afterwards.
df = pd.read_csv('WELFake_Dataset.csv', nrows=500).dropna()
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


In [3]:
# Select the title and text columns, then preview the first five rows.
text_columns = ['title', 'text']
features = df.loc[:, text_columns]
features.head()

Unnamed: 0,title,text
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ..."
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will..."
5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...


In [4]:
# Pull the label column out as its own Series and preview the first five rows.
label = df.loc[:, 'label']
label.head()

0    1
2    1
3    0
4    1
5    1
Name: label, dtype: int64

In [5]:
# Build one document per row: "<title> <text>", joined by a single space.
# The two columns are combined with vectorised string operations instead of a
# Python loop doing one `.iloc` lookup per row.
# They are read from `df` rather than `features` because `features` is rebound
# to the bigram count array in a later cell, which would make this cell fail
# if it were re-run after that point.
headlines = (df['title'].astype(str) + ' ' + df['text'].astype(str)).tolist()
headlines[1]

'UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MOST CHARLOTTE RIOTERS WERE “PEACEFUL” PROTESTERS…In Her Home State Of North Carolina [VIDEO]  Now, most of the demonstrators gathered last night were exercising their constitutional and protected right to peaceful protest in order to raise issues and create change.    Loretta Lynch aka Eric Holder in a skirt'

In [6]:
# Normalise every document: replace anything that is not an ASCII letter with
# a space, lowercase, tokenize, drop English stop words, lemmatize the
# remaining tokens and join them back into one space-separated string.
lm = WordNetLemmatizer()

# Build the stop-word set once.  Previously stopwords.words('english') was
# evaluated for every single token inside the comprehension, and membership
# was tested against the returned list (a linear scan) each time.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []
for headline in headlines:
    r = non_letters.sub(' ', headline).lower()
    words = [lm.lemmatize(word) for word in word_tokenize(r) if word not in stop_words]
    corpus.append(' '.join(words))

# Downstream cells read the cleaned text from `headlines`.
headlines = corpus
headlines[1]

'unbelievable obama attorney general say charlotte rioter peaceful protester home state north carolina video demonstrator gathered last night exercising constitutional protected right peaceful protest order raise issue create change loretta lynch aka eric holder skirt'

In [7]:
# Count bigrams only (ngram_range=(2, 2)) over the cleaned documents, then
# convert the vectorizer's output to a dense array.
cv = CountVectorizer(ngram_range=(2, 2))
bigram_counts = cv.fit_transform(headlines)
features = bigram_counts.toarray()
features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [8]:
from sklearn.model_selection import train_test_split

# Split the count matrix and labels with a fixed seed.
# NOTE(review): test_size=0.8 holds out 80% of the rows for evaluation and
# leaves only 20% for training — confirm this is intended (0.2 is the more
# common choice).
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    label,
    test_size=0.8,
    random_state=1,
)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

# Fit three classifiers on the same training split so they can be compared.
model_1 = MultinomialNB()
model_1.fit(X_train, Y_train)

model_2 = LogisticRegression()
model_2.fit(X_train, Y_train)

# Seed the tree: without random_state scikit-learn permutes the candidate
# features at every split, so equally good splits can be chosen differently
# from run to run and the reported metrics are not reproducible.
model_3 = DecisionTreeClassifier(random_state=1)
model_3.fit(X_train, Y_train)

In [10]:
# Predict on the test split with each fitted classifier, in model order.
pred_1, pred_2, pred_3 = [
    clf.predict(X_test) for clf in (model_1, model_2, model_3)
]

In [11]:
import sklearn.metrics

# Print one confusion matrix per classifier (rows: true label, columns:
# predicted label), each preceded by the classifier's heading.
for heading, predicted in (
    ("Naive Bayes:\n ", pred_1),
    ("Logistic Regression:\n ", pred_2),
    ("Decision Tree:\n ", pred_3),
):
    print(heading, sklearn.metrics.confusion_matrix(Y_test, predicted))

Naive Bayes:
  [[163  21]
 [ 94 117]]
Logistic Regression:
  [[ 22 162]
 [  5 206]]
Decision Tree:
  [[ 90  94]
 [ 26 185]]


In [12]:
# Print the test-split accuracy of each classifier, one per line.
for heading, predicted in (
    ("Naive Bayes: ", pred_1),
    ("Logistic Regression: ", pred_2),
    ("Decision Tree: ", pred_3),
):
    print(heading, sklearn.metrics.accuracy_score(Y_test, predicted))

Naive Bayes:  0.7088607594936709
Logistic Regression:  0.5772151898734177
Decision Tree:  0.6962025316455697
