In [17]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


In [18]:
fake=pd.read_csv('train.csv')
true=pd.read_csv('test.csv')

In [19]:
fake.shape


(10240, 2)

In [20]:
true.shape


(2551, 2)

In [21]:
# Add flag to track fake and real
fake['target'] = 'fake'
true['target'] = 'true'


In [22]:
# Concatenate dataframes
data = pd.concat([fake, true]).reset_index(drop = True)
data.shape


(12791, 3)

In [23]:
# Shuffle the data
from sklearn.utils import shuffle
data = shuffle(data)
data = data.reset_index(drop=True)

In [24]:
# Check the data
data.head()

Unnamed: 0,Statement,Label,target
0,While CEOs of some large charities have big sa...,False,fake
1,Says Russ Feingolds campaign is powered by sma...,True,fake
2,"If black America were a country, itd be the 15...",False,fake
3,Florida's legal challenge of the federal healt...,False,fake
4,Josh Mandel is lying about his own record. The...,False,fake


In [25]:
# Convert to lowercase

data['Statement'] = data['Statement'].apply(lambda x: x.lower())
data.head()

Unnamed: 0,Statement,Label,target
0,while ceos of some large charities have big sa...,False,fake
1,says russ feingolds campaign is powered by sma...,True,fake
2,"if black america were a country, itd be the 15...",False,fake
3,florida's legal challenge of the federal healt...,False,fake
4,josh mandel is lying about his own record. the...,False,fake


In [26]:

# Remove punctuation

import string

def punctuation_removal(text):
    all_list = [char for char in text if char not in string.punctuation]
    clean_str = ''.join(all_list)
    return clean_str

data['Statement'] = data['Statement'].apply(punctuation_removal)

In [27]:
# Check
data.head()

Unnamed: 0,Statement,Label,target
0,while ceos of some large charities have big sa...,False,fake
1,says russ feingolds campaign is powered by sma...,True,fake
2,if black america were a country itd be the 15t...,False,fake
3,floridas legal challenge of the federal health...,False,fake
4,josh mandel is lying about his own record the ...,False,fake


In [29]:
# Removing stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

data['Statement'] = data['Statement'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asima\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [30]:

data.head()

Unnamed: 0,Statement,Label,target
0,ceos large charities big salaries heads vetera...,False,fake
1,says russ feingolds campaign powered smalldoll...,True,fake
2,black america country itd 15th wealthiest nati...,False,fake
3,floridas legal challenge federal health care b...,False,fake
4,josh mandel lying record rating ohios investme...,False,fake


In [None]:
# How many fake and real articles?
print(data.groupby(['target'])['Statement'].count())
data.groupby(['target'])['Statement'].count().plot(kind="bar")
plt.show()

In [38]:
# Split the data
X_train,X_test,y_train,y_test = train_test_split(data['Statement'], data.target, test_size=0.2, random_state=42)

In [39]:
dct = dict()

from sklearn.naive_bayes import MultinomialNB

NB_classifier = MultinomialNB()
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', NB_classifier)])

model = pipe.fit(X_train, y_train)
prediction = model.predict(X_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

dct['Naive Bayes'] = round(accuracy_score(y_test, prediction)*100,2)

accuracy: 79.33%


In [40]:
# Vectorizing and applying TF-IDF
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', LogisticRegression())])

# Fitting the model
model = pipe.fit(X_train, y_train)

# Accuracy
prediction = model.predict(X_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
dct['Logistic Regression'] = round(accuracy_score(y_test, prediction)*100,2)

accuracy: 79.29%


In [49]:
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', clf)])

model = pipe.fit(X_train, y_train)
prediction = model.predict(X_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
dct['SVM'] = round(accuracy_score(y_test, prediction)*100,2)

accuracy: 79.25%
