In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import nltk
from nltk.stem.porter import PorterStemmer

In [2]:
df = pd.read_csv(r"D:\Data Science\FakeNewsDetection\data\train.csv")
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


Number of records

In [3]:
df.size

104000


Check for Null values

In [4]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

Unique authors

#### Handling Null values
<br> We cannot use any imputation technique as this is all text data and filling in missing title or author name would be a new problem statement in its own
<br> If we calculate, we have less than 2% of the data missing and computing Author name and Title would not be a good choice, we can simply drop these rows

In [5]:
df.dropna(inplace= True)

In [6]:
df.size

91425

In [7]:
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

### Checking for Outliers

Outliers in this case can be any article that is very small, we can check the average length of articles and check which are

Check number of unique users

In [8]:
df['author'].nunique()

3838

We have multiple articles from same author. <br>
There could be a case that articles by some authors have higer propablity of being unreliable/reliable 

### Pre-processing the "text" data

1. Apply Word Tokenize
2. Remove Punctuation's and Apostrophe.
3. Remove stopwords from the 'text' column for each row
4. Changing all words to lower case (so that algorithm does not consider the same word with different casing as different words.)
5. Apply Stemming (Alternatively we can apply Lemmatisation)
Additionally we can also consider converting Numbers to words.

In [43]:
x_test = df['text']

In [44]:
x_test.head()

0    House Dem Aide: We Didn’t Even See Comey’s Let...
1    Ever get the feeling your life circles the rou...
2    Why the Truth Might Get You Fired October 29, ...
3    Videos 15 Civilians Killed In Single US Airstr...
4    Print \nAn Iranian woman has been sentenced to...
Name: text, dtype: object

In [9]:
X = df['text']
y = df['label']

In [10]:
#from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import re
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aashish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
words_to_remove = set(stopwords.words('english'))

In [12]:
#type(words_to_remove)

In [13]:
corpus = []
ps = PorterStemmer()

In [14]:
for i in range(0, len(X)):
    words = re.sub('[^a-zA-Z]', ' ', X.iloc[i])
    words = word_tokenize(words)
    words = [word for word in words if word not in words_to_remove]
    words = [ps.stem(word).lower() for word in words]
    words = ' '.join(words)
    corpus.append(words)

In [15]:
corpus[10:12]

['the mysteri surround the third reich nazi germani still subject debat mani observ some believ nazi germani control adolf hitler possess supernatur power larg employ pseudo scienc period howev also hold belief mere specul without proven fact over year research search extens answer mysteri activ associ nazi germani nazi germani invad russia formerli ussr second world war june at time german armi progress deep russian territori gain ground close capit moscow russian could counter attack eventu drive nazi back dure nazi occup russia nazi built secret militari base around arctic code name schatzgrab treasur hunter reportedli instrument war russia the base primarili use tactic weather station plan strateg movement nazi troop warship submarin the base also hous emin nazi scientist conduct mani experi help progress german win war it wide specul time nazi use base contact alien extraterrestri be the controversi ahnenerb even link base the ahnenerb institut nazi germani respons research archae

Following are the options to encode the text data to numeric data
1. One-Hot encoding - Not good for the purpose, as the number of occurrences and context of occurrence will be lost in this case. (information loss)
2. Count Vector Encoding- We will have the advantage of having a number of occurrences
3. TF-IDF - We will have the count for the occurrence of the word in each article vs the total occurrence of the word. 
4. Co-occurrence - Building a co-occurrence matrix to see the words occurring together.
5. Word embedding- This is great when we want our machine to understand the context
<br>We can use Count Vector in this case as we just need to check the occurrence of the words in the document, we not need the model to be complex.

### Applying CountVectorizer

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
vectorizer = CountVectorizer(max_features = 1500)

In [18]:
X = vectorizer.fit_transform(corpus)

In [19]:
X = X.toarray()

In [20]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]], dtype=int64)

Test Train Split

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [22]:
y_train.shape, X_train.shape

((14628,), (14628, 1500))

### Selecting Model for Training

This is a Binary Classification problem, we can using Voting Classifier with multiple models that are used for Binary classification

In [23]:
from sklearn import model_selection 
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

In [24]:
svc_clf = SVC()
log_clf = LogisticRegression()
knn_clf = KNeighborsClassifier(6)

We can additionally add a step to perform hyper-parameter tunning using GridSearchCV

In [25]:
voting_clf_model = VotingClassifier(estimators=[('svc',svc_clf),('log',log_clf),('knn',knn_clf)])

In [26]:
voting_clf_model = voting_clf_model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
y_pred = voting_clf_model.predict(X_test)

In [28]:
from sklearn.metrics import recall_score, precision_score

In [29]:
model_precision_score = precision_score(y_test, y_pred)
model_recall_score = recall_score(y_test, y_pred)

In [30]:
model_precision_score

0.9153567110036276

In [31]:
model_recall_score

0.9363017934446506

### Import actual Test Data

In [35]:
new_test_data = pd.read_csv(r"D:\Data Science\FakeNewsDetection\data\test.csv")

In [36]:
new_test_data.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [62]:
new_test_data['text'] = new_test_data['text'].astype(str)

We need to apply the same the same pre-processing to the Test data set

In [37]:
X_test1 = new_test_data['text']

In [38]:
type(X_test1)

pandas.core.series.Series

In [39]:
X_test1.head()

0    PALO ALTO, Calif.  —   After years of scorning...
1    Russian warships ready to strike terrorists ne...
2    Videos #NoDAPL: Native American Leaders Vow to...
3    If at first you don’t succeed, try a different...
4    42 mins ago 1 Views 0 Comments 0 Likes 'For th...
Name: text, dtype: object

In [40]:
#X_test1['text'] = X_test1['text'].astype("string")

In [41]:
test_corpus= []

In [45]:
for i in range(0, len(X_test1)):
    test_words = re.sub('[^a-zA-Z]', ' ', str(X_test1.iloc[i]))
    test_words = word_tokenize(test_words)
    test_words = [word for word in test_words if word not in words_to_remove]
    test_words = [ps.stem(word).lower() for word in test_words]
    test_words = ' '.join(test_words)
    test_corpus.append(test_words)

In [48]:
test_corpus[11:13]

['sourc cnbc articl robert ferri arctic sea ice melt rate far faster anyon thought alreadi wildli perhap the member institut physic climat geoengin scale must consid last resort there lessen attempt otherwis correct harm impact human economi earth ecolog climat is raytheon the weather raytheon corpor third largest weapon manufactur partner haarp raethon also tell weather american meteorolog servic am lead corpor weather modif nano technolog well advanc weather weapon system here raytheon ray gun crowd control weather modif weapon system weather forecast raytheon still think alter weather presid john f kennedi secret societi methan',
 'written shaun bradley mandatori vaccin open new frontier govern control through war drug bureaucrat arbitrarili dictat peopl put bodi violat pale comparison forcibl medic million voluntari inform consent essenti secur individu right without self ownership never respect the liber stronghold california trailblaz encroach new practic recent pass law mandat c

In [49]:
X_test_vector = vectorizer.fit_transform(corpus)

In [51]:
X_test_vector = X_test_vector.toarray()

In [52]:
X_test_vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]], dtype=int64)

In [53]:
y_pred_new = voting_clf_model.predict(X_test_vector)

In [58]:
y_pred_new = pd.DataFrame(y_pred_new)

In [60]:
y_pred_new.shape

(18285, 1)

In [61]:
new_test_data['label'] = y_pred_new
new_test_data.head()

Unnamed: 0,id,title,author,text,label
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",1
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,1
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,1
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",1
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1


In [66]:
new_test_data[new_test_data['label']==0]['id'].count()

2896

In [67]:
new_test_data[new_test_data['label']==1]['id'].count()

2304

In [68]:
df.to_csv("submit.csv", columns=['id','label'])