In [68]:
import os
from random import random
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib

### Load sample from FakeNewsCorpus

In [69]:
# only read in 20k rows at this point
#df = pd.read_csv("news_cleaned_2018_02_13.csv", nrows=20000)
df = pd.read_csv("news_cleaned_2018_02_13.csv", skiprows=lambda x: x>0 and random() > 0.01)
#df = pd.read_csv("news_cleaned_sample50.csv")
df.info()
df['type'].unique().tolist()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84972 entries, 0 to 84971
Data columns (total 17 columns):
Unnamed: 0          84972 non-null object
id                  84972 non-null object
domain              84972 non-null object
type                80912 non-null object
url                 84972 non-null object
content             84971 non-null object
scraped_at          84971 non-null object
inserted_at         84972 non-null object
updated_at          84972 non-null object
title               84235 non-null object
authors             46987 non-null object
keywords            0 non-null float64
meta_keywords       81585 non-null object
meta_description    40572 non-null object
tags                19511 non-null object
summary             0 non-null float64
source              18757 non-null object
dtypes: float64(2), object(15)
memory usage: 11.0+ MB


  interactivity=interactivity, compiler=compiler, result=result)


['hate',
 'fake',
 'reliable',
 'political',
 'conspiracy',
 'satire',
 'junksci',
 'bias',
 nan,
 'unknown',
 'unreliable',
 'rumor',
 'clickbait',
 '2018-02-07 23:39:33.852696']

In [70]:
#df.describe()
df['type'].describe()

count        80912
unique          13
top       reliable
freq         19064
Name: type, dtype: object

In [71]:
# filter rows where 'type' == 'fake'
df[ df['type'] == 'fake' ].describe()

Unnamed: 0,keywords,summary
count,0.0,0.0
mean,,
std,,
min,,
25%,,
50%,,
75%,,
max,,


In [72]:
# filter rows where 'type' == 'reliable'
df[ df['type'] == 'reliable' ].describe()

Unnamed: 0,keywords,summary
count,0.0,0.0
mean,,
std,,
min,,
25%,,
50%,,
75%,,
max,,


In [74]:
# create 2 dfs from filtered df
bindf1 = df[df['type'] == 'fake']
bindf2 = df[df['type'] == 'reliable']
# then concat into single df
bindf = pd.concat([bindf1, bindf2], ignore_index=True)
bindf.head()

Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0,147,173,beforeitsnews.com,fake,http://beforeitsnews.com/prophecy/2018/01/larg...,Obama’s The Greatest Criminal In History Say T...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Largest Turnover in Midterm History Mark Taylo...,,,[''],,,,
1,1693,1936,beforeitsnews.com,fake,http://beforeitsnews.com/self-sufficiency/2018...,Slow Cooker Recipe: Pesto Mozzarella Chicken P...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Slow Cooker Recipe: Pesto Mozzarella Chicken P...,The Berkey Guy Blog,,[''],,,,
2,1840,2094,beforeitsnews.com,fake,http://beforeitsnews.com/alternative/2018/01/c...,Proximity also encompasses temporality and is ...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Conviviality — Fear and Prejudice,Human Wrongs Watch,,[''],,,,
3,2331,2669,beforeitsnews.com,fake,http://beforeitsnews.com/opinion-conservative/...,Muslim Newsweek Editor says sexual abuse of ch...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Muslim Newsweek Editor says sexual abuse of ch...,Atlas Shrugs,,[''],,,,
4,2760,3140,beforeitsnews.com,fake,http://beforeitsnews.com/watercooler-topics/20...,German Schools Use Sand-Filled Vests to Calm D...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,German Schools Use Sand-Filled Vests to Calm D...,Oddity Central,,[''],,,,


### Train-test split

In [75]:
X_train, X_test, y_train, y_test = train_test_split(bindf.content, bindf.type, test_size=.2)

In [76]:
y_train.head()

15761    reliable
2203         fake
1615         fake
22477    reliable
9338     reliable
Name: type, dtype: object

In [77]:
X_train.head()

15761    Whether the increase was small or large, the c...
2203     (Before It's News)\n\nPhoto is courtesy of Gag...
1615     (GST, CRWE, FPP, GMO) Stock Report from Stock-...
22477    Opinion » Should Beach Privatization Be Allowe...
9338     Healthcare | Sun Nov 20, 2016 | 10:28pm EST BR...
Name: content, dtype: object

### Feature extraction + classification pipeline

In [78]:
# construct model from pipeline
lin_svc = Pipeline([
    ('vect', TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')),
    ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))), 
    ('classification', LinearSVC(penalty="l2"))
    ])

In [79]:
# fit model
clf_fake_reliable = lin_svc.fit(X_train, y_train)

### Evaluate classifier performance

In [80]:
from sklearn import metrics

In [81]:
# testing predictions
import numpy as np
predicted = clf_fake_reliable.predict(X_test)
print(f"Prediction accuracy: {np.mean(predicted == y_test)}")

Prediction accuracy: 0.951293487957181


In [82]:
print(metrics.classification_report(y_test, predicted, target_names=list(y_test.unique())))

             precision    recall  f1-score   support

   reliable       0.94      0.90      0.92      1787
       fake       0.95      0.98      0.96      3818

avg / total       0.95      0.95      0.95      5605



In [83]:
metrics.confusion_matrix(y_test, predicted)

array([[1608,  179],
       [  94, 3724]])

### Pickle classifier for easy future loading

In [84]:
# train & pickle classifiers
print('Training classifier')
#clf_tidy_PI = train_classifier(tidy_PI)
print('Pickling...')
joblib.dump(clf_fake_reliable, 'clf_fake_reliable_rr.pkl')

Training classifier
Pickling...


['clf_fake_reliable_rr.pkl']