## HEADLINER: Real or Fake News?

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the two "real news" exports and stack them into a single labelled frame.
real1 = pd.read_csv('./real-news/realnews4.csv')
real2 = pd.read_csv('./real-news/realnews5.csv')
real = pd.concat([real1, real2])
real['class'] = 'real'
print('The shape of the concatenated real file is', real.shape)

# Keep one row per distinct, non-null article text.
real = real.drop_duplicates(subset=['text'], keep='first')
real = real.dropna(subset=['text'])
real = real.reset_index(drop=True)
print('After dropping null text and duplicate text, the shape of the concatenated real file is', real.shape)

# A row is kept when its `site` value contains at least one of these substrings.
# ('money.cnn' is already covered by 'cnn'; it is kept so the list stays as authored.)
real_sites = ['cnn', 'politico', 'abcnews.go', 'google', 'bbc.com', 'economist', 'nytimes', 'pbs', 'cbs', 'nbcnews', 'bloomberg', 'npr', 'c-span', 'independent', 'apnews', 'thehill', 'fivethirtyeight', 'forbes', 'money.cnn']

# Missing sites are read as NaN (a float), and `substring in NaN` raises TypeError.
# Treat them as an empty string so they simply fail the credibility test.
site_names = real['site'].fillna('')
is_credible = site_names.map(
    lambda site: any(substring in site for substring in real_sites)
).astype(bool)

real = real[is_credible].reset_index(drop=True)
print('After keeping rows with credible site urls, the shape of the real file is', real.shape )
real.head(2)

In [None]:
# Read the fake-news file and report its size before any cleaning.
fake = pd.read_csv('../Data_sets/fake.csv')
print('The shape of the data is', fake.shape)

# Label every row, then keep one row per distinct, non-null article text.
fake = (
    fake
    .assign(**{'class': 'fake'})
    .drop_duplicates(subset=['text'], keep='first')
    .dropna(subset=['text'])
    .reset_index(drop=True)
)
print('After dropping null text and duplicate text, the shape of the fake file is', fake.shape)
fake.head(2)

In [None]:
# Select the columns the two sources have in common. The fake-news file names two
# of them differently ("site_url", "main_img_url"), so those are renamed to match.
real_subset = real.loc[:, ["author", "title", "published", "site", "text", "main_image", "shares", "ord_in_thread", "spam_score", "participants_count", "replies_count", "likes", "class"]]
fake_subset = fake.loc[:, ["author", "title", "published", "site_url", "text", "main_img_url", "shares", "ord_in_thread", "spam_score", "participants_count", "replies_count", "likes", "class"]]
fake_subset = fake_subset.rename(columns={"site_url": "site", "main_img_url": "main_image"})

# ignore_index gives the combined frame a fresh 0..n-1 index. Without it the two
# sources' row labels collide, and the previous `rename(index=str)` additionally
# turned the fake labels into strings, leaving a mixed-type, non-unique index.
data = pd.concat([real_subset, fake_subset], ignore_index=True)
print('The shape of the concatenated dataset is', data.shape)

In [None]:
from itertools import chain  # no longer used in this cell; kept in case other cells rely on it


def to_ascii(raw):
    """Return `raw` as text with every non-ASCII character removed.

    Accepts either byte strings (what Python 2 `str` is) or text strings.
    Bytes are decoded as UTF-8 first, skipping undecodable sequences, so a
    single malformed article cannot abort the whole clean-up.
    """
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8', 'ignore')
    # Round-trip through ASCII: encode drops non-ASCII characters, decode
    # returns a text value rather than a bytes object.
    return raw.encode('ascii', 'ignore').decode('ascii')


# Replace the text column with its ASCII-only version, row for row.
data['text'] = [to_ascii(article) for article in data['text']]

In [None]:
# Persist the combined dataset: the vocabulary section below reloads it from
# './real_fake_data.csv'. The file is only written when it does not exist yet,
# so an export that is already on disk is never overwritten.
import os

if not os.path.exists('./real_fake_data.csv'):
    data.to_csv('./real_fake_data.csv', index=False)

### Create vocabulary lists 
Create lists of the top 1000 and top 7000 words, ranked by mean TF-IDF score, for the fake and the real news text.

In [1]:
import pandas as pd
import numpy as np

# Reload the combined dataset saved earlier and discard rows without article text.
data = (
    pd.read_csv('./real_fake_data.csv')
    .dropna(subset=['text'])
)

IOError: File ../Data_sets/real_fake_data.csv does not exist

In [None]:
# Article texts of each class as plain Python lists.
real_data = data.loc[data['class'] == 'real', 'text'].tolist()
fake_data = data.loc[data['class'] == 'fake', 'text'].tolist()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

def parse_top_vocab(data):
    """Rank the terms of a corpus by mean TF-IDF and return the leading ones.

    Parameters
    ----------
    data : iterable of str
        One document per element.

    Returns
    -------
    (top_1000, top_7000) : tuple of two lists of str
        The terms with the highest mean TF-IDF score across all documents,
        best first. Each list is shorter than its nominal size when the corpus
        has fewer usable terms. Terms are ASCII-only, non-empty and unique, so
        either list can be passed directly as `CountVectorizer(vocabulary=...)`.
    """
    count_vectorizer = CountVectorizer(stop_words='english', strip_accents='unicode', decode_error='ignore')
    term_counts = count_vectorizer.fit_transform(data)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when the installed version provides it.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        term_names = count_vectorizer.get_feature_names_out()
    else:
        term_names = count_vectorizer.get_feature_names()

    term_tfidf = TfidfTransformer().fit_transform(term_counts)
    mean_scores = np.asarray(term_tfidf.mean(axis=0)).ravel()

    # Highest mean score first; the sort is stable, so ties keep vectorizer order.
    ranked = sorted(zip(term_names, mean_scores), key=lambda pair: -pair[1])

    top_terms = []
    seen = set()
    for name, _score in ranked:
        # Strip non-ASCII characters but keep a text value: a bytes term would
        # never match the text tokens the vectorizer produces.
        term = name.encode('ascii', 'ignore').decode('ascii')
        # Stripping can leave a term empty or equal to an earlier one; a
        # vocabulary with duplicate entries is rejected by CountVectorizer.
        if not term or term in seen:
            continue
        seen.add(term)
        top_terms.append(term)
        if len(top_terms) == 7000:
            break

    return (top_terms[:1000], top_terms[:7000])

In [None]:
# Build the top-1000 / top-7000 term lists for each class: real first, then fake.
(real_vocab_1000, real_vocab_7000), (fake_vocab_1000, fake_vocab_7000) = (
    parse_top_vocab(documents) for documents in (real_data, fake_data)
)

### Training Naive Bayes Classifier using fake_vocab_7000 list

In [None]:
# Keep every real article and a fixed-seed random sample of 3000 fake ones,
# then renumber the rows of the combined frame from zero.
real_data = data.loc[data['class'] == 'real']
fake_data = data.loc[data['class'] == 'fake'].sample(n=3000, random_state=0)
data = pd.concat([real_data, fake_data], ignore_index=True)
data.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a quarter of the articles for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['class'],
    random_state=0,
    test_size=0.25,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Term counts for the training articles, restricted to the terms in fake_vocab_7000.
train_count_vectorizer = CountVectorizer(
    vocabulary=fake_vocab_7000,
    stop_words='english',
    strip_accents='unicode',
    decode_error='ignore',
)
train_documents = X_train.values
train_term_counts = train_count_vectorizer.fit_transform(train_documents)
train_term_counts.shape

In [None]:
# scikit-learn removed its bundled copy of joblib (`sklearn.externals.joblib`) in
# version 0.23. Prefer the standalone package and fall back to the bundled copy
# on older installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the fitted count vectorizer to disk.
joblib.dump(train_count_vectorizer, 'vectorizer_final.pkl')

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Learn IDF weights from the training counts, then re-weight those same counts.
train_tfidf = TfidfTransformer()
train_tfidf.fit(train_term_counts)
train_tfidf_output = train_tfidf.transform(train_term_counts)
train_tfidf_output.shape

In [None]:
# Featurise the held-out articles with the transformers fitted on the training split.
# Fitting a fresh TfidfTransformer on the test counts (as was done before) weights
# the test features with IDF values learned from the test set, which differ from
# the weights the classifier is trained on.
#
# `test_count_vectorizer` and `test_tfidf` are kept as aliases of the training
# objects so any later code that refers to those names still works.
test_count_vectorizer = train_count_vectorizer
test_term_counts = test_count_vectorizer.transform(X_test.values)

test_tfidf = train_tfidf
test_tfidf_output = test_tfidf.transform(test_term_counts)
test_tfidf_output.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes model on the training TF-IDF features,
# then label the held-out articles.
clf = MultinomialNB(alpha=1.0)
clf.fit(train_tfidf_output, y_train.values)
predicted = clf.predict(test_tfidf_output)

## Metrics and Validation

In [None]:
from sklearn import metrics
# Share of held-out articles whose predicted class equals the true class.
metrics.accuracy_score(y_true=y_test.values, y_pred=predicted)

In [None]:
# Per-article class probabilities for the held-out set.
# The column labels are built from clf.classes_, which lists the classes in the
# same order as predict_proba's columns. For the 'fake'/'real' labels used here
# this yields 'prob_fake' and 'prob_real', but a label can no longer end up on
# the wrong column if the class names ever change.
probs = pd.DataFrame(
    clf.predict_proba(test_tfidf_output),
    columns=['prob_{}'.format(label) for label in clf.classes_],
    index=X_test.index,
)
probs['predicted'] = predicted
# Despite its name, the 'real' column holds the true class of each article.
probs['real'] = y_test.values
probs.head()

In [None]:
# scikit-learn removed its bundled copy of joblib (`sklearn.externals.joblib`) in
# version 0.23. Prefer the standalone package and fall back to the bundled copy
# on older installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the trained classifier to disk.
joblib.dump(clf, 'mnnb_model_final.pkl')