In [None]:
import itertools
import os
import pickle

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import porter
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from sklearn import metrics
from sklearn.decomposition import TruncatedSVD
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import Normalizer

# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and only fall back to the old module on very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
%matplotlib inline
plt.style.use('seaborn')

## Loading from MongoDB

In [None]:
#Launches Mongo Client
from pymongo import MongoClient

# MongoClient() is created with no arguments, i.e. pymongo's default connection
# settings; then look up the `biased_news` collection of the `events` database.
client = MongoClient()
biased_news = client["events"]["biased_news"]

In [None]:
#Create news bias events
# Handle on the `events` database, and on the `biased_news` collection inside it.
db = client["events"]
biased_news = db["biased_news"]

In [None]:
#check duplicate articles
# Group documents by title, count each group, and keep only the titles
# that occur at least twice.
duplicate_title_pipeline = [
    {'$group': {'_id': '$title', 'count': {'$sum': 1}}},
    {'$match': {'count': {'$gte': 2}}},
]
list(biased_news.aggregate(duplicate_title_pipeline))

In [None]:
#Create cursor
# find() is called without a filter, so the cursor ranges over every document
# in the `biased_news` collection.
cursor = biased_news.find()

In [None]:
#Loads from Mongo
# A pymongo cursor can only be iterated once, so consuming `cursor` directly made a
# re-run of this cell build an empty frame. clone() gives a fresh, unevaluated cursor
# with the same query each time, which keeps the cell re-runnable.
true_df = pd.DataFrame(list(cursor.clone()))

In [None]:
#Delete columns and drop null values
# drop(..., errors='ignore') returns a new frame and tolerates columns that are already
# gone, so the cell can be re-run; the previous `del true_df[...]` mutated the frame in
# place and raised KeyError on a second run.
true_df = (
    true_df
    .drop(['sub_title', '_id'], axis=1, errors='ignore')
    .dropna(how='any')  # discard any row that still has a missing value
)

In [None]:
#Drops redundant articles: for every repeated title only the first row is kept
# This replaces a loop over `eliminate.index`; no earlier cell defines `eliminate`,
# so the loop raised NameError on a fresh kernel. The loop kept the first row of each
# listed title and dropped the rest, which is what keep='first' does for all titles.
true_df = true_df.drop_duplicates(subset='title', keep='first')

## Creating a corpus

In [None]:
# Creates the collection of stop words
# Import the corpus reader under an alias: the original `stopwords = stopwords.words()`
# rebound the imported name to a list, so re-running the cell failed with
# "'list' object has no attribute 'words'".
from nltk.corpus import stopwords as stopwords_corpus

# words() is called without fileids, which should return the stop words of every
# language shipped with the corpus.
# NOTE(review): confirm all languages are intended rather than words('english').
# A set keeps the `in` checks done by clean_text constant-time.
stopwords = set(stopwords_corpus.words())

In [None]:
def clean_text(text):
    '''Lower-cases every post and removes words found in the global `stopwords`.

    `text` is an iterable of strings. Returns a list with one string per post,
    in the same order, made of the surviving words joined by single spaces.
    '''
    # Stemming is currently disabled (was: stemmer.stem(word.lower())).
    return [
        ' '.join(
            lowered
            for lowered in (token.lower() for token in post.split())
            if lowered not in stopwords
        )
        for post in text
    ]

In [None]:
#Clean the text
# Lower-case each article body and strip stop words; yields one cleaned string
# per entry of true_df.body.
cleaned_text = clean_text(true_df.body)

In [None]:
def drop_nouns(text_list):
    '''Removes nouns and punctuation tokens from each text using NLTK POS tags.

    `text_list` is an iterable of strings. Each one is tokenized and tagged; tokens
    whose tag is a noun tag or one of the listed punctuation/symbol tags are dropped.
    Returns a list with one string per input text: the surviving words, in their
    original order, joined by single spaces.
    '''
    # Tags to discard: the four Penn Treebank noun tags plus punctuation/symbol tags.
    # (The earlier list spelled the plural proper-noun tag 'NNPs', which never
    # matched the tagger's 'NNPS'.) Built once, outside the loop.
    dropped_tags = frozenset(['NN', 'NNS', 'NNP', 'NNPS',
                              ',', '.', ':', '(', ')', '#', '``'])
    no_nouns = []
    for text in text_list:
        tagged = pos_tag(word_tokenize(text))
        # pos_tag yields (word, tag) pairs; keep only the word. Joining the pairs
        # themselves raised "TypeError: sequence item 0: expected str instance".
        kept_words = [word for word, tag in tagged if tag not in dropped_tags]
        no_nouns.append(' '.join(kept_words))
    return no_nouns

In [None]:
#Drops nouns and other terms from the text
# Runs drop_nouns over the stop-word-filtered texts; one filtered string per article.
final_round_clean = drop_nouns(cleaned_text)

In [None]:
#Fits tfidf vectorizer
# Settings: 1- to 4-grams, lower-cased input, scikit-learn's built-in English stop
# word list, and max_df=0.6 to ignore terms found in more than 60% of the documents.
# (disabled alternative: token_pattern="\\b[a-z][a-z]+\\b")
tfidf_settings = dict(
    ngram_range=(1, 4),
    stop_words='english',
    lowercase=True,
    max_df=0.6,
)
tfidf = TfidfVectorizer(**tfidf_settings)
cor_tfidf = tfidf.fit_transform(final_round_clean)

## Dimensionality Reduction

In [None]:
#Fits LSA
# Project the tf-idf matrix onto 140 SVD components using the ARPACK solver, then
# rescale each document vector in place (copy=False) with Normalizer's default norm.
lsa = TruncatedSVD(n_components=140, algorithm='arpack')
corpus_lsa = lsa.fit_transform(cor_tfidf)
row_normalizer = Normalizer(copy=False)
corpus_lsa = row_normalizer.fit_transform(corpus_lsa)

In [None]:
#Check explained variance
# Adds up the per-component explained-variance ratios of the fitted LSA model,
# i.e. the total share of variance the retained components account for.
sum(lsa.explained_variance_ratio_)

In [None]:
#Get terms
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when the installed version has it
# and fall back to the old one otherwise. Either way `terms` is an indexable
# sequence of the vocabulary in column order.
if hasattr(tfidf, 'get_feature_names_out'):
    terms = tfidf.get_feature_names_out()
else:
    terms = tfidf.get_feature_names()

In [None]:
#check topics modeled
# For each LSA component, print the ten vocabulary terms with the largest weights.
for topic_idx, weights in enumerate(lsa.components_):
    ranked_terms = sorted(zip(terms, weights), key=lambda pair: pair[1], reverse=True)
    print("Topic %d:" % topic_idx)
    for term_text, _weight in ranked_terms[:10]:
        print(term_text)
    # blank-looking separator line between topics
    print(" ")

## Fitting a model

In [None]:
#Creates test-train split
# Features are the normalized LSA vectors; the target is the `source` column.
# 20% of the rows are held out for testing, with a fixed seed for a repeatable split.
X, y = corpus_lsa, true_df['source']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
#check against dummy classifer
# Baseline that draws predictions according to the training class distribution.
dumb = DummyClassifier(strategy="stratified", random_state=0)
dumb.fit(X_train, y_train)
# Accuracy on the held-out set first, then on the training set.
for split_X, split_y in ((X_test, y_test), (X_train, y_train)):
    print(dumb.score(split_X, split_y))

In [None]:
#Fits a random forest classifier
# random_state pins the forest's randomness so the scores below are reproducible
# from run to run; the classifier was previously left unseeded. The seed matches
# the one used for the train/test split.
rf2 = RandomForestClassifier(n_estimators=50, random_state=42)#, max_depth=36)
rf2.fit(X_train, y_train)
# Accuracy on the held-out set, then on the training set.
print(rf2.score(X_test, y_test))
print(rf2.score(X_train, y_train))

In [None]:
#Check the confusion matrix
# Class labels, in the row/column order requested from confusion_matrix.
# NOTE(review): this list says 'Associate Press' while the y-axis tick text below says
# 'Associated Press' -- confirm which spelling the `source` column actually contains.
names = ['Fox News', 'National Review', 'Breitbart', 'Info wars', 'Global Research',
       'Activist Post', 'Reuters', 'Associate Press',
       'Alabama Today', 'Huffington Post', 'Daily Beast', 'Mother Jones']
# Tick text: abbreviations along x (predicted), full names along y (true).
x_tick_text = ('FN', 'NR', 'B', 'IW', 'GR',
               'ActP', 'R', 'AP',
               'AT', 'HP', 'DB', 'MJ')
y_tick_text = ('Fox News', 'National Review', 'Breitbart', 'Info wars', 'Global Research',
               'Activist Post', 'Reuters', 'Associated Press',
               'Alabama Today', 'Huffington Post', 'Daily Beast', 'Mother Jones')
tick_positions = list(range(12))

plt.figure(dpi=100)
cm = confusion_matrix(y_test, rf2.predict(X_test), labels =names)
plt.imshow(cm, cmap=plt.cm.Blues)
plt.grid(False)
plt.colorbar();
plt.xticks(tick_positions, x_tick_text)
plt.yticks(tick_positions, y_tick_text);
plt.ylabel("True Source")
plt.xlabel("Predicted Source");
fmt = '.1f'  # not applied below: the cell counts are written with plain format()
thresh = cm.max() / 2.
# Write each count into its cell; dark cells (count above half the maximum) get white text.
n_rows, n_cols = cm.shape
for row in range(n_rows):
    for col in range(n_cols):
        count = cm[row, col]
        text_color = "black" if count <= thresh else "white"
        plt.text(col, row, format(count),#, fmt),
                 horizontalalignment="center",
                 color=text_color)

In [None]:
#Check the classification report
# labels=names must accompany target_names=names: without `labels`, the report orders
# classes by sorted label value, so the unsorted `names` list was attached to the
# wrong rows. Passing both keeps each display name on its own class, in the same
# order as the confusion matrix.
print(classification_report(y_test, rf2.predict(X_test), labels=names, target_names=names))