In [47]:
# See Top Features - Train all models using the best classifier that has coef_ function available
# LinearSVC, in this case.
# Challenges - May only be interested in using the top models indicated by Load_Pickle.ipynb's final_results accuracies.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import sklearn
import nltk
from sklearn import naive_bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from math import sqrt
import joblib
import string
import re
import sys
import datetime
import html
import os
import timeit
nltk.download('punkt')
nltk.download('wordnet')

# sklearn=0.23.1, pandas=1.0.1
print(sklearn.__version__)
print(pd.__version__)

0.23.1
1.0.1


[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Gather Data

### 2 sources

In [48]:
# Let's see what features we have.
length_req = 800  # Character length requirement for articles
confidence_level = .90

left1 = pd.read_excel("csvs/Huffington.xlsx", 
                  names=["date", "article"])
#left1 = pd.read_csv('Huffington.csv')
print("We have {:,} left1-wing records".format(left1.shape[0]))

left1['date'].fillna("", inplace=True)
left1['article'].fillna("", inplace=True)
    
for x in range(left1.shape[0]):
    if len(left1['article'][x]) < length_req:
        left1.drop(x, inplace=True)

print("We have {:,} left1-wing records > {} characters long".format(left1.shape[0], length_req))

# Only keep the unique article rows and their values
left1.drop_duplicates("article", keep='first', inplace=True)

print("{:,} Records are unique".format(
    left1.shape[0]))

left1['date'] = left1['date'].str.replace(',', '')

left1.head()

We have 4,355 left1-wing records
We have 4,314 left1-wing records > 800 characters long
4,304 Records are unique


Unnamed: 0,date,article
0,2020-01-01,"Elizabeth Warren Rips ‘Fawning, Spineless’ Rep..."
1,2020-01-01,These Major New Laws Take Effect Today From ma...
2,2020-01-01,U.S. To Deploy Hundreds Of Troops To Iraq Afte...
3,2020-01-01,Police And Protesters Clash In New Year’s Rall...
4,2020-01-01,Trump Shrugs Off Kim Jong Un’s Nuclear Testing...


In [49]:
# Let's see what features we have.
right1 = pd.read_excel("csvs/NewsMax.xlsx",
                     names=["date", "article"])
print("We have {:,} right1-wing records".format(right1.shape[0]))

right1['date'].fillna("", inplace=True)
right1['article'].fillna("", inplace=True)

for x in range(right1.shape[0]):
    if len(right1['article']) < length_req:
        right1.drop(x, inplace=True)

print("We have {:,} right1-wing records > {} characters long".format(right1.shape[0], length_req))

# Only keep the unique rows and their values
right1.drop_duplicates("article", keep='first', inplace=True)

print("{:,} Records are unique".format(
    right1.shape[0]))

right1['date'] = right1['date'].str.replace(',', '')

right1.head()

We have 5,829 right1-wing records
We have 5,829 right1-wing records > 800 characters long
5,780 Records are unique


Unnamed: 0,date,article
0,2020-01-01,Klobuchar Hits Trump's Golf: 'I Spent 0 Days' ...
1,2020-01-01,WashPost: Bloomberg's China Ties Are Conflict ...
2,2020-01-01,James Woolsey to Newsmax TV: Target the IRGC N...
3,2020-01-01,Report: US Service Members Killed in Afghanist...
4,2020-01-01,Bernie Kerik to Newsmax TV: Trump Sends World ...


In [50]:
# Check for null values
right1.isnull().sum(axis=0)

date       0
article    0
dtype: int64

In [51]:
left1.isnull().sum(axis=0)

date       0
article    0
dtype: int64

In [52]:
# If there were null values, the below will replace them.
left1['date'].fillna("", inplace=True)
right1['date'].fillna("", inplace=True)
#right['article'].fillna("", inplace=True)
#right.isnull().sum(axis=0)

In [53]:
# Search for non-null valued rows
# Useful for fixing when data went into three or more columns instead of two in the Excel sheet.
#left[left['date'].notnull()]
#right[right['date'].notnull()]

# Finds all null valued rows
left1[~left1['date'].notnull()]
#right[~right['date'].notnull()]

Unnamed: 0,date,article


In [54]:
left1['pole'] = 0  # Make a column 'pole', assign a value of 0 to indicate left1 articles
print("We have {:,} left1-wing records".format(left1.shape[0]))
right1['pole'] = 1
print("We have {:,} right1-wing records".format(right1.shape[0]))

if (right1.shape[0] > left1.shape[0]):
    print("Getting random sample of right1-wing records")
    right1 = right1.sample(left1.shape[0]) # randomly generate samples from right1 equal to left1's length. (Can be set to random_state=0)
else:
    print("Getting random sample of left1-wing records")
    left1 = left1.sample(right1.shape[0]) # randomly generate samples from right1 equal to left1's length. (Can be set to random_state=0)

all_data1 = pd.concat([left1, right1]) # combine left1 and right1 datasets
column_names = all_data1.columns.values
all_data1.columns = column_names
all_data1 = all_data1.sample(frac=1).reset_index(drop=True) # randomly shuffle your rows
all_data1['date'] = all_data1['date'].str.replace(',', '')
all_data1.head()

We have 4,304 left1-wing records
We have 5,780 right1-wing records
Getting random sample of right1-wing records


Unnamed: 0,date,article,pole
0,2020-02-19,Pete Buttigieg’s Test Has Arrived In this week...,0
1,2020-04-14,Gov. Cuomo: Trump Is Not 'King' and Can't Reop...,1
2,2020-07-30,Rep. John Lewis Tells Youth to 'Let Freedom Ri...,1
3,2020-07-19,How The Black Lives Matter Generation Remember...,0
4,2020-04-28,Health Care Workers Stand Up To Anti-Lockdown ...,0


In [55]:
print("{:,} / {:,} Current Records".format(
    all_data1['pole'].sum(),
    all_data1.shape[0]))

# Only keep the unique rows and their values
all_data1.drop_duplicates("article", keep=False, inplace=True)
print("{:,} Records are unique".format(
    all_data1.shape[0]))

print("{:,} / {:,} Current Records".format(
    all_data1['pole'].sum(),
    all_data1.shape[0]))

4,304 / 8,608 Current Records
8,608 Records are unique
4,304 / 8,608 Current Records


### 4 sources

In [56]:
left1 = pd.read_excel("csvs/Huffington.xlsx", 
                  names=["date", "article"])
#left = pd.read_csv('Huffington.csv')
print("We have {:,} left-wing records".format(left1.shape[0]))

for x in range(left1.shape[0]):
    if len(left1['article'][x]) < length_req:
        left1.drop(x, inplace=True)

print("We have {:,} left-wing records > {} characters long".format(left1.shape[0], length_req))

# Only keep the unique article rows and their values
left1.drop_duplicates("article", keep='first', inplace=True)

print("{:,} Records are unique".format(
    left1.shape[0]))

left1['date'] = left1['date'].str.replace(',', '')

left1.head()

We have 4,355 left-wing records
We have 4,314 left-wing records > 800 characters long
4,304 Records are unique


Unnamed: 0,date,article
0,2020-01-01,"Elizabeth Warren Rips ‘Fawning, Spineless’ Rep..."
1,2020-01-01,These Major New Laws Take Effect Today From ma...
2,2020-01-01,U.S. To Deploy Hundreds Of Troops To Iraq Afte...
3,2020-01-01,Police And Protesters Clash In New Year’s Rall...
4,2020-01-01,Trump Shrugs Off Kim Jong Un’s Nuclear Testing...


In [57]:
left2 = pd.read_excel("csvs/Salon.xlsx", 
                  names=["date", "article"])
#left = pd.read_csv('Huffington.csv')
print("We have {:,} left-wing records".format(left2.shape[0]))

left2['date'].fillna("", inplace=True)
left2['article'].fillna("", inplace=True)

for x in range(left2.shape[0]):
    if len(left2['article'][x]) < length_req:
        left2.drop(x, inplace=True)

print("We have {:,} left-wing records > {} characters long".format(left2.shape[0], length_req))

# Only keep the unique article rows and their values
left2.drop_duplicates("article", keep='first', inplace=True)

print("{:,} Records are unique".format(
    left2.shape[0]))

left2['date'] = left2['date'].str.replace(',', '')

left2.head()

We have 2,303 left-wing records
We have 2,278 left-wing records > 800 characters long
1,987 Records are unique


Unnamed: 0,date,article
0,2020-01-15,"Lev Parnas, the indicted associate of former N..."
1,2020-01-26,President Donald Trump's comments about Thomas...
2,2020-05-31,This article originally appeared at Common Dre...
3,2020-05-31,Trump admin rush migrant children deportations...
4,2020-05-31,"The late Gore Vidal once confessed, with chara..."


In [58]:
left_min = [left1.shape[0], left2.shape[0]]
left_min_entries = min(left_min)

left1 = left1.sample(left_min_entries)
left2 = left2.sample(left_min_entries)

print("Huffpost: {} entries".format(left1.shape[0]))
print("Salon: {} entries".format(left2.shape[0]))

Huffpost: 1987 entries
Salon: 1987 entries


In [59]:
left = pd.concat([left1, left2])
print("We have {:,} left-wing records".format(left.shape[0]))

We have 3,974 left-wing records


In [60]:
# Let's see what features we have.
right1 = pd.read_excel("csvs/NewsMax.xlsx",
                     names=["date", "article"])
print("We have {:,} right-wing records".format(right1.shape[0]))

#count = right.shape[0]

for x in range(right1.shape[0]):
    if len(right1['article'][x]) < length_req:
        right1.drop(x, inplace=True)

print("We have {:,} right-wing records > {} characters long".format(right1.shape[0], length_req))

# Only keep the unique rows and their values
right1.drop_duplicates("article", keep='first', inplace=True)

print("{:,} Records are unique".format(
    right1.shape[0]))

right1['date'] = right1['date'].str.replace(',', '')

right1.head()

We have 5,829 right-wing records
We have 5,637 right-wing records > 800 characters long
5,588 Records are unique


Unnamed: 0,date,article
0,2020-01-01,Klobuchar Hits Trump's Golf: 'I Spent 0 Days' ...
1,2020-01-01,WashPost: Bloomberg's China Ties Are Conflict ...
2,2020-01-01,James Woolsey to Newsmax TV: Target the IRGC N...
3,2020-01-01,Report: US Service Members Killed in Afghanist...
4,2020-01-01,Bernie Kerik to Newsmax TV: Trump Sends World ...


In [61]:
right2 = pd.read_excel("csvs/Redstate.xlsx",
                     names=["date", "article"])
print("We have {:,} right-wing records".format(right2.shape[0]))

#count = right.shape[0]

for x in range(right2.shape[0]):
    if len(right2['article'][x]) < length_req:
        right2.drop(x, inplace=True)

print("We have {:,} right-wing records > {} characters long".format(right2.shape[0], length_req))

# Only keep the unique rows and their values
right2.drop_duplicates("article", keep='first', inplace=True)

print("{:,} Records are unique".format(
    right2.shape[0]))

right2['date'] = right2['date'].str.replace(',', '')

right2.head()

We have 2,270 right-wing records
We have 2,207 right-wing records > 800 characters long
2,111 Records are unique


Unnamed: 0,date,article
0,2020-01-15,Even CNN Calls Out Democrats For How They Cele...
1,2020-01-26,The Isolation of Connectivity Image by Thom...
2,2020-05-29,Mayor of Minneapolis Lets Rioters Burn Down Po...
3,2020-05-29,Watch: CNN Crew Arrested by MN State Police Li...
4,2020-05-31,Buffalo Mayor Has Best Response to Rioter Who ...


In [62]:
right_min = [right1.shape[0], right2.shape[0]]
right_min_entries = min(right_min)

right1 = right1.sample(right_min_entries)
right2 = right2.sample(right_min_entries)

print("NewsMax: {} entries".format(right1.shape[0]))
print("Redstate: {} entries".format(right2.shape[0]))

NewsMax: 2111 entries
Redstate: 2111 entries


In [63]:
right = pd.concat([right1, right2])
print("We have {:,} right-wing records".format(right.shape[0]))

We have 4,222 right-wing records


In [64]:
# Check for null values
right.isnull().sum(axis=0)

date       6
article    0
dtype: int64

In [65]:
left.isnull().sum(axis=0)

date       0
article    0
dtype: int64

In [66]:
# If there were null values, the below will replace them.
left['date'].fillna("", inplace=True)
right['date'].fillna("", inplace=True)
#right['article'].fillna("", inplace=True)
#right.isnull().sum(axis=0)

In [67]:
# Search for non-null valued rows
# Useful for fixing when data went into three or more columns instead of two in the Excel sheet.
#left[left['date'].notnull()]
#right[right['date'].notnull()]

# Finds all null valued rows
left[~left['date'].notnull()]
#right[~right['date'].notnull()]

Unnamed: 0,date,article


In [68]:
left['pole'] = 0  # Make a column 'pole', assign a value of 0 to indicate left articles
print("We have {:,} left-wing records".format(left.shape[0]))
right['pole'] = 1
print("We have {:,} right-wing records".format(right.shape[0]))

if (right.shape[0] > left.shape[0]):
    print("Getting random sample of right-wing records")
    right = right.sample(left.shape[0]) # randomly generate samples from right equal to left's length. (Can be set to random_state=0)
else:
    print("Getting random sample of left-wing records")
    left = left.sample(right.shape[0]) # randomly generate samples from right equal to left's length. (Can be set to random_state=0)

all_data2 = pd.concat([left, right]) # combine left and right datasets
column_names = all_data2.columns.values
all_data2.columns = column_names
all_data2 = all_data2.sample(frac=1).reset_index(drop=True) # randomly shuffle your rows
all_data2['date'] = all_data2['date'].str.replace(',', '')
all_data2.head()

We have 3,974 left-wing records
We have 4,222 right-wing records
Getting random sample of right-wing records


Unnamed: 0,date,article,pole
0,2020-03-25,You have to feel sorry for President Trump. He...,0
1,2020-06-10,This article originally appeared on AlterNet. ...,0
2,2020-03-25,An Arizona man died after ingesting a fish tan...,0
3,2020-05-31,"Once again, a video was released: George Floyd...",0
4,2020-06-30,"It seems as if it happened ages ago, but you m...",0


In [69]:
print("{:,} / {:,} Current Records".format(
    all_data2['pole'].sum(),
    all_data2.shape[0]))

# Only keep the unique rows and their values
all_data2.drop_duplicates("article", keep=False, inplace=True)
print("{:,} Records are unique".format(
    all_data2.shape[0]))

print("{:,} / {:,} Current Records".format(
    all_data2['pole'].sum(),
    all_data2.shape[0]))

3,974 / 7,948 Current Records
7,948 Records are unique
3,974 / 7,948 Current Records


### 6 sources

In [70]:
left1 = pd.read_excel("csvs/Huffington.xlsx", 
                  names=["date", "article"])
#left = pd.read_csv('Huffington.csv')
print("We have {:,} left-wing records".format(left1.shape[0]))

for x in range(left1.shape[0]):
    if len(left1['article'][x]) < length_req:
        left1.drop(x, inplace=True)

print("We have {:,} left-wing records > {} characters long".format(left1.shape[0], length_req))

# Only keep the unique article rows and their values
left1.drop_duplicates("article", keep='first', inplace=True)

print("{:,} Records are unique".format(
    left1.shape[0]))

left1['date'] = left1['date'].str.replace(',', '')

left1.head()

We have 4,355 left-wing records
We have 4,314 left-wing records > 800 characters long
4,304 Records are unique


Unnamed: 0,date,article
0,2020-01-01,"Elizabeth Warren Rips ‘Fawning, Spineless’ Rep..."
1,2020-01-01,These Major New Laws Take Effect Today From ma...
2,2020-01-01,U.S. To Deploy Hundreds Of Troops To Iraq Afte...
3,2020-01-01,Police And Protesters Clash In New Year’s Rall...
4,2020-01-01,Trump Shrugs Off Kim Jong Un’s Nuclear Testing...


In [71]:
left2 = pd.read_excel("csvs/Salon.xlsx", 
                  names=["date", "article"])
#left = pd.read_csv('Huffington.csv')
print("We have {:,} left-wing records".format(left2.shape[0]))

left2['date'].fillna("", inplace=True)
left2['article'].fillna("", inplace=True)

for x in range(left2.shape[0]):
    if len(left2['article'][x]) < length_req:
        left2.drop(x, inplace=True)

print("We have {:,} left-wing records > {} characters long".format(left2.shape[0], length_req))

# Only keep the unique article rows and their values
left2.drop_duplicates("article", keep='first', inplace=True)

print("{:,} Records are unique".format(
    left2.shape[0]))

left2['date'] = left2['date'].str.replace(',', '')

left2.head()

We have 2,303 left-wing records
We have 2,278 left-wing records > 800 characters long
1,987 Records are unique


Unnamed: 0,date,article
0,2020-01-15,"Lev Parnas, the indicted associate of former N..."
1,2020-01-26,President Donald Trump's comments about Thomas...
2,2020-05-31,This article originally appeared at Common Dre...
3,2020-05-31,Trump admin rush migrant children deportations...
4,2020-05-31,"The late Gore Vidal once confessed, with chara..."


In [72]:
left3 = pd.read_excel("csvs/Rawstory.xlsx", 
                  names=["date", "article"])
#left = pd.read_csv('Huffington.csv')
print("We have {:,} left-wing records".format(left3.shape[0]))

for x in range(left3.shape[0]):
    if len(left3['article'][x]) < length_req:
        left3.drop(x, inplace=True)

print("We have {:,} left-wing records > {} characters long".format(left3.shape[0], length_req))

# Only keep the unique article rows and their values
left3.drop_duplicates("article", keep='first', inplace=True)

print("{:,} Records are unique".format(
    left3.shape[0]))

left3['date'] = left3['date'].str.replace(',', '')

left3.head()

We have 1,160 left-wing records
We have 1,159 left-wing records > 800 characters long
1,159 Records are unique


Unnamed: 0,date,article
0,2020-01-15,‘Uninterested in evidence’: Embattled Susan Co...
1,2020-02-07,Tornado warning issued for 2 New Jersey counti...
2,2020-02-07,Trump official busted for telling ‘blatant lie...
3,2020-02-07,Republicans face ‘Electoral College time bomb’...
4,2020-02-07,We’re a former coal company and we support gov...


In [73]:
left_min = [left1.shape[0], left2.shape[0], left3.shape[0]]
left_min_entries = min(left_min)

left1 = left1.sample(left_min_entries)
left2 = left2.sample(left_min_entries)
left3 = left3.sample(left_min_entries)

print("Huffpost: {} entries".format(left1.shape[0]))
print("Salon: {} entries".format(left2.shape[0]))
print("Rawstory: {} entries".format(left3.shape[0]))

Huffpost: 1159 entries
Salon: 1159 entries
Rawstory: 1159 entries


In [74]:
left = pd.concat([left1, left2, left3])
print("We have {:,} left-wing records".format(left.shape[0]))

We have 3,477 left-wing records


In [75]:
# Let's see what features we have.
right1 = pd.read_excel("csvs/NewsMax.xlsx",
                     names=["date", "article"])
print("We have {:,} right-wing records".format(right1.shape[0]))

#count = right.shape[0]

for x in range(right1.shape[0]):
    if len(right1['article'][x]) < length_req:
        right1.drop(x, inplace=True)

print("We have {:,} right-wing records > {} characters long".format(right1.shape[0], length_req))

# Only keep the unique rows and their values
right1.drop_duplicates("article", keep='first', inplace=True)

print("{:,} Records are unique".format(
    right1.shape[0]))

right1['date'] = right1['date'].str.replace(',', '')

right1.head()

We have 5,829 right-wing records
We have 5,637 right-wing records > 800 characters long
5,588 Records are unique


Unnamed: 0,date,article
0,2020-01-01,Klobuchar Hits Trump's Golf: 'I Spent 0 Days' ...
1,2020-01-01,WashPost: Bloomberg's China Ties Are Conflict ...
2,2020-01-01,James Woolsey to Newsmax TV: Target the IRGC N...
3,2020-01-01,Report: US Service Members Killed in Afghanist...
4,2020-01-01,Bernie Kerik to Newsmax TV: Trump Sends World ...


In [76]:
right2 = pd.read_excel("csvs/Redstate.xlsx",
                     names=["date", "article"])
print("We have {:,} right-wing records".format(right2.shape[0]))

#count = right.shape[0]

for x in range(right2.shape[0]):
    if len(right2['article'][x]) < length_req:
        right2.drop(x, inplace=True)

print("We have {:,} right-wing records > {} characters long".format(right2.shape[0], length_req))

# Only keep the unique rows and their values
right2.drop_duplicates("article", keep='first', inplace=True)

print("{:,} Records are unique".format(
    right2.shape[0]))

right2['date'] = right2['date'].str.replace(',', '')

right2.head()

We have 2,270 right-wing records
We have 2,207 right-wing records > 800 characters long
2,111 Records are unique


Unnamed: 0,date,article
0,2020-01-15,Even CNN Calls Out Democrats For How They Cele...
1,2020-01-26,The Isolation of Connectivity Image by Thom...
2,2020-05-29,Mayor of Minneapolis Lets Rioters Burn Down Po...
3,2020-05-29,Watch: CNN Crew Arrested by MN State Police Li...
4,2020-05-31,Buffalo Mayor Has Best Response to Rioter Who ...


In [77]:
right3 = pd.read_excel("csvs/WashingtonExaminer.xlsx",
                     names=["date", "article"])
print("We have {:,} right-wing records".format(right3.shape[0]))

#count = right.shape[0]

for x in range(right3.shape[0]):
    if len(right3['article'][x]) < length_req:
        right3.drop(x, inplace=True)

print("We have {:,} right-wing records > {} characters long".format(right3.shape[0], length_req))

# Only keep the unique rows and their values
right3.drop_duplicates("article", keep='first', inplace=True)

print("{:,} Records are unique".format(
    right3.shape[0]))

right3['date'] = right3['date'].str.replace(',', '')

right3.head()

We have 1,117 right-wing records
We have 1,104 right-wing records > 800 characters long
983 Records are unique


Unnamed: 0,date,article
0,2020-01-16,'No one's giggling around here': Democrats def...
1,2020-02-19,Bloomberg would sell business and financial in...
2,2020-02-19,Uncertainty reigns over Democratic presidentia...
3,2020-02-19,Five things to watch in the Nevada Democratic ...
4,2020-02-19,'Some guy wearing a dress': Bloomberg once cal...


In [78]:
right_min = [right1.shape[0], right2.shape[0], right3.shape[0]]
right_min_entries = min(right_min)

right1 = right1.sample(right_min_entries)
right2 = right2.sample(right_min_entries)
right3 = right3.sample(right_min_entries)

print("NewsMax: {} entries".format(right1.shape[0]))
print("Redstate: {} entries".format(right2.shape[0]))
print("WashingtonExaminer: {} entries".format(right3.shape[0]))

NewsMax: 983 entries
Redstate: 983 entries
WashingtonExaminer: 983 entries


In [79]:
right = pd.concat([right1, right2, right3])
print("We have {:,} right-wing records".format(right.shape[0]))

We have 2,949 right-wing records


In [80]:
# Check for null values
right.isnull().sum(axis=0)

date       4
article    0
dtype: int64

In [81]:
left.isnull().sum(axis=0)

date       9
article    0
dtype: int64

In [82]:
# If there were null values, the below will replace them.
left['date'].fillna("", inplace=True)
right['date'].fillna("", inplace=True)
#right['article'].fillna("", inplace=True)
#right.isnull().sum(axis=0)

In [83]:
# Search for non-null valued rows
# Useful for fixing when data went into three or more columns instead of two in the Excel sheet.
#left[left['date'].notnull()]
#right[right['date'].notnull()]

# Finds all null valued rows
left[~left['date'].notnull()]
#right[~right['date'].notnull()]

Unnamed: 0,date,article


In [84]:
left['pole'] = 0  # Make a column 'pole', assign a value of 0 to indicate left articles
print("We have {:,} left-wing records".format(left.shape[0]))
right['pole'] = 1
print("We have {:,} right-wing records".format(right.shape[0]))

if (right.shape[0] > left.shape[0]):
    print("Getting random sample of right-wing records")
    right = right.sample(left.shape[0]) # randomly generate samples from right equal to left's length. (Can be set to random_state=0)
else:
    print("Getting random sample of left-wing records")
    left = left.sample(right.shape[0]) # randomly generate samples from right equal to left's length. (Can be set to random_state=0)
    
all_data3 = pd.concat([left, right]) # combine left and right datasets
column_names = all_data3.columns.values
all_data3.columns = column_names
all_data3 = all_data3.sample(frac=1).reset_index(drop=True) # randomly shuffle your rows
all_data3['date'] = all_data3['date'].str.replace(',', '')
all_data3.head()

We have 3,477 left-wing records
We have 2,949 right-wing records
Getting random sample of left-wing records


Unnamed: 0,date,article,pole
0,2020-07-18,Pelosi Blames Trump for ‘Kidnapping Protesters...,1
1,2020-05-11,Remember the Ventilator Crisis? The Press Now ...,1
2,2020-07-10,A revolutionized White House briefing should n...,1
3,2020-01-22,It’s Not Just Washington State. Iranians Are S...,0
4,2020-06-04,AOC Endorses Engel's Challenger in Primary Rep...,1


In [85]:
print("{:,} / {:,} Current Records".format(
    all_data3['pole'].sum(),
    all_data3.shape[0]))

# Only keep the unique rows and their values
all_data3.drop_duplicates("article", keep=False, inplace=True)
print("{:,} Records are unique".format(
    all_data3.shape[0]))

print("{:,} / {:,} Current Records".format(
    all_data3['pole'].sum(),
    all_data3.shape[0]))

2,949 / 5,898 Current Records
5,898 Records are unique
2,949 / 5,898 Current Records


In [86]:
print("2 sources: {} unique, current records".format(all_data1.shape[0]))
print("4 sources: {} unique, current records".format(all_data2.shape[0]))
print("6 sources: {} unique, current records".format(all_data3.shape[0]))

2 sources: 8608 unique, current records
4 sources: 7948 unique, current records
6 sources: 5898 unique, current records


# Do the below 2 blocks for each run

In [87]:
# 3 dataframes
all_dataframes = [all_data1, all_data2, all_data3]
all_dataframes[0].head()

Unnamed: 0,date,article,pole
0,2020-02-19,Pete Buttigieg’s Test Has Arrived In this week...,0
1,2020-04-14,Gov. Cuomo: Trump Is Not 'King' and Can't Reop...,1
2,2020-07-30,Rep. John Lewis Tells Youth to 'Let Freedom Ri...,1
3,2020-07-19,How The Black Lives Matter Generation Remember...,0
4,2020-04-28,Health Care Workers Stand Up To Anti-Lockdown ...,0


In [88]:
# Remove years and digits from being used as features
def remove_years(text):
    text = re.sub(r"[0-9]{4}", " ", text)
    text = re.sub(r"[0-9]{3}", " ", text)
    text = re.sub(r"[0-9]{2}", " ", text)
    text = re.sub(r"[0-9]{1}", " ", text)
    text = re.sub(r"[0-9]{4}ish", " ", text)
    text = re.sub(r"[0-9]{3}ish", " ", text)
    text = re.sub(r"[0-9]{2}ish", " ", text)
    text = re.sub(r"[0-9]{1}ish", " ", text)
    text = re.sub(r"2020-[0-9]{2}-[0-9]{2}", " ", text)
    return text

In [89]:
# Proper Noun Removal function
# Do this before lowercasing all text
from nltk.tag import pos_tag

def proper_noun_removal(text):
    tagged_sent = pos_tag(text.split())
    return [w for w, pos in tagged_sent if pos != 'NNP']

In [90]:
# Lemmatization after Word Tokenization
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in word_tokenize(text)]

In [91]:
# Clean each dataframe's data.
for all_data in all_dataframes:
    
    # URL decoding function for decoding any string of html characters...
    all_data['article'] = all_data['article'].astype(str).apply(lambda x: html.unescape(x))
    
    # Store the original comments in a new column
    all_data['unfiltered_articles'] = all_data['article'].values
    all_data['unfiltered_articles_lowercase'] = all_data['article'].values
    all_data['article_PNremoved'] = all_data['article'].values

all_dataframes[0].head()

Unnamed: 0,date,article,pole,unfiltered_articles,unfiltered_articles_lowercase,article_PNremoved
0,2020-02-19,Pete Buttigieg’s Test Has Arrived In this week...,0,Pete Buttigieg’s Test Has Arrived In this week...,Pete Buttigieg’s Test Has Arrived In this week...,Pete Buttigieg’s Test Has Arrived In this week...
1,2020-04-14,Gov. Cuomo: Trump Is Not 'King' and Can't Reop...,1,Gov. Cuomo: Trump Is Not 'King' and Can't Reop...,Gov. Cuomo: Trump Is Not 'King' and Can't Reop...,Gov. Cuomo: Trump Is Not 'King' and Can't Reop...
2,2020-07-30,Rep. John Lewis Tells Youth to 'Let Freedom Ri...,1,Rep. John Lewis Tells Youth to 'Let Freedom Ri...,Rep. John Lewis Tells Youth to 'Let Freedom Ri...,Rep. John Lewis Tells Youth to 'Let Freedom Ri...
3,2020-07-19,How The Black Lives Matter Generation Remember...,0,How The Black Lives Matter Generation Remember...,How The Black Lives Matter Generation Remember...,How The Black Lives Matter Generation Remember...
4,2020-04-28,Health Care Workers Stand Up To Anti-Lockdown ...,0,Health Care Workers Stand Up To Anti-Lockdown ...,Health Care Workers Stand Up To Anti-Lockdown ...,Health Care Workers Stand Up To Anti-Lockdown ...


In [92]:
# Proper Noun Removal
# Do this before lowercasing all text.
for all_data in all_dataframes:
    all_data['article_PNremoved'] = all_data['article_PNremoved'].apply(proper_noun_removal)

    # Turn the list of tokens back into a sentence
    all_data['article_PNremoved'] = [' '.join(comment) for comment in all_data['article_PNremoved']]

all_dataframes[0].head()

Unnamed: 0,date,article,pole,unfiltered_articles,unfiltered_articles_lowercase,article_PNremoved
0,2020-02-19,Pete Buttigieg’s Test Has Arrived In this week...,0,Pete Buttigieg’s Test Has Arrived In this week...,Pete Buttigieg’s Test Has Arrived In this week...,"Arrived In this week’s caucuses, the Democrati..."
1,2020-04-14,Gov. Cuomo: Trump Is Not 'King' and Can't Reop...,1,Gov. Cuomo: Trump Is Not 'King' and Can't Reop...,Gov. Cuomo: Trump Is Not 'King' and Can't Reop...,Is Not 'King' and States warned not to push re...
2,2020-07-30,Rep. John Lewis Tells Youth to 'Let Freedom Ri...,1,Rep. John Lewis Tells Youth to 'Let Freedom Ri...,Rep. John Lewis Tells Youth to 'Let Freedom Ri...,"to 'Let in before civil rights icon died, he w..."
3,2020-07-19,How The Black Lives Matter Generation Remember...,0,How The Black Lives Matter Generation Remember...,How The Black Lives Matter Generation Remember...,How The Lives the civil rights icon who some c...
4,2020-04-28,Health Care Workers Stand Up To Anti-Lockdown ...,0,Health Care Workers Stand Up To Anti-Lockdown ...,Health Care Workers Stand Up To Anti-Lockdown ...,Workers To In putting me and my family at risk...


In [93]:
# Clean each dataframe's data
removal = ["article", "article_PNremoved"]

for all_data in all_dataframes:
    for column in removal:
        # Clean the text
        # The apostrophes here are very specific.
        # The regular ' will not work from keyboard.
        # You have to use print(all_data.head()) and copy and paste that apostrophe into the str.replace()
        all_data[column] = all_data[column].str.lower()
        all_data[column] = all_data[column].str.replace(r"u.s.", "united states")
        all_data[column] = all_data[column].str.replace(r"what’s", "what is")
        all_data[column] = all_data[column].str.replace(r"’s", " ")
        all_data[column] = all_data[column].str.replace(r"’ve", "have")
        all_data[column] = all_data[column].str.replace(r"’re", "are")
        all_data[column] = all_data[column].str.replace(r"’ll", "will")
        all_data[column] = all_data[column].str.replace(r"can’t", "can not")
        all_data[column] = all_data[column].str.replace(r"aren’t", "are not")
        all_data[column] = all_data[column].str.replace(r"couldn’t", "could not")
        all_data[column] = all_data[column].str.replace(r"didn’t", "did not")
        all_data[column] = all_data[column].str.replace(r"doesn’t", "does not")
        all_data[column] = all_data[column].str.replace(r"don’t", "do not")
        all_data[column] = all_data[column].str.replace(r"hadn’t", "had not")
        all_data[column] = all_data[column].str.replace(r"hasn’t", "has not")
        all_data[column] = all_data[column].str.replace(r"haven’t", "have not")
        all_data[column] = all_data[column].str.replace(r"isn’t", "is not")
        all_data[column] = all_data[column].str.replace(r"shouldn’t", "should not")
        all_data[column] = all_data[column].str.replace(r"wasn’t", "was not")
        all_data[column] = all_data[column].str.replace(r"weren’t", "were not")
        all_data[column] = all_data[column].str.replace(r"won’t", "will not")
        all_data[column] = all_data[column].str.replace(r"wouldn’t", "would not")
        all_data[column] = all_data[column].str.replace(r"mustn’t", "must not")
        all_data[column] = all_data[column].str.replace(r"i’m", "i am")

        # As are these apostrophes...
        all_data[column] = all_data[column].str.replace(r"what's", "what is")
        all_data[column] = all_data[column].str.replace(r"'s", " ")
        all_data[column] = all_data[column].str.replace(r"'ve", "have")
        all_data[column] = all_data[column].str.replace(r"'re", "are")
        all_data[column] = all_data[column].str.replace(r"'ll", "will")
        all_data[column] = all_data[column].str.replace(r"can't", "can not")
        all_data[column] = all_data[column].str.replace(r"aren't", "are not")
        all_data[column] = all_data[column].str.replace(r"couldn't", "could not")
        all_data[column] = all_data[column].str.replace(r"didn't", "did not")
        all_data[column] = all_data[column].str.replace(r"doesn't", "does not")
        all_data[column] = all_data[column].str.replace(r"don't", "do not")
        all_data[column] = all_data[column].str.replace(r"hadn't", "had not")
        all_data[column] = all_data[column].str.replace(r"hasn't", "has not")
        all_data[column] = all_data[column].str.replace(r"haven't", "have not")
        all_data[column] = all_data[column].str.replace(r"isn't", "is not")
        all_data[column] = all_data[column].str.replace(r"shouldn't", "should not")
        all_data[column] = all_data[column].str.replace(r"wasn't", "was not")
        all_data[column] = all_data[column].str.replace(r"weren't", "were not")
        all_data[column] = all_data[column].str.replace(r"won't", "will not")
        all_data[column] = all_data[column].str.replace(r"wouldn't", "would not")
        all_data[column] = all_data[column].str.replace(r"mustn't", "must not")
        all_data[column] = all_data[column].str.replace(r"i'm", "i am")
        #print(all_data.head())

        # Custom word removal - these appear too frequently and do not help with generalization.
        custom = {"donald j trump", "donald j. trump", "donald john trump", 'newsmax', 'redstate', 'washingtonexaminer', 'washington examiner', 
                  "huffpost", 'salon', 'rawstory', 'raw story',
                  "___", "donald trump", "trump", "pelosi"}

        # Did not work
        #for word in custom:
        #    all_data["article"]= all_data["article"].replace(word, " ") 

        # Did not work
        #all_data[column] = all_data[column].apply(lambda x: ' '.join([word for word in x.split() if word not in (custom)]))

        # Did work
        #def customWord(text):
        #    custom = {"huffpost", "newsmax"}
        #    return text.replace("huffpost", " ") 

        for word in custom:
            all_data[column] = all_data[column].apply(lambda x: x.replace(word, " "))

        # Check to see the words were replaced
        #print(all_data[all_data[column].str.contains("newsmax")])

        # Remove years and digits from being used as features
        all_data[column] = all_data[column].apply(lambda x: remove_years(x))
        #print(all_data.head())

        # Punctuation removal - CountVectorizer() does this for you
        #punctuation = {'.', ':', ';', "\'", '\"', '!', '?', "\’", '(', ')', '-'}

        #print(all_data[all_data[column].str.contains("19")])
        
all_dataframes[0].head()

Unnamed: 0,date,article,pole,unfiltered_articles,unfiltered_articles_lowercase,article_PNremoved
0,2020-02-19,pete buttigieg test has arrived in this week ...,0,Pete Buttigieg’s Test Has Arrived In this week...,Pete Buttigieg’s Test Has Arrived In this week...,"arrived in this week caucuses, the democratic..."
1,2020-04-14,gov. cuomo: is not 'king' and can not reopen...,1,Gov. Cuomo: Trump Is Not 'King' and Can't Reop...,Gov. Cuomo: Trump Is Not 'King' and Can't Reop...,is not 'king' and states warned not to push re...
2,2020-07-30,rep. john lewis tells youth to 'let freedom ri...,1,Rep. John Lewis Tells Youth to 'Let Freedom Ri...,Rep. John Lewis Tells Youth to 'Let Freedom Ri...,"to 'let in before civil rights icon died, he w..."
3,2020-07-19,how the black lives matter generation remember...,0,How The Black Lives Matter Generation Remember...,How The Black Lives Matter Generation Remember...,how the lives the civil rights icon who some c...
4,2020-04-28,health care workers stand up to anti-lockdown ...,0,Health Care Workers Stand Up To Anti-Lockdown ...,Health Care Workers Stand Up To Anti-Lockdown ...,workers to in putting me and my family at risk...


In [94]:
# CountVectorizers for onegram, bigram, trigram
# Does not work as intended
#max_df_value = 0.75
#min_df_value = 4

#count_vect1 = CountVectorizer(min_df = min_df_value, max_df = max_df_value, stop_words='english', max_features=5000)
#count_vect2 = CountVectorizer(min_df = min_df_value, max_df = max_df_value, stop_words='english', ngram_range=(1, 2), max_features=5000)
#count_vect3 = CountVectorizer(min_df = min_df_value, max_df = max_df_value, stop_words='english', ngram_range=(1, 3), max_features=5000)

#count_vectorizers = [count_vect1, count_vect2, count_vect3]

In [95]:
# Shows you the most frequently occurring words seen in text labelled as left or right
# This works on the most recent clf, but should relatively represent the same features used by each classifier
# so long as the parameters for min_df and max_df for each classifier are the same
def show_most_informative_features(vectorizer, clf, n=20):
    top_features = []
    right = []
    left = []
    feature_names = vectorizer.get_feature_names() # Get all the feature names that CountVectorizer() is using
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names)) # Put all feature names with their weights. Sort.
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1]) 
    print("\t\t%-15s\t\t\t%-15s" % ('Left', 'Right'))
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))    
        top_features.append([fn_1, coef_1, fn_2, coef_2])
        right.append([coef_2, fn_2])
        left.append([coef_1, fn_1])
    for x in range(len(left)):
        right.append([left[x][0], left[x][1]])
    print()
    return top_features, right

In [96]:
# Different Parameters
params = ['h_and_n_onegram_sr_nol_nopr', 'h_and_n_bigram_sr_nol_nopr', 'h_and_n_trigram_sr_nol_nopr', 
          'h_and_n_onegram_sr_nol_pr', 'h_and_n_bigram_sr_nol_pr', 'h_and_n_trigram_sr_nol_pr',
          'hs_and_nr_onegram_sr_nol_nopr', 'hs_and_nr_bigram_sr_nol_nopr', 'hs_and_nr_trigram_sr_nol_nopr',
          'hs_and_nr_onegram_sr_nol_pr', 'hs_and_nr_bigram_sr_nol_pr', 'hs_and_nr_trigram_sr_nol_pr',
          'hsr_and_nrw_onegram_sr_nol_nopr', 'hsr_and_nrw_bigram_sr_nol_nopr', 'hsr_and_nrw_trigram_sr_nol_nopr',
          'hsr_and_nrw_onegram_sr_nol_pr', 'hsr_and_nrw_bigram_sr_nol_pr', 'hsr_and_nrw_trigram_sr_nol_pr',
          'h_and_n_onegram_sr_l_nopr', 'h_and_n_bigram_sr_l_nopr', 'h_and_n_trigram_sr_l_nopr', 
          'h_and_n_onegram_sr_l_pr', 'h_and_n_bigram_sr_l_pr', 'h_and_n_trigram_sr_l_pr',
          'hs_and_nr_onegram_sr_l_nopr', 'hs_and_nr_bigram_sr_l_nopr', 'hs_and_nr_trigram_sr_l_nopr',
          'hs_and_nr_onegram_sr_l_pr', 'hs_and_nr_bigram_sr_l_pr', 'hs_and_nr_trigram_sr_l_pr',
          'hsr_and_nrw_onegram_sr_l_nopr', 'hsr_and_nrw_bigram_sr_l_nopr', 'hsr_and_nrw_trigram_sr_l_nopr',
          'hsr_and_nrw_onegram_sr_l_pr', 'hsr_and_nrw_bigram_sr_l_pr', 'hsr_and_nrw_trigram_sr_l_pr']

In [98]:
top_features_dict = {}
top_features_arr = []
number_count_vectorizers = 3
count = 0

for all_data in all_dataframes:
    for column in removal:
        
        if (count > 17):
            # Lemmatization after Word Tokenization
            lemmatizer = WordNetLemmatizer()
            all_data[column] = all_data[column].apply(lemmatize_text)

            # Turn the list of tokens back into a sentence
            all_data[column] = [' '.join(comment) for comment in all_data[column]]
        
        for x in range(number_count_vectorizers):

            # Randomly shuffle the rows before the split each time
            all_data = all_data.sample(frac=1)

            # Try an 80/20 train/test split
            train, test = train_test_split(all_data, test_size=0.2)

            if (x == 0):
                count_vect = CountVectorizer(min_df = 4, max_df = .75, stop_words='english', max_features=5000)
            elif (x == 1):
                count_vect = CountVectorizer(min_df = 4, max_df = .75, stop_words='english', ngram_range=(1, 2), max_features=5000)
            else:
                count_vect = CountVectorizer(min_df = 4, max_df = .75, stop_words='english', ngram_range=(1, 3), max_features=5000)
           # 
           # X_train_counts = count_vect.fit_transform(train[column])
           # X_train_counts.shape

           # tfidf_transformer = TfidfTransformer()
           # X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
           # X_train_tfidf.shape

           # clf = SGDClassifier().fit(X_train_tfidf, train.pole)
            
            clf = Pipeline([('vect', count_vect),
                                ('tfidf', TfidfTransformer()),
                                ('clf', LinearSVC(random_state=0)),
            ])
            
            clf = clf.fit(train.article, train.pole)
            
            # Accuracy
            predicted = clf.predict(test.unfiltered_articles)
            results = metrics.confusion_matrix(test.pole, predicted)
            accuracy = metrics.accuracy_score(test.pole, predicted)
            print(params[count])
            print("LinearSVC Accuracy Score: ", accuracy)

            # Grab and Print the top features for each different set of parameters
            #top_features = show_most_informative_features(count_vect, clf, 40)
            top_features_list, top_features = show_most_informative_features(count_vect, clf.named_steps['clf'], 40)
            
            # Put them in a dictionary
            top_features_dict[params[count]] = top_features_list
            
            # Put the top features in an array
            for x in range(len(top_features)):
                top_features_arr.append(top_features[x])
            
            # Let's save our model
            joblib.dump(clf, 'SVC/' + params[count] + ".pkl")

            count += 1

h_and_n_onegram_sr_nol_nopr
LinearSVC Accuracy Score:  0.8281068524970964
		Left           			Right          
	-4.2589	ap             		3.6417	reports        
	-3.6140	check          		3.1057	healthcare     
	-3.1920	reportedly     		2.7406	democrat       
	-2.9758	users          		2.6270	dems           
	-2.8478	repeatedly     		2.4777	hill           
	-2.7988	covid          		2.3439	according      
	-2.7550	host           		2.1988	ny             
	-2.7085	claimed        		1.8808	dem            
	-2.4370	suggested      		1.7949	politico       
	-2.3275	month          		1.7941	riots          
	-2.2959	warned         		1.7755	aoc            
	-2.2778	calif          		1.7290	fall           
	-2.1605	amid           		1.6489	gizzi          
	-2.1261	progressive    		1.5318	posted         
	-2.1192	yougov         		1.5150	wh             
	-2.0877	correction     		1.4964	ch             
	-2.0416	apparently     		1.4933	discunited     
	-2.0367	later          		1.4687	criticism      
	-2.0324

h_and_n_bigram_sr_nol_pr
LinearSVC Accuracy Score:  0.8461091753774681
		Left           			Right          
	-4.1357	ap             		3.8107	reports        
	-3.8701	washington ap  		3.1273	healthcare     
	-3.2671	check          		2.8195	dems           
	-2.9471	reportedly     		2.8174	hill           
	-2.8548	covid          		2.5412	democrat       
	-2.7532	claimed        		2.1770	ny             
	-2.5316	twitter users  		2.0869	according      
	-2.5313	repeatedly     		1.9111	hill reported  
	-2.5261	suggested      		1.8489	politico       
	-2.4267	warned         		1.6873	told fox       
	-2.3984	progressive    		1.6696	global coronavirus
	-2.3257	host           		1.6549	riots          
	-2.2890	users          		1.5803	tweeted        
	-2.2441	zelensky       		1.5792	news outlet    
	-2.2105	correction     		1.5688	mike bloomberg 
	-2.1639	yougov         		1.5500	according hill 
	-2.1342	month          		1.5282	virus          
	-1.9022	apparently     		1.5229	republican president
	-1

hs_and_nr_trigram_sr_nol_nopr
LinearSVC Accuracy Score:  0.8830188679245283
		Left           			Right          
	-3.6210	article originally		2.8114	democrat       
	-3.4356	originally appeared		2.7563	hill           
	-3.3641	article originally appeared		2.6519	reports        
	-3.1599	appeared       		2.5951	dems           
	-2.9207	originally     		2.4622	ap photo       
	-2.5547	writer         		2.1758	healthcare     
	-2.4598	check          		2.1250	ny             
	-2.3934	repeatedly     		1.8506	wuhan          
	-2.3292	month          		1.7892	rioters        
	-2.1889	democratic     		1.6649	alex           
	-2.1546	host           		1.5476	told fox       
	-2.1407	yougov         		1.5455	axios          
	-2.1367	reportedly     		1.5035	ca             
	-1.9626	users          		1.4994	communist      
	-1.9497	ap             		1.4791	riots          
	-1.9294	sponsored      		1.4781	leftist        
	-1.8843	image          		1.4644	woke           
	-1.8286	disease        		1.4312	joh

hsr_and_nrw_onegram_sr_nol_nopr
LinearSVC Accuracy Score:  0.8423728813559322
		Left           			Right          
	-4.2243	appeared       		2.1955	hill           
	-3.5206	originally     		2.1746	alex           
	-2.8412	host           		2.0333	democrat       
	-2.5138	article        		1.8866	media          
	-2.5075	writer         		1.8387	left           
	-2.2806	covid          		1.7973	healthcare     
	-1.9575	explained      		1.6604	wuhan          
	-1.9400	month          		1.6482	related        
	-1.8876	ky             		1.6260	aoc            
	-1.7963	users          		1.4585	dems           
	-1.7516	repeatedly     		1.4016	laar           
	-1.6832	calif          		1.3910	data           
	-1.6303	suggested      		1.3801	place          
	-1.6280	kind           		1.3679	inquiry        
	-1.6147	check          		1.3404	rioters        
	-1.6141	progressive    		1.2738	riots          
	-1.5935	care           		1.2611	leftist        
	-1.5907	read           		1.2439	borders        
	-1.

hsr_and_nrw_bigram_sr_nol_pr
LinearSVC Accuracy Score:  0.8652542372881356
		Left           			Right          
	-3.5796	appeared       		2.7112	democrat       
	-3.1130	article originally		1.8567	aoc            
	-2.8196	originally appeared		1.6417	told fox       
	-2.5411	host           		1.5318	wuhan          
	-2.4587	originally     		1.5174	media          
	-2.4216	twitter users  		1.5096	dems           
	-2.1707	writer         		1.4772	healthcare     
	-2.0739	covid          		1.4761	rioters        
	-1.9809	article        		1.4436	ap photo       
	-1.9633	right wing     		1.4090	riots          
	-1.8604	repeatedly     		1.3917	left           
	-1.8229	president joe  		1.3804	data           
	-1.7917	month          		1.3789	posts          
	-1.7840	check          		1.3417	leftist        
	-1.7609	progressive    		1.3334	hill           
	-1.7568	image          		1.3268	alex           
	-1.7256	ky             		1.3151	told politico  
	-1.5819	treatment      		1.3089	republican said


### Look at all possible unique features

In [107]:
# Using keep='false' in drop_duplicates is not appropriate here...
unique = np.array(top_features_arr)
unique_rows = np.unique(unique, axis=1)
final_features_df = pd.DataFrame(unique,
                   columns=['value', 'feature'])
final_features_df.head()

Unnamed: 0,value,feature
0,3.932500355554021,reports
1,3.823050335460375,reports
2,3.8161506727343366,reports
3,3.8107319715082952,reports
4,3.641659823684234,reports


In [112]:
# keep = False: if there's 2 occurences of a word it drops both entries.....it doesn't keep one.

# keep = first, drops all duplicates except for first occurence
# keep = False, drops all duplicates...does not keep even one for unique purposes
# keep = last, drops all duplicates except for the last occurence
final_features_df.drop_duplicates('feature', keep=False, inplace=True)
final_features_df.head()

Unnamed: 0,value,feature
214,1.5317889809668035,posted
289,1.4290840821701043,democratic gov
292,1.425683901180964,said said
304,1.4124845798896726,want
317,1.3973836859458693,activities


In [113]:
final_features_df[final_features_df['feature'].str.contains("reports")]

Unnamed: 0,value,feature


In [114]:
final_features_df[final_features_df['feature'].str.contains("healthcare")]

Unnamed: 0,value,feature


In [115]:
# Using keep = 'first' or keep='last' is appropriate...
unique = np.array(top_features_arr)
unique_rows = np.unique(unique, axis=1)
final_features_df = pd.DataFrame(unique,
                   columns=['value', 'feature'])
final_features_df.head()

Unnamed: 0,value,feature
0,3.932500355554021,reports
1,3.823050335460375,reports
2,3.8161506727343366,reports
3,3.8107319715082952,reports
4,3.641659823684234,reports


In [118]:
# keep = 'first': Seems to be what I need.

# keep = first, drops all duplicates except for first occurence
# keep = False, drops all duplicates...does not keep even one for unique purposes
# keep = last, drops all duplicates except for the last occurence
final_features_df.drop_duplicates('feature', keep='first', inplace=True)
final_features_df.head()

Unnamed: 0,value,feature
0,3.932500355554021,reports
6,3.2264103845891325,hill
7,3.224506154884109,healthcare
15,2.9691161560846884,dems
17,2.917305230892811,alex


In [119]:
final_features_df[final_features_df['feature'].str.contains("reports")]

Unnamed: 0,value,feature
0,3.932500355554021,reports
122,1.8330685405653704,news reports


In [120]:
final_features_df[final_features_df['feature'].str.contains("healthcare")]

Unnamed: 0,value,feature
7,3.224506154884109,healthcare


In [121]:
# Store the top features
left_features = []
right_features = []

for x in range(len(top_features_arr)):
    if top_features_arr[x][0] > 0:
        right_features.append([top_features_arr[x][0], top_features_arr[x][1]])
    else:
        left_features.append([top_features_arr[x][0], top_features_arr[x][1]])

In [122]:
# Left features
left_arr = np.array(left_features)
left = pd.DataFrame(left_arr, columns=['value', 'feature'])
left.drop_duplicates('feature', keep='last', inplace=True)
left.head()

Unnamed: 0,value,feature
2,-1.290704361244009,monday
4,-1.2916306738210863,harvey
9,-1.31755636169225,conference
11,-1.3190698040053763,central
17,-1.332989252352926,toll


In [123]:
# Right features
right_arr = np.array(right_features)
right = pd.DataFrame(right_arr, columns=['value', 'feature'])
right.drop_duplicates('feature', keep='first', inplace=True)
right.head()

Unnamed: 0,value,feature
0,3.932500355554021,reports
6,3.2264103845891325,hill
7,3.224506154884109,healthcare
15,2.9691161560846884,dems
17,2.917305230892811,alex


In [124]:
# All features
all_data = pd.concat([right, left])
all_data.head()

Unnamed: 0,value,feature
0,3.932500355554021,reports
6,3.2264103845891325,hill
7,3.224506154884109,healthcare
15,2.9691161560846884,dems
17,2.917305230892811,alex


In [125]:
# Some are seen twice on left and right here...
# Now keep=False is appropriate...as that is noise.

#for x in range(all_data.shape[0]):
#    print([all_data.value[x], all_data.feature[x]])
print(all_data.to_string())

                   value                      feature
0     3.9325003555540214                      reports
6     3.2264103845891325                         hill
7      3.224506154884109                   healthcare
15    2.9691161560846884                         dems
17     2.917305230892811                         alex
18    2.8604847412960104                     democrat
51    2.4622448392266563                     ap photo
57    2.3439219909390343                    according
58    2.3424998325442505                           ny
61    2.3039161767928875                        wuhan
69     2.227813824108311                hill reported
89    2.0437999069881134                         left
91     2.021687651886388                         mike
92     2.004913431265343                     politico
98    1.9525983346840972                     told fox
104   1.9073663927749793                          dem
106   1.8994594308070847                         fall
107   1.8865516633062396    

# Final Features

In [126]:
# Here, keep=False is actually useful.
# If a feature shows up in the left and right...it shouldn't be in either.
# That's due to articles being split 80/20 in a strange fashion 
# or 6 sources vs 4 or 2 could bring in more occurences of the word for that particular side.
# Ex: 'president donald' - frequency of -5.12 in left....4.82 in right.
# - This wouldn't make sense as this bigram can't appear 5 times more in left articles than right
# - and 4.8 times more in right articles than left at the same time.
all_data.drop_duplicates('feature', keep=False, inplace=True)
print(all_data.to_string())

                   value                      feature
0     3.9325003555540214                      reports
6     3.2264103845891325                         hill
7      3.224506154884109                   healthcare
15    2.9691161560846884                         dems
17     2.917305230892811                         alex
18    2.8604847412960104                     democrat
51    2.4622448392266563                     ap photo
57    2.3439219909390343                    according
58    2.3424998325442505                           ny
61    2.3039161767928875                        wuhan
69     2.227813824108311                hill reported
89    2.0437999069881134                         left
91     2.021687651886388                         mike
92     2.004913431265343                     politico
98    1.9525983346840972                     told fox
104   1.9073663927749793                          dem
106   1.8994594308070847                         fall
107   1.8865516633062396    