In [1]:
# Load all models from pickle files and see results.

# Challenges - will not be able to see features with their weights for RandomForestClassifier or VoteClassifier
# Will need to use LinearSVC to see the top words with their associated weights - use the "coef_" function
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import sklearn
import nltk
from sklearn import naive_bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from math import sqrt
import joblib
import string
import re
import sys
import datetime
import html
import os
import timeit
nltk.download('punkt')
nltk.download('wordnet')

# sklearn=0.23.1, pandas=1.0.1
print(sklearn.__version__)
print(pd.__version__)

0.23.1
1.0.1


[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load a model and test on some entries - Try testing on an entire dataset of another source's articles!
text_clf = joblib.load('h_and_n_trigram_sr_nol_nopr.pkl')
length_req = 800  # Character length requirement for articles
confidence_level = .53

left_source = ["Huffington", "Slate", "Salon", "TalkingPointsMemo", "Alternet", "Rawstory"]

print("Left Sources:")
for source in left_source:
    test_data = pd.read_excel("csvs/" + source + ".xlsx", 
                  names=["date", "article"])
    
    print(source + ":")
    print("We have {:,} records".format(test_data.shape[0]))
    
    test_data['date'].fillna("", inplace=True)
    test_data['article'].fillna("", inplace=True)

    for x in range(test_data.shape[0]):
        if len(test_data['article'][x]) < length_req:
            test_data.drop(x, inplace=True)

    print("We have {:,} records > {} characters long".format(test_data.shape[0], length_req))

    # Only keep the unique article rows and their values
    test_data.drop_duplicates("article", keep='first', inplace=True)

    print("{:,} Records are unique".format(
        test_data.shape[0]))

    test_data['date'] = test_data['date'].str.replace(',', '')
    
    # This is a more clean and thorough url decoding function for decoding any character string...
    test_data['article'] = test_data['article'].astype(str).apply(lambda x: html.unescape(x))

    #test_data.head()
    
    # Check for null values
    #print(test_data.isnull().sum(axis=0))
    
    # If there were null values, the below will replace them. - Sometimes dates are missing when transferred over.
    test_data['date'].fillna("", inplace=True)
    #print(test_data.isnull().sum(axis=0))
    
    ## All pole entries - Have a look at how confident the model is on each individual entry.
    ## Confidence level - State how many pole entries you want to see that the model has classified above a specific confidence level
    confident_entries = 0

    test_data['pole'] = 0  # Make a column 'pole', assign a value of 0 to indicate left articles

    # predicted - should be an array of the predictions of the model in order that the articles come in.
    predicted = text_clf.predict(test_data['article'])
    class_probabilities = text_clf.predict_proba(test_data['article'])

    for x in range(test_data.shape[0]):
        if class_probabilities[x][0] > confidence_level:
            confident_entries += 1

    ## Pole entries above the chosen confidence level
    print("Number of entries above %.2f confidence %s: %i / %i" %(confidence_level, "(see confidence_level)", confident_entries, test_data.shape[0]))

    # Accuracy
    acc = metrics.accuracy_score(test_data.pole, predicted)
    print("Entire dataset accuracy: {:.4f}".format(acc), end='\n\n')
    #print("Entire dataset accuracy:", acc, end='\n\n')

Left Sources:
Huffington:
We have 4,355 records
We have 4,314 records > 800 characters long
4,304 Records are unique
Number of entries above 0.53 confidence (see confidence_level): 4074 / 4304
Entire dataset accuracy: 0.9568

Slate:
We have 1,566 records
We have 1,498 records > 800 characters long
1,156 Records are unique
Number of entries above 0.53 confidence (see confidence_level): 869 / 1156
Entire dataset accuracy: 0.7837

Salon:
We have 2,303 records
We have 2,278 records > 800 characters long
1,987 Records are unique
Number of entries above 0.53 confidence (see confidence_level): 1644 / 1987
Entire dataset accuracy: 0.8546

TalkingPointsMemo:
We have 2,888 records
We have 2,820 records > 800 characters long
1,896 Records are unique
Number of entries above 0.53 confidence (see confidence_level): 1056 / 1896
Entire dataset accuracy: 0.5955

Alternet:
We have 963 records
We have 931 records > 800 characters long
929 Records are unique
Number of entries above 0.53 confidence (see co

In [3]:
# Now test on the right sources
right_source = ["NewsMax", "NationalReview", "Redstate", "TheBulwark", "WashingtonExaminer"]

print("Right Sources:")
for source in right_source:
    test_data = pd.read_excel("csvs/" + source + ".xlsx", 
                  names=["date", "article"])
    
    print(source + ":")
    print("We have {:,} records".format(test_data.shape[0]))
    
    # Check for null values
    #print(test_data.isnull().sum(axis=0))
    
    # If there were null values, the below will replace them. - Sometimes dates are missing when transferred over.
    test_data['date'].fillna("", inplace=True)
    test_data['article'].fillna("", inplace=True)
    #print(test_data.isnull().sum(axis=0))

    for x in range(test_data.shape[0]):
        if len(test_data['article'][x]) < length_req:
            test_data.drop(x, inplace=True)

    print("We have {:,} records > {} characters long".format(test_data.shape[0], length_req))

    # Only keep the unique article rows and their values
    test_data.drop_duplicates("article", keep='first', inplace=True)

    print("{:,} Records are unique".format(
        test_data.shape[0]))

    test_data['date'] = test_data['date'].str.replace(',', '')
    
    # This is a more clean and thorough url decoding function for decoding any character string...
    test_data['article'] = test_data['article'].astype(str).apply(lambda x: html.unescape(x))

    #test_data.head()
    
    ## All pole entries - Have a look at how confident the model is on each individual entry.
    ## Confidence level - State how many pole entries you want to see that the model has classified above a specific confidence level
    confident_entries = 0

    test_data['pole'] = 1  # Make a column 'pole', assign a value of 1 to indicate right articles

    # predicted - should be an array of the predictions of the model in order that the articles come in.
    predicted = text_clf.predict(test_data['article'])
    class_probabilities = text_clf.predict_proba(test_data['article'])

    for x in range(test_data.shape[0]):
        if class_probabilities[x][0] > confidence_level:
            confident_entries += 1

    ## Pole entries above the chosen confidence level
    print("Number of entries above %.2f confidence %s: %i / %i" %(confidence_level, "(see confidence_level)", confident_entries, test_data.shape[0]))

    # Accuracy
    acc = metrics.accuracy_score(test_data.pole, predicted)
    print("Entire dataset accuracy: {:.4f}".format(acc), end='\n\n')
    #print("Entire dataset accuracy:", acc, end='\n\n')

Right Sources:
NewsMax:
We have 5,829 records
We have 5,637 records > 800 characters long
5,588 Records are unique
Number of entries above 0.53 confidence (see confidence_level): 315 / 5588
Entire dataset accuracy: 0.9315

NationalReview:
We have 619 records
We have 605 records > 800 characters long
578 Records are unique
Number of entries above 0.53 confidence (see confidence_level): 299 / 578
Entire dataset accuracy: 0.4394

Redstate:
We have 2,270 records
We have 2,207 records > 800 characters long
2,111 Records are unique
Number of entries above 0.53 confidence (see confidence_level): 1170 / 2111
Entire dataset accuracy: 0.4017

TheBulwark:
We have 1,413 records
We have 1,405 records > 800 characters long
578 Records are unique
Number of entries above 0.53 confidence (see confidence_level): 388 / 578
Entire dataset accuracy: 0.2924

WashingtonExaminer:
We have 1,117 records
We have 1,104 records > 800 characters long
983 Records are unique
Number of entries above 0.53 confidence (s

In [29]:
# Access the classifier and CountVectorizer() in the Pipeline object from the .pkl file
try:
    clf = text_clf.named_steps['clf']
except:
    clf = text_clf.named_steps['eclf']
count_vect = text_clf.named_steps['vect']
print(clf)
print(count_vect)

VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
                              RandomForestClassifier(n_estimators=200,
                                                     random_state=0))],
                 voting='soft')
CountVectorizer(max_df=0.75, max_features=5000, min_df=4, ngram_range=(1, 3),
                stop_words='english')


In [30]:
# See all feature words - not too useful
# But can be...if you know some top ones from LinearSVC's.
feature_names = count_vect.get_feature_names()
print(feature_names)



### Display an accuracy report for all pickle files

In [31]:
pickle_files = ['h_and_n_onegram_sr_nol_nopr.pkl', 'h_and_n_onegram_sr_l_nopr.pkl',
                'h_and_n_bigram_sr_nol_nopr.pkl', 'h_and_n_bigram_sr_l_nopr.pkl',
                'h_and_n_trigram_sr_nol_nopr.pkl', 'h_and_n_trigram_sr_l_nopr.pkl',
                'hs_and_nr_onegram_sr_nol_nopr.pkl', 'hs_and_nr_onegram_sr_l_nopr.pkl',
                'hs_and_nr_bigram_sr_nol_nopr.pkl', 'hs_and_nr_bigram_sr_l_nopr.pkl',
                'hs_and_nr_trigram_sr_nol_nopr.pkl', 'hs_and_nr_trigram_sr_l_nopr.pkl',
                'hsr_and_nrw_onegram_sr_nol_nopr.pkl', 'hsr_and_nrw_onegram_sr_l_nopr.pkl',
                'hsr_and_nrw_bigram_sr_nol_nopr.pkl', 'hsr_and_nrw_bigram_sr_l_nopr.pkl',
                'hsr_and_nrw_trigram_sr_nol_nopr.pkl', 'hsr_and_nrw_trigram_sr_l_nopr.pkl',
                'h_and_n_onegram_sr_nol_pr.pkl', 'h_and_n_onegram_sr_l_pr.pkl',
                'h_and_n_bigram_sr_nol_pr.pkl', 'h_and_n_bigram_sr_l_pr.pkl',
                'h_and_n_trigram_sr_nol_pr.pkl', 'h_and_n_trigram_sr_l_pr.pkl',
                'hs_and_nr_onegram_sr_nol_pr.pkl', 'hs_and_nr_onegram_sr_l_pr.pkl',
                'hs_and_nr_bigram_sr_nol_pr.pkl', 'hs_and_nr_bigram_sr_l_pr.pkl',
                'hs_and_nr_trigram_sr_nol_pr.pkl', 'hs_and_nr_trigram_sr_l_pr.pkl',
                'hsr_and_nrw_onegram_sr_nol_pr.pkl', 'hsr_and_nrw_onegram_sr_l_pr.pkl',
                'hsr_and_nrw_bigram_sr_nol_pr.pkl', 'hsr_and_nrw_bigram_sr_l_pr.pkl',
                'hsr_and_nrw_trigram_sr_nol_pr.pkl', 'hsr_and_nrw_trigram_sr_l_pr.pkl']

In [32]:
# Test on all sources
confidence_level = .53

left_source = ["Huffington", "Slate", "Salon", "TalkingPointsMemo", "Alternet", "Rawstory"]
#left_test = zip(left_source, pickle_files) 
left_overall_acc = []
huff_acc = []
salon_acc = []
rawstory_acc = []

print("Left Sources - Testing against")
print("%45s  %s  %s  %s  %s  %s  %s" %(left_source[0], left_source[1], left_source[2], left_source[3], 
                                       left_source[4], left_source[5], "Overall (excluding TalkingPointsMemo)"))
for pickle in pickle_files:
    accuracys = []
    total_acc = 0
    for source in left_source:
        text_clf = joblib.load(pickle)
        test_data = pd.read_excel("csvs/" + source + ".xlsx", 
                      names=["date", "article"])

        # Check for null values
        test_data['date'].fillna("", inplace=True)
        test_data['article'].fillna("", inplace=True)

        for x in range(test_data.shape[0]):
            if len(test_data['article'][x]) < length_req:
                test_data.drop(x, inplace=True)

        # Only keep the unique article rows and their values
        test_data.drop_duplicates("article", keep='first', inplace=True)

        test_data['date'] = test_data['date'].str.replace(',', '')

        # This is a more clean and thorough url decoding function for decoding any character string...
        test_data['article'] = test_data['article'].astype(str).apply(lambda x: html.unescape(x))

        #test_data.head()

        # Check for null values
        #print(test_data.isnull().sum(axis=0))

        # If there were null values, the below will replace them. - Sometimes dates are missing when transferred over.
        test_data['date'].fillna("", inplace=True)
        #print(test_data.isnull().sum(axis=0))

        ## All pole entries - Have a look at how confident the model is on each individual entry.
        ## Confidence level - State how many pole entries you want to see that the model has classified above a specific confidence level
        confident_entries = 0

        test_data['pole'] = 0  # Make a column 'pole', assign a value of 0 to indicate left articles

        # predicted - should be an array of the predictions of the model in order that the articles come in.
        predicted = text_clf.predict(test_data['article'])
        class_probabilities = text_clf.predict_proba(test_data['article'])

        for x in range(test_data.shape[0]):
            if class_probabilities[x][0] > confidence_level:
                confident_entries += 1

        # Accuracy
        acc = metrics.accuracy_score(test_data.pole, predicted)
        #print("Entire dataset accuracy:", acc, end='\n\n')
        
        if (source == "Huffington"):
            huff_acc.append(acc*100)
        elif (source == "Salon"):
            salon_acc.append(acc*100)
        elif (source == "Rawstory"):
            rawstory_acc.append(acc*100)
        else:
            pass
        
        accuracys.append(acc*100)
    
    # Display pickle file used with accuracys
    for x in range(len(accuracys)):
        if x == 3:
            pass
        else:
            total_acc += accuracys[x]
    total_acc = total_acc / (len(accuracys) - 1)
    left_overall_acc.append(total_acc)
    print("%-35s%.3f      %.3f %.3f %5.3f         %10.3f    %.3f    %.3f" % (pickle, accuracys[0], accuracys[1], 
                                                                             accuracys[2], accuracys[3], accuracys[4], 
                                                                             accuracys[5], total_acc))

Left Sources - Testing against
                                   Huffington  Slate  Salon  TalkingPointsMemo  Alternet  Rawstory  Overall (excluding TalkingPointsMemo)
h_and_n_onegram_sr_nol_nopr.pkl    43.030      65.138 57.826 28.006             48.332    56.083    54.082
h_and_n_onegram_sr_l_nopr.pkl      93.262      73.183 81.178 53.586             63.940    74.806    77.274
h_and_n_bigram_sr_nol_nopr.pkl     96.050      79.758 86.613 58.439             67.169    79.465    81.811
h_and_n_bigram_sr_l_nopr.pkl       93.216      77.422 84.600 55.591             66.738    76.273    79.650
h_and_n_trigram_sr_nol_nopr.pkl    95.678      78.374 85.455 59.546             67.277    79.551    81.267
h_and_n_trigram_sr_l_nopr.pkl      41.380      58.824 62.104 28.903             44.349    47.196    50.770
hs_and_nr_onegram_sr_nol_nopr.pkl  85.362      56.920 99.396 49.262             60.388    72.649    74.943
hs_and_nr_onegram_sr_l_nopr.pkl    84.108      59.862 99.497 49.789             62

In [33]:
# Now test on the right sources
right_source = ["NewsMax", "NationalReview", "Redstate", "WashingtonExaminer", "TheBulwark"]
right_overall_acc = []
newsmax_acc = []
redstate_acc = []
washington_acc = []

print("Right Sources - Testing against")
print("%45s  %s  %s  %s  %s  %s" %(right_source[0], right_source[1], right_source[2], 
                                    right_source[3], right_source[4], "Overall (Excluding Bulwark)"))
for pickle in pickle_files:
    accuracys = []
    total_acc = 0
    for source in right_source:
        text_clf = joblib.load(pickle)
        test_data = pd.read_excel("csvs/" + source + ".xlsx", 
                      names=["date", "article"])

        # If there were null values, the below will replace them. - Sometimes dates are missing when transferred over.
        test_data['date'].fillna("", inplace=True)
        test_data['article'].fillna("", inplace=True)
        #print(test_data.isnull().sum(axis=0))

        for x in range(test_data.shape[0]):
            if len(test_data['article'][x]) < length_req:
                test_data.drop(x, inplace=True)

        # Only keep the unique article rows and their values
        test_data.drop_duplicates("article", keep='first', inplace=True)

        test_data['date'] = test_data['date'].str.replace(',', '')

        # This is a more clean and thorough url decoding function for decoding any character string...
        test_data['article'] = test_data['article'].astype(str).apply(lambda x: html.unescape(x))

        ## All pole entries - Have a look at how confident the model is on each individual entry.
        ## Confidence level - State how many pole entries you want to see that the model has classified above a specific confidence level
        confident_entries = 0

        test_data['pole'] = 1  # Make a column 'pole', assign a value of 1 to indicate right articles

        # predicted - should be an array of the predictions of the model in order that the articles come in.
        predicted = text_clf.predict(test_data['article'])
        class_probabilities = text_clf.predict_proba(test_data['article'])

        for x in range(test_data.shape[0]):
            if class_probabilities[x][0] > confidence_level:
                confident_entries += 1
                
        # Accuracy
        acc = metrics.accuracy_score(test_data.pole, predicted)
        #print("Entire dataset accuracy:", acc, end='\n\n')
    
        if (source == "NewsMax"):
            newsmax_acc.append(acc*100)
        elif (source == "Redstate"):
            redstate_acc.append(acc*100)
        elif (source == "WashingtonExaminer"):
            washington_acc.append(acc*100)
        else:
            pass
    
        accuracys.append(acc*100)
    
    # Display pickle file used with accuracys
    for x in range(len(accuracys) - 1):
        total_acc += accuracys[x]
    total_acc = total_acc / (len(accuracys) - 1)
    right_overall_acc.append(total_acc)
    print("%-35s   %.3f   %.3f          %.3f    %5.3f          %10.3f      %.3f" % (pickle, accuracys[0], accuracys[1], accuracys[2],
                                                                     accuracys[3], accuracys[4], total_acc))

Right Sources - Testing against
                                      NewsMax  NationalReview  Redstate  WashingtonExaminer  TheBulwark  Overall (Excluding Bulwark)
h_and_n_onegram_sr_nol_nopr.pkl       81.747   56.228          45.713    72.737              23.702      64.106
h_and_n_onegram_sr_l_nopr.pkl         93.074   48.962          43.297    63.683              30.796      62.254
h_and_n_bigram_sr_nol_nopr.pkl        93.218   45.675          37.944    63.886              25.260      60.181
h_and_n_bigram_sr_l_nopr.pkl          92.860   47.405          40.786    66.836              30.450      61.972
h_and_n_trigram_sr_nol_nopr.pkl       93.146   43.945          40.171    64.191              29.239      60.363
h_and_n_trigram_sr_l_nopr.pkl         85.075   55.190          54.192    73.550              33.045      67.002
hs_and_nr_onegram_sr_nol_nopr.pkl     87.205   68.166          99.005    68.057              55.536      80.608
hs_and_nr_onegram_sr_l_nopr.pkl       86.382   63.6

### This cell starts accuracy across every source

In [57]:
final_results = []
print("Entire dataset accuracy per pickle file:")
for x in range(len(left_overall_acc)):
    total_accuracy = 0
    total_accuracy += (left_overall_acc[x] + right_overall_acc[x]) / 2
    print("%-35s %.3f" % (pickle_files[x], total_accuracy))
    final_results.append([pickle_files[x], total_accuracy])

Entire dataset accuracy per pickle file:
h_and_n_onegram_sr_nol_nopr.pkl     59.094
h_and_n_onegram_sr_l_nopr.pkl       69.764
h_and_n_bigram_sr_nol_nopr.pkl      70.996
h_and_n_bigram_sr_l_nopr.pkl        70.811
h_and_n_trigram_sr_nol_nopr.pkl     70.815
h_and_n_trigram_sr_l_nopr.pkl       58.886
hs_and_nr_onegram_sr_nol_nopr.pkl   77.776
hs_and_nr_onegram_sr_l_nopr.pkl     76.983
hs_and_nr_bigram_sr_nol_nopr.pkl    78.453
hs_and_nr_bigram_sr_l_nopr.pkl      78.046
hs_and_nr_trigram_sr_nol_nopr.pkl   78.042
hs_and_nr_trigram_sr_l_nopr.pkl     77.934
hsr_and_nrw_onegram_sr_nol_nopr.pkl 83.469
hsr_and_nrw_onegram_sr_l_nopr.pkl   82.596
hsr_and_nrw_bigram_sr_nol_nopr.pkl  83.348
hsr_and_nrw_bigram_sr_l_nopr.pkl    82.675
hsr_and_nrw_trigram_sr_nol_nopr.pkl 83.729
hsr_and_nrw_trigram_sr_l_nopr.pkl   82.522
h_and_n_onegram_sr_nol_pr.pkl       70.855
h_and_n_onegram_sr_l_pr.pkl         69.163
h_and_n_bigram_sr_nol_pr.pkl        69.260
h_and_n_bigram_sr_l_pr.pkl          70.302
h_and_n_trigr

In [58]:
print("Sorted final results:")
final_results.sort(key=lambda x: x[1], reverse=True)
for x in range(len(final_results)):
    print("%-35s %.3f" % (final_results[x][0], final_results[x][1]))

Sorted final results:
hsr_and_nrw_trigram_sr_nol_nopr.pkl 83.729
hsr_and_nrw_onegram_sr_nol_nopr.pkl 83.469
hsr_and_nrw_bigram_sr_nol_nopr.pkl  83.348
hsr_and_nrw_bigram_sr_l_nopr.pkl    82.675
hsr_and_nrw_onegram_sr_l_nopr.pkl   82.596
hsr_and_nrw_trigram_sr_l_nopr.pkl   82.522
hsr_and_nrw_onegram_sr_nol_pr.pkl   81.099
hsr_and_nrw_bigram_sr_nol_pr.pkl    80.947
hsr_and_nrw_trigram_sr_nol_pr.pkl   80.853
hsr_and_nrw_onegram_sr_l_pr.pkl     79.423
hs_and_nr_bigram_sr_nol_nopr.pkl    78.453
hs_and_nr_bigram_sr_l_nopr.pkl      78.046
hs_and_nr_trigram_sr_nol_nopr.pkl   78.042
hs_and_nr_trigram_sr_l_nopr.pkl     77.934
hs_and_nr_onegram_sr_nol_nopr.pkl   77.776
hs_and_nr_bigram_sr_nol_pr.pkl      77.064
hs_and_nr_onegram_sr_l_nopr.pkl     76.983
hs_and_nr_trigram_sr_nol_pr.pkl     76.857
hs_and_nr_onegram_sr_nol_pr.pkl     76.314
hs_and_nr_onegram_sr_l_pr.pkl       75.757
hs_and_nr_bigram_sr_l_pr.pkl        75.729
hs_and_nr_trigram_sr_l_pr.pkl       75.525
h_and_n_bigram_sr_nol_nopr.pkl  

In [59]:
# Final results according to n-grams.
pickle_results = {}
onegram = []
bigram = []
trigram = []
onegram_overall = 0
bigram_overall = 0
trigram_overall = 0
total_one = 0
total_bi = 0
total_tri = 0

# Separate the results according to gram usage.
for x in range(len(final_results)):
    if "onegram" in final_results[x][0]:
        onegram.append(final_results[x][1])
    elif "bigram" in final_results[x][0]:
        bigram.append(final_results[x][1])
    else:
        trigram.append(final_results[x][1])

# Dictionary
#pickle_results["onegram"] = onegram
#pickle_results["bigram"] = bigram
#pickle_results["trigram"] = trigram
#for key, value in pickle_results.items():
#    if key == "onegram":
#        print(value)

for x in range(len(onegram)):
    total_one += onegram[x]
    total_bi += bigram[x]
    total_tri += trigram[x]
    
onegram_overall = total_one / len(onegram)
bigram_overall = total_bi / len(bigram)
trigram_overall = total_tri / len(trigram)

print("         onegram  bigram   trigram")
for x in range(len(onegram)):
    print("         %.3f   %.3f   %.3f  " % (onegram[x], bigram[x], trigram[x]))
print("\nOverall: %.3f   %.3f   %.3f  " % (onegram_overall, bigram_overall, trigram_overall))

         onegram  bigram   trigram
         83.469   83.348   83.729  
         82.596   82.675   82.522  
         81.099   80.947   80.853  
         79.423   78.453   78.042  
         77.776   78.046   77.934  
         76.983   77.064   76.857  
         76.314   75.729   75.525  
         75.757   70.996   70.815  
         70.855   70.811   69.798  
         69.764   70.302   69.044  
         69.163   69.260   58.886  
         59.094   59.619   55.925  

Overall: 75.191   74.771   73.327  


In [60]:
# See the difference between grams
pickle_files_by_gram = ['h_and_n_onegram_sr_nol_nopr.pkl', 'h_and_n_bigram_sr_nol_nopr.pkl', 'h_and_n_trigram_sr_nol_nopr.pkl',
                        'h_and_n_onegram_sr_l_nopr.pkl', 'h_and_n_bigram_sr_l_nopr.pkl', 'h_and_n_trigram_sr_l_nopr.pkl',
                        'hs_and_nr_onegram_sr_nol_nopr.pkl', 'hs_and_nr_bigram_sr_nol_nopr.pkl', 'hs_and_nr_trigram_sr_nol_nopr.pkl', 
                        'hs_and_nr_onegram_sr_l_nopr.pkl', 'hs_and_nr_bigram_sr_l_nopr.pkl', 'hs_and_nr_trigram_sr_l_nopr.pkl',
                        'hsr_and_nrw_onegram_sr_nol_nopr.pkl', 'hsr_and_nrw_bigram_sr_nol_nopr.pkl', 'hsr_and_nrw_trigram_sr_nol_nopr.pkl', 
                        'hsr_and_nrw_onegram_sr_l_nopr.pkl', 'hsr_and_nrw_bigram_sr_l_nopr.pkl', 'hsr_and_nrw_trigram_sr_l_nopr.pkl',
                        'h_and_n_onegram_sr_nol_pr.pkl', 'h_and_n_bigram_sr_nol_pr.pkl', 'h_and_n_trigram_sr_nol_pr.pkl', 
                        'h_and_n_onegram_sr_l_pr.pkl', 'h_and_n_bigram_sr_l_pr.pkl', 'h_and_n_trigram_sr_l_pr.pkl',
                        'hs_and_nr_onegram_sr_nol_pr.pkl', 'hs_and_nr_bigram_sr_nol_pr.pkl', 'hs_and_nr_trigram_sr_nol_pr.pkl', 
                        'hs_and_nr_onegram_sr_l_pr.pkl', 'hs_and_nr_bigram_sr_l_pr.pkl', 'hs_and_nr_trigram_sr_l_pr.pkl',
                        'hsr_and_nrw_onegram_sr_nol_pr.pkl', 'hsr_and_nrw_bigram_sr_nol_pr.pkl', 'hsr_and_nrw_trigram_sr_nol_pr.pkl', 
                        'hsr_and_nrw_onegram_sr_l_pr.pkl', 'hsr_and_nrw_bigram_sr_l_pr.pkl', 'hsr_and_nrw_trigram_sr_l_pr.pkl']

In [61]:
grams = []
one = 0
bi = 0
tri = 0
one_count = 0
bi_count = 0 
tri_count = 0
multiples = [2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 32, 35]

for param in pickle_files_by_gram:
    for x in range(len(final_results)):
        if param in final_results[x][0]:
            grams.append((final_results[x][0], final_results[x][1]))

for x in range(len(grams)):
    # Mutiples of 3...
    if (x % 3 == 0):
        if x == 0:
            one = grams[x][1]
        if one == 0:
            one = grams[x][1]
        print("\n%-35s   %.3f" % (grams[x][0], grams[x][1]))
    else:
        if bi == 0:
            bi = grams[x][1]
        if x in multiples:
            tri = grams[x][1]
        print("%-35s   %.3f" % (grams[x][0], grams[x][1]))
    
    # As x reaches a number in the list, clear variables and loop again
    if x in multiples:
        if (one > bi and one > tri):
            one_count += 1
        elif (bi > one and bi > tri):
            bi_count += 1
        else:
            tri_count += 1
        one = 0
        bi = 0
        tri = 0

print("\nOnegram > Bigram and Trigram accuracy:   %i / %i times" % (one_count, len(pickle_files_by_gram) / 3))
print("Bigram > Onegram and Bigram accuracy:    %i / %i times" % (bi_count, len(pickle_files_by_gram) / 3))
print("Trigram > Onegram and Trigram accuracy:  %i / %i times" % (tri_count, len(pickle_files_by_gram) / 3))


h_and_n_onegram_sr_nol_nopr.pkl       59.094
h_and_n_bigram_sr_nol_nopr.pkl        70.996
h_and_n_trigram_sr_nol_nopr.pkl       70.815

h_and_n_onegram_sr_l_nopr.pkl         69.764
h_and_n_bigram_sr_l_nopr.pkl          70.811
h_and_n_trigram_sr_l_nopr.pkl         58.886

hs_and_nr_onegram_sr_nol_nopr.pkl     77.776
hs_and_nr_bigram_sr_nol_nopr.pkl      78.453
hs_and_nr_trigram_sr_nol_nopr.pkl     78.042

hs_and_nr_onegram_sr_l_nopr.pkl       76.983
hs_and_nr_bigram_sr_l_nopr.pkl        78.046
hs_and_nr_trigram_sr_l_nopr.pkl       77.934

hsr_and_nrw_onegram_sr_nol_nopr.pkl   83.469
hsr_and_nrw_bigram_sr_nol_nopr.pkl    83.348
hsr_and_nrw_trigram_sr_nol_nopr.pkl   83.729

hsr_and_nrw_onegram_sr_l_nopr.pkl     82.596
hsr_and_nrw_bigram_sr_l_nopr.pkl      82.675
hsr_and_nrw_trigram_sr_l_nopr.pkl     82.522

h_and_n_onegram_sr_nol_pr.pkl         70.855
h_and_n_bigram_sr_nol_pr.pkl          69.260
h_and_n_trigram_sr_nol_pr.pkl         69.798

h_and_n_onegram_sr_l_pr.pkl           69.163
h_

In [62]:
# See the difference between lemmatization vs no lemmatization
pickle_files_by_lemmatization = ['h_and_n_onegram_sr_nol_nopr.pkl', 'h_and_n_onegram_sr_l_nopr.pkl',
                'h_and_n_bigram_sr_nol_nopr.pkl', 'h_and_n_bigram_sr_l_nopr.pkl',
                'h_and_n_trigram_sr_nol_nopr.pkl', 'h_and_n_trigram_sr_l_nopr.pkl',
                'hs_and_nr_onegram_sr_nol_nopr.pkl', 'hs_and_nr_onegram_sr_l_nopr.pkl',
                'hs_and_nr_bigram_sr_nol_nopr.pkl', 'hs_and_nr_bigram_sr_l_nopr.pkl',
                'hs_and_nr_trigram_sr_nol_nopr.pkl', 'hs_and_nr_trigram_sr_l_nopr.pkl',
                'hsr_and_nrw_onegram_sr_nol_nopr.pkl', 'hsr_and_nrw_onegram_sr_l_nopr.pkl',
                'hsr_and_nrw_bigram_sr_nol_nopr.pkl', 'hsr_and_nrw_bigram_sr_l_nopr.pkl',
                'hsr_and_nrw_trigram_sr_nol_nopr.pkl', 'hsr_and_nrw_trigram_sr_l_nopr.pkl',
                'h_and_n_onegram_sr_nol_pr.pkl', 'h_and_n_onegram_sr_l_pr.pkl',
                'h_and_n_bigram_sr_nol_pr.pkl', 'h_and_n_bigram_sr_l_pr.pkl',
                'h_and_n_trigram_sr_nol_pr.pkl', 'h_and_n_trigram_sr_l_pr.pkl',
                'hs_and_nr_onegram_sr_nol_pr.pkl', 'hs_and_nr_onegram_sr_l_pr.pkl',
                'hs_and_nr_bigram_sr_nol_pr.pkl', 'hs_and_nr_bigram_sr_l_pr.pkl',
                'hs_and_nr_trigram_sr_nol_pr.pkl', 'hs_and_nr_trigram_sr_l_pr.pkl',
                'hsr_and_nrw_onegram_sr_nol_pr.pkl', 'hsr_and_nrw_onegram_sr_l_pr.pkl',
                'hsr_and_nrw_bigram_sr_nol_pr.pkl', 'hsr_and_nrw_bigram_sr_l_pr.pkl',
                'hsr_and_nrw_trigram_sr_nol_pr.pkl', 'hsr_and_nrw_trigram_sr_l_pr.pkl']

In [63]:
lemmas = []
one = 0
bi = 0
one_count = 0
bi_count = 0 
multiples = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35]

for param in pickle_files_by_lemmatization:
    for x in range(len(final_results)):
        if param in final_results[x][0]:
            lemmas.append((final_results[x][0], final_results[x][1]))

for x in range(len(lemmas)):
    # Mutiples of 2...
    if (x % 2 == 0):
        if x == 0:
            one = lemmas[x][1]
        if one == 0:
            one = lemmas[x][1]
        print("\n%-35s   %.3f" % (lemmas[x][0], lemmas[x][1]))
    else:
        if bi == 0:
            bi = lemmas[x][1]
        print("%-35s   %.3f" % (lemmas[x][0], lemmas[x][1]))
        
    # As x reaches a number in the list, clear variables and loop again
    if x in multiples:
        if (one > bi):
            one_count += 1
        else:
            bi_count += 1
        one = 0
        bi = 0

print("\nNo Lemmatization > Lemmatization accuracy:   %i / %i times" % (one_count, len(pickle_files_by_lemmatization) / 2))
print("Lemmatization > No Lemmatization accuracy:   %i / %i times" % (bi_count, len(pickle_files_by_lemmatization) / 2))


h_and_n_onegram_sr_nol_nopr.pkl       59.094
h_and_n_onegram_sr_l_nopr.pkl         69.764

h_and_n_bigram_sr_nol_nopr.pkl        70.996
h_and_n_bigram_sr_l_nopr.pkl          70.811

h_and_n_trigram_sr_nol_nopr.pkl       70.815
h_and_n_trigram_sr_l_nopr.pkl         58.886

hs_and_nr_onegram_sr_nol_nopr.pkl     77.776
hs_and_nr_onegram_sr_l_nopr.pkl       76.983

hs_and_nr_bigram_sr_nol_nopr.pkl      78.453
hs_and_nr_bigram_sr_l_nopr.pkl        78.046

hs_and_nr_trigram_sr_nol_nopr.pkl     78.042
hs_and_nr_trigram_sr_l_nopr.pkl       77.934

hsr_and_nrw_onegram_sr_nol_nopr.pkl   83.469
hsr_and_nrw_onegram_sr_l_nopr.pkl     82.596

hsr_and_nrw_bigram_sr_nol_nopr.pkl    83.348
hsr_and_nrw_bigram_sr_l_nopr.pkl      82.675

hsr_and_nrw_trigram_sr_nol_nopr.pkl   83.729
hsr_and_nrw_trigram_sr_l_nopr.pkl     82.522

h_and_n_onegram_sr_nol_pr.pkl         70.855
h_and_n_onegram_sr_l_pr.pkl           69.163

h_and_n_bigram_sr_nol_pr.pkl          69.260
h_and_n_bigram_sr_l_pr.pkl            70.302

In [64]:
# See the difference between lemmatization vs no lemmatization
pickle_files_by_propernoun = ['h_and_n_onegram_sr_nol_nopr.pkl', 'h_and_n_onegram_sr_nol_pr.pkl',
                              'h_and_n_onegram_sr_l_nopr.pkl', 'h_and_n_onegram_sr_l_pr.pkl',
                              'h_and_n_bigram_sr_nol_nopr.pkl', 'h_and_n_bigram_sr_nol_pr.pkl',
                              'h_and_n_bigram_sr_l_nopr.pkl', 'h_and_n_bigram_sr_l_pr.pkl',
                              'h_and_n_trigram_sr_nol_nopr.pkl', 'h_and_n_trigram_sr_nol_pr.pkl',
                              'h_and_n_trigram_sr_l_nopr.pkl', 'h_and_n_trigram_sr_l_pr.pkl',
                              'hs_and_nr_onegram_sr_nol_nopr.pkl', 'hs_and_nr_onegram_sr_nol_pr.pkl',
                              'hs_and_nr_onegram_sr_l_nopr.pkl', 'hs_and_nr_onegram_sr_l_pr.pkl',
                              'hs_and_nr_bigram_sr_nol_nopr.pkl', 'hs_and_nr_bigram_sr_nol_pr.pkl',
                              'hs_and_nr_bigram_sr_l_nopr.pkl', 'hs_and_nr_bigram_sr_l_pr.pkl',
                              'hs_and_nr_trigram_sr_nol_nopr.pkl', 'hs_and_nr_trigram_sr_nol_pr.pkl',
                              'hs_and_nr_trigram_sr_l_nopr.pkl', 'hs_and_nr_trigram_sr_l_pr.pkl',
                              'hsr_and_nrw_onegram_sr_nol_nopr.pkl', 'hsr_and_nrw_onegram_sr_nol_pr.pkl',
                              'hsr_and_nrw_onegram_sr_l_nopr.pkl', 'hsr_and_nrw_onegram_sr_l_pr.pkl',
                              'hsr_and_nrw_bigram_sr_nol_nopr.pkl', 'hsr_and_nrw_bigram_sr_nol_pr.pkl',
                              'hsr_and_nrw_bigram_sr_l_nopr.pkl', 'hsr_and_nrw_bigram_sr_l_pr.pkl',
                              'hsr_and_nrw_trigram_sr_nol_nopr.pkl', 'hsr_and_nrw_trigram_sr_nol_pr.pkl',
                              'hsr_and_nrw_trigram_sr_l_nopr.pkl', 'hsr_and_nrw_trigram_sr_l_pr.pkl']         

In [65]:
propernoun = []
one = 0
bi = 0
one_count = 0
bi_count = 0 
multiples = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35]

for param in pickle_files_by_propernoun:
    for x in range(len(final_results)):
        if param in final_results[x][0]:
            propernoun.append((final_results[x][0], final_results[x][1]))

for x in range(len(propernoun)):
    # Mutiples of 2...
    if (x % 2 == 0):
        if x == 0:
            one = propernoun[x][1]
        if one == 0:
            one = propernoun[x][1]
        print("\n%-35s   %.3f" % (propernoun[x][0], propernoun[x][1]))
    else:
        if bi == 0:
            bi = propernoun[x][1]
        print("%-35s   %.3f" % (propernoun[x][0], propernoun[x][1]))
        
    # As x reaches a number in the list, clear variables and loop again
    if x in multiples:
        if (one > bi):
            one_count += 1
        else:
            bi_count += 1
        one = 0
        bi = 0

print("\nNo Proper Noun Removal > Proper Noun Removal accuracy:   %i / %i times" % (one_count, len(pickle_files_by_propernoun) / 2))
print("Proper Noun Removal > No Proper Noun Removal accuracy:   %i / %i times" % (bi_count, len(pickle_files_by_propernoun) / 2))


h_and_n_onegram_sr_nol_nopr.pkl       59.094
h_and_n_onegram_sr_nol_pr.pkl         70.855

h_and_n_onegram_sr_l_nopr.pkl         69.764
h_and_n_onegram_sr_l_pr.pkl           69.163

h_and_n_bigram_sr_nol_nopr.pkl        70.996
h_and_n_bigram_sr_nol_pr.pkl          69.260

h_and_n_bigram_sr_l_nopr.pkl          70.811
h_and_n_bigram_sr_l_pr.pkl            70.302

h_and_n_trigram_sr_nol_nopr.pkl       70.815
h_and_n_trigram_sr_nol_pr.pkl         69.798

h_and_n_trigram_sr_l_nopr.pkl         58.886
h_and_n_trigram_sr_l_pr.pkl           69.044

hs_and_nr_onegram_sr_nol_nopr.pkl     77.776
hs_and_nr_onegram_sr_nol_pr.pkl       76.314

hs_and_nr_onegram_sr_l_nopr.pkl       76.983
hs_and_nr_onegram_sr_l_pr.pkl         75.757

hs_and_nr_bigram_sr_nol_nopr.pkl      78.453
hs_and_nr_bigram_sr_nol_pr.pkl        77.064

hs_and_nr_bigram_sr_l_nopr.pkl        78.046
hs_and_nr_bigram_sr_l_pr.pkl          75.729

hs_and_nr_trigram_sr_nol_nopr.pkl     78.042
hs_and_nr_trigram_sr_nol_pr.pkl       76.857

In [66]:
# See the classifier each model uses.
for x in range(len(final_results)):
    text_clf = joblib.load(final_results[x][0])
    
    try:
        clf = text_clf.named_steps['clf']
        count_vect = text_clf.named_steps['vect']
        print("%s: %.3f" % (final_results[x][0], final_results[x][1]))
        print(clf)
        print("%s \n" % (count_vect))
    except:
        clf = text_clf.named_steps['eclf']
        count_vect = text_clf.named_steps['vect']
        print("%s: %.3f" % (final_results[x][0], final_results[x][1]))
        print(clf)
        print("%s \n" % (count_vect))

hsr_and_nrw_trigram_sr_nol_nopr.pkl: 83.729
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
                              RandomForestClassifier(n_estimators=200,
                                                     random_state=0))],
                 voting='soft')
CountVectorizer(max_df=0.75, max_features=5000, min_df=4, ngram_range=(1, 3),
                stop_words='english') 

hsr_and_nrw_onegram_sr_nol_nopr.pkl: 83.469
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
   

hs_and_nr_trigram_sr_l_nopr.pkl: 77.934
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
                              RandomForestClassifier(n_estimators=200,
                                                     random_state=0))],
                 voting='soft')
CountVectorizer(max_df=0.75, max_features=5000, min_df=4, ngram_range=(1, 3),
                stop_words='english') 

hs_and_nr_onegram_sr_nol_nopr.pkl: 77.776
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
         

h_and_n_trigram_sr_nol_pr.pkl: 69.798
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
                              RandomForestClassifier(n_estimators=200,
                                                     random_state=0))],
                 voting='soft')
CountVectorizer(max_df=0.75, max_features=5000, min_df=4, ngram_range=(1, 3),
                stop_words='english') 

h_and_n_onegram_sr_l_nopr.pkl: 69.764
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
               

In [67]:
# RandomForest and VotingClassifier - Have no coef_ function
# Therefore, you cannot see the frequency of each feature those classifiers are using
# You'll need to use the 2nd best classifier to see the frequency of the top features.
# Use LinearSVC classifier in Top_Features.ipynb (SGD - 3rd best)
text_clf = joblib.load(final_results[1][0])
clf = text_clf.named_steps['eclf']
count_vect = text_clf.named_steps['vect']
top_features = show_most_informative_features(count_vect, clf, 40)

AttributeError: 'VotingClassifier' object has no attribute 'coef_'

In [5]:
# Test against some new Fox News - Opinion articles
input_arr = ["2020-08-03, Deroy Murdock: Orwellian Democrats claim Portland's violence = peace Andy McCarthy provides insight to AG Bill testifying before the House judiciary committee NEW YORK — Denial is not just a river in Egypt, as the ancient joke goes. It’s the Democrat/media/Left’s comprehensive response to the nationwide riots that rage on, seven weeks after the funeral of police-brutality victim George Floyd. His memory has been kidnapped by the most violent insurrectionists America has seen since 1968. “Portland, Oregon is not out of control,” U.S. Rep. Earl Blumenauer, D-Ore., reassured his House colleagues on July 21. What about the projectiles, fires and explosions that have rocked that city since early June? If you have seen such things on TV, they must have been special effects. Or perhaps you have been experiencing acid flashbacks — a not-so-subtle reminder of your colorful youth. “Do you disavow the violence from Antifa that’s happening in Portland right now?” journalist Austen Fleccas asked Rep. Jerrold Nadler, D-N.Y., on Sunday. “That’s a myth that’s being spread only in Washington, D.C.,” Nadler replied. PORTLAND POLICE OVERTIME PAY EXCEEDED $5.3M SINCE PROTESTS STARTED Two days later, Nadler further explained Portland’s mythical instability. He excoriated William Barr at a hearing/verbal abuse session that the House Judiciary Committee arranged for the attorney general. Clearly enraged by the Trump administration’s surge of federal officers sent to Portland to combat the carnage that is not happening there, Nadler said, “The president wants footage for his campaign ads, and you appear to be serving it up to him as ordered.” The Judiciary chairman added: “Now you are projecting fear and violence nationwide in pursuit of obvious political objectives. Shame on you, Mr. Barr. Shame on you.” “The playbook is to create the impression that there is violence, that he must send in federal troops,” Rep. Zoe Lofgren, D-Calif., said at Tuesday’s inquisition. “And that is how he [President Trump] hopes to win the election.” “People are showing up because the troops are there,” added Lofgren. “Most of them are non-violent.” Of course, as a 13-term congresswoman, Lofgren knows the difference between troops, who are not there, and federal civilian officers, who are confronting the impression of violence.   “Most of the protests have been peaceful, Mr. Barr. You know that,” scolded Rep. Debbie Mucarsel-Powell, D-Fla. “In most of these cities, the protests had begun to wind down before you marched in and confronted the protesters.” Gov. Kate Brown, D-Ore., wrote Wednesday via Twitter that federal agents “have acted as an occupying force & brought violence.” Brown’s words echoed those of Portland's Democratic Mayor Ted Wheeler. As he wrote via Twitter on July 14, “my biggest immediate concern is the violence federal officers brought to our streets in recent days… We do not need or want their help.” But on July 3, Wheeler blamed others for his city’s chaos. “I remain deeply concerned, however, by groups who continue to perpetrate violence and vandalism on our streets,” Wheeler wrote via Twitter, a day before federal Homeland Security officers reached Portland. “This has been going on for more than a month now,” Wheeler continued, suggesting that Stumptown’s unrest began around June 3, four weeks before ”Trump’s troops” arrived and, as Leftist liars contend, magically transformed peaceful protesters into rioters, all so Trump would have scary villains to attack en route to reelection. Wheeler added, in consecutive Twitter messages on July 3: “Groups continue to target the Justice Center, threatening the safety of hundreds of inmates and employees inside...They continue to hurt small businesses owned by people of color, instill fear in communities of color, and start fires in buildings with people inside, in one specific case, even bolting emergency doors so that they could not escape.” Unlike House Democrats, who mainly told Barr to shut up, Republicans introduced a radical reform: They asked Barr questions and let him answer. “As far as the weapons you mentioned, let me get this straight,” said Rep. Steve Chabot, R-Ohio. He listed “rifles, explosives, knives, saws, sledgehammers, Tasers, slingshots, rocks, bricks, lasers. Have I missed anything?” “You have missed some things, but that’s a good list,” Barr replied. “They have these powerful slingshots with ball bearings that they shoot. They have used pellet guns, we believe. We have found those projectiles have penetrated Marshals to the bone. They use the lasers to blind the Marshals. They do start fires. They start fires, if they can get the fire inside or through the windows. And they start fires along the outside of the courthouse. When the Marshals come out to try to deal with the fires, they are assaulted.” “Federal courthouses are under attack,” Barr reminded the Committee on the Judiciary, no less. Seemingly exasperated with oblivious, or totally dishonest, Democrats, Barr wondered: “Since when is it OK to try to burn down a federal court? If someone went down the street to the Prettyman Court here, that beautiful courthouse we have right at the bottom of the Hill, and started breaking windows and firing industrial grade fireworks, and, to start a fire, throw kerosene balloons and start fires in the court. Is that OK? Is that OK now?”        Federal officers advance on demonstrators during a Black Lives Matter protest at the Mark O. Hatfield US Courthouse, July 25, 2020.<strong> </strong> (AP Photo/Marcio Jose Sanchez)        According to DHS, “Violent anarchists targeted surveillance cameras around the Hatfield Courthouse, rendering them inoperable.” Also on July 19-20, “The U.S. Marshals Service reported communications jamming last night, which may have caused significant problems with their radio communications.” The Associated Press’ Michael Balsamo embedded himself inside the Mark Hatfield U.S. Courthouse. His dispatches, via Twitter, are chilling. “I watched as injured officers were hauled inside. In one case, the commercial firework came over so fast the officer didn’t have time to respond. It burned through his sleeves & he had bloody gashes on both forearms. Another had a concussion from being hit in the head w/ a mortar.” Balsamo added: “The lights inside the courthouse have to be turned off for safety & the light from high-powered lasers bounced across the lobby almost all night. The fear is palpable. Three officers were struck in the last few weeks & still haven’t regained their vision.”   DHS’s deployment to Portland is not ritual chest-beating. It’s not toxic masculinity. It’s the law. According to 40 U.S. Code § 1315, the Secretary of Homeland Security “shall protect the buildings, grounds, and property that are owned, occupied, or secured by the Federal Government.” Acting Secretary Chad Wolf would break federal law if he left the courthouse undefended and let Antifa & Co. burn it to the ground. Of course, if the Hatfield Courthouse went up in smoke, the same people decrying President Trump’s supposed fascism would erupt like Klaxons: “Why didn’t he stop this? He was asleep at the switch! Wake up, Mr. President!”   Attorney General Barr had every reason to be perplexed by the radical Democrats’ institutional indifference toward these relentless onslaughts against federal personnel and what they are guarding: a palace of justice. “What makes me concerned for the country is this is the first time in my memory that the leaders of one of our great two political parties, the Democratic Party, are not coming out and condemning mob violence and the attack on federal courts,” Barr said. “Why can’t we just say violence against federal courts has to stop? Could we hear something like that?” ",       
"2020-08-03, Andrew McCarthy: Court rejection of Boston bomber’s death sentence seems based on hostility to death penalty  A couple of weeks back, when the Justice Department endeavored to restart executions of inmates sentenced to death by juries for unspeakable murders only to have federal judges (appointed by President Obama, in the main) throw up roadblocks, I repeated an observation I’ve made several times over the years. “Because much of the bench is hostile to the death penalty, judges are wont to fashion reasons not to impose it, some of which have nothing ostensibly to do with the death penalty and make prosecution of other types of criminals more difficult,” I wrote. We saw this again Friday. A federal appeals court in Boston threw out the death sentence of Dzhokar Tsarnaev, who brutally killed three people and injured more than 260 others when he and his late brother, Tamerlan, bombed the 2013 Boston Marathon. FEDERAL APPEALS COURT VACATES BOSTON MARATHON BOMBER DZHOKHAR TSARNAEV'S DEATH SENTENCE The three-judge panel consisted of two Obama appointees, Judges O. Rogeriee Thompson (who wrote the nearly 200-page opinion) and William J. Kayatta Jr., who formed the majority. A Reagan appointee, Juan R. Torruella, concurred in the result and much of the reasoning. Because of the decision’s girth, more time will be needed to study it. The upshot of the ruling, however, is that the trial judge failed to ensure that the Boston jury could be fair and impartial in light of all the prejudicial pretrial publicity. There is a strong suggestion that the trial judge should have granted a change of venue. This seems utterly unpersuasive to me. To start with, if there is grave doubt that Tsarnaev got a fair trial under the circumstances, then why does the court leave the bulk of his convictions undisturbed? A terrorist who bombs Boston is not going to be viewed with detachment and objectivity if he is instead tried in Philadelphia or Houston. The court does reverse three firearms convictions, but on technical legal grounds not because of jury prejudice. (Aside: Most Americans will be puzzled by the technical legal rationale, which leads to the court’s conclusion that Tsarnaev, a terrorist, was not engaged in a “crime of violence” while he was carrying a firearm.) Yet, the court takes pains to assure everyone that Tsarnaev “will remain confined to prison for the rest of his life.” The only remaining questions are whether the government will choose to re-try the death penalty phase of the case, and whether a new jury will unanimously vote for a capital sentence in a proceeding that the reviewing court — someday, years from now — decides passes its evolving standards of fairness. Why? If the jury was inflamed by unfair prejudice from the start, then why does the court believe Tsarnaev’s convictions should stand? That a minimum sentence of life imprisonment must stand? That only the death penalty must be revisited? I prosecuted terrorists in a courthouse that was a few blocks away from the World Trade Center that they had conspired to bomb. Our courtroom was similarly within easy walking distance of the FBI’s New York field office and the Holland Tunnel, which were also on the jihadists’ target list. To be sure, it was not a death penalty case, but the same issues of prejudicial pretrial publicity existed. The suggestion that it is not possible for a defendant to get a fair trial in the city he has terrorized is far-fetched.   The court intimates that the challenge of insulating a jury from publicity is more daunting today than it has ever been because the Internet and social media make publicity ubiquitous. To my mind, that undercuts the claim that changes of venue are warranted to ensure a fair trial. Domestic terrorist attacks are national stories. Obviously, people who live in a city that has been attacked stand a greater chance of knowing a victim of the attack, or of being personally affected by the fallout of the attack. But such jurors can easily be weeded out in a competent voir dire examination. Beyond that, nobody approves of terrorists. A terrorist who bombs Boston is not going to be viewed with detachment and objectivity if he is instead tried in Philadelphia or Houston. Moreover, the people in those cities are going to have been nearly as inundated by publicity about the atrocity as Bostonians.   In a criminal case, the issue with jurors is never whether they approve of egregious conduct. It is whether they can put aside their natural disapproval, figure out what factually happened, and faithfully apply the law as instructed by the judge. We’ll have to study the lengthy opinion. At first blush, though, it certainly appears that Friday’s ruling has at least as much to do with judicial hostility to capital punishment as to concerns about the due process implications of intense media coverage.",
"2020-08-03, Tucker Carlson: Equality under law is slipping away The Jeffrey Epstein case paints a picture of a justice system in which the rich and well-connected can do virtually whatever they want. With America’s institutions under relentless attack — and in some cases crumbling — it’s worth thinking through what we’d like to save from the ashes. When the revolution finally ends. what do we hope to have left? In other words, what are our best traditions? There are a lot of them. At the very top of the list is equality under the law. Equality is the most basic of all American ideals. It’s the very first principle articulated in the Declaration of Independence. It’s why the founders broke with England. In America, all citizens would be subject to the same rules: The same standards. The same penalties. Rich or poor. Black or White. All of us are equal under the law. TUCKER CARLSON: POLITICAL VIOLENCE IS AN ATTACK ON AMERICA ITSELF That’s the promise. It’s easier to explain than to achieve, of course. But we’ve tried hard. We should be proud of that. Yet some in power are no longer trying. Equality, the thing we’ve fought to keep for centuries, is slipping away. The Jeffrey Epstein case is the latest example. Thursday night, dozens of unsealed court documents from the Epstein case emerged online. They paint a picture of a justice system in which the rich and well-connected can do virtually whatever they want. In one sworn deposition, a woman called Virginia Giuffre claims that Epstein and others — including Prince Andrew of England and attorney Alan Dershowitz — sexually abused her as a minor. Giuffre says an FBI agent responded that the agency didn’t plan to do anything about it. Epstein’s case, he allegedly said, wasn’t “going anywhere” because of “the chain of command.” The documents then describe what appears to be a remarkable abuse of power. Epstein’s accusers claim that Alan Dershowitz and Prince Andrew helped Epstein beat federal charges for sex crimes in 2008. They also allege that Dershowitz crafted the immunity agreement so that he himself also wouldn’t face criminal prosecution. Dershowitz has denied all of this.    Alex Acosta will not be joining us. He’s the former Labor Department secretary who at the time was a federal prosecutor. Acosta is the one who agreed to the Epstein deal. When asked why he let a sex abuser skate, Acosta reportedly said: “I was told Epstein ‘belonged to intelligence’ and to leave it alone.” What does that mean exactly? What intelligence service did Jeffrey Epstein work for? Why did our government allow him to sexually abuse little girls? We deserve answers to those questions. But so far, no one is providing them. Epstein can’t tell us. He’s dead. The press, strangely, doesn’t seem very interested in finding the answers. It’s possible that’s because Epstein was close to a remarkable number of prominent Democratic politicians.   The new documents suggest that former president Bill Clinton visited Epstein’s private island in the Caribbean with two young girls. Giuffre claims Epstein made her have sex with former New Mexico Gov. Bill Richardson, former U.S. Sen. George Mitchell, and a famous Massachusetts Institute of Technology scientist. Is any of this true? We don’t know. Ghislaine Maxwell probably does. She’s in custody and set to go on trial. Will she explain what happened and why? Doesn’t look promising. Her lawyers fought the release of these documents."]

input_text = [input_arr[2]]
#input_text = tfidf_transformer.transform(input_text)

class_probabilities = text_clf.predict_proba(input_text)
print("Left? %f Right? %f" % (class_probabilities[0][0], class_probabilities[0][1]))
print(input_text)

Left? 0.099581 Right? 0.900419
['2020-08-03, Tucker Carlson: Equality under law is slipping away The Jeffrey Epstein case paints a picture of a justice system in which the rich and well-connected can do virtually whatever they want. With America’s institutions under relentless attack — and in some cases crumbling — it’s worth thinking through what we’d like to save from the ashes. When the revolution finally ends. what do we hope to have left? In other words, what are our best traditions? There are a lot of them. At the very top of the list is equality under the law. Equality is the most basic of all American ideals. It’s the very first principle articulated in the Declaration of Independence. It’s why the founders broke with England. In America, all citizens would be subject to the same rules: The same standards. The same penalties. Rich or poor. Black or White. All of us are equal under the law. TUCKER CARLSON: POLITICAL VIOLENCE IS AN ATTACK ON AMERICA ITSELF That’s the promise. It’

# Above: Testing against ALL sources gathered - most informative.
# Below: Testing against respective datasets of the: 2, 4, or 6 sources trained on

In [43]:
# Determine the model to use with the highest accuracy relating to its specific dataset
# So for h_and_n: This only compares the accuracy for testing between Huff and News
# hs_and_nr: This compares the accuracy across testing against 4 sources: Huff, Salon, News, Redstate
# hsr_and_nrw: Accuracy of testing against the 6 sources it trained on.

#pickle_files.sort()
final_results = []
count = 0

for x in range(len(pickle_files)):
    total_accuracy = 0
    
    if (count < 6 or (count >= 18 and count < 24)):
        total_accuracy += (huff_acc[x] + newsmax_acc[x]) / 2
        print("%-35s %.3f" % (pickle_files[x], total_accuracy))
        final_results.append([pickle_files[x], total_accuracy])
    elif ((count >= 6 and count < 12) or (count >= 24 and count < 30)):
        total_accuracy += (huff_acc[x] + newsmax_acc[x] + salon_acc[x] + redstate_acc[x]) / 4
        print("%-35s %.3f" % (pickle_files[x], total_accuracy))
        final_results.append([pickle_files[x], total_accuracy])
    else:
        total_accuracy += (huff_acc[x] + newsmax_acc[x] + salon_acc[x] + redstate_acc[x] + rawstory_acc[x] + washington_acc[x]) / 6
        print("%-35s %.3f" % (pickle_files[x], total_accuracy))
        final_results.append([pickle_files[x], total_accuracy])
    count += 1

h_and_n_onegram_sr_nol_nopr.pkl     62.388
h_and_n_onegram_sr_l_nopr.pkl       93.168
h_and_n_bigram_sr_nol_nopr.pkl      94.634
h_and_n_bigram_sr_l_nopr.pkl        93.038
h_and_n_trigram_sr_nol_nopr.pkl     94.412
h_and_n_trigram_sr_l_nopr.pkl       63.228
hs_and_nr_onegram_sr_nol_nopr.pkl   92.742
hs_and_nr_onegram_sr_l_nopr.pkl     92.200
hs_and_nr_bigram_sr_nol_nopr.pkl    93.189
hs_and_nr_bigram_sr_l_nopr.pkl      92.088
hs_and_nr_trigram_sr_nol_nopr.pkl   93.100
hs_and_nr_trigram_sr_l_nopr.pkl     92.019
hsr_and_nrw_onegram_sr_nol_nopr.pkl 91.606
hsr_and_nrw_onegram_sr_l_nopr.pkl   91.209
hsr_and_nrw_bigram_sr_nol_nopr.pkl  91.671
hsr_and_nrw_bigram_sr_l_nopr.pkl    91.465
hsr_and_nrw_trigram_sr_nol_nopr.pkl 92.011
hsr_and_nrw_trigram_sr_l_nopr.pkl   91.369
h_and_n_onegram_sr_nol_pr.pkl       90.792
h_and_n_onegram_sr_l_pr.pkl         89.197
h_and_n_bigram_sr_nol_pr.pkl        90.465
h_and_n_bigram_sr_l_pr.pkl          88.775
h_and_n_trigram_sr_nol_pr.pkl       90.559
h_and_n_tri

## Determine what .pkl file to load in each of the 6 different notebooks to illustrate best possible confusion matrix for that respective dataset of 2, 4, or 6 sources.

In [44]:
# Use these results to determine which .pkl file to use and showcase the
# confusion matrix for in each of the six training notebooks towards the end.
print("Sorted final results:")
final_results.sort(key=lambda x: x[1], reverse=True)
for x in range(len(final_results)):
    print("%-35s %.3f" % (final_results[x][0], final_results[x][1]))

Sorted final results:
h_and_n_bigram_sr_nol_nopr.pkl      94.634
h_and_n_trigram_sr_nol_nopr.pkl     94.412
hs_and_nr_bigram_sr_nol_nopr.pkl    93.189
h_and_n_onegram_sr_l_nopr.pkl       93.168
hs_and_nr_trigram_sr_nol_nopr.pkl   93.100
h_and_n_bigram_sr_l_nopr.pkl        93.038
hs_and_nr_onegram_sr_nol_nopr.pkl   92.742
hs_and_nr_onegram_sr_l_nopr.pkl     92.200
hs_and_nr_bigram_sr_l_nopr.pkl      92.088
hs_and_nr_trigram_sr_l_nopr.pkl     92.019
hsr_and_nrw_trigram_sr_nol_nopr.pkl 92.011
hsr_and_nrw_bigram_sr_nol_nopr.pkl  91.671
hsr_and_nrw_onegram_sr_nol_nopr.pkl 91.606
hsr_and_nrw_bigram_sr_l_nopr.pkl    91.465
hsr_and_nrw_trigram_sr_l_nopr.pkl   91.369
hsr_and_nrw_onegram_sr_l_nopr.pkl   91.209
h_and_n_onegram_sr_nol_pr.pkl       90.792
hs_and_nr_bigram_sr_nol_pr.pkl      90.701
h_and_n_trigram_sr_nol_pr.pkl       90.559
hs_and_nr_trigram_sr_nol_pr.pkl     90.522
hs_and_nr_onegram_sr_nol_pr.pkl     90.475
h_and_n_bigram_sr_nol_pr.pkl        90.465
hsr_and_nrw_trigram_sr_nol_pr.pk

In [45]:
# Sort by n-gram
pickle_results = {}
onegram = []
bigram = []
trigram = []
onegram_overall = 0
bigram_overall = 0
trigram_overall = 0
total_one = 0
total_bi = 0
total_tri = 0

# Separate the results according to gram usage.
for x in range(len(final_results)):
    if "onegram" in final_results[x][0]:
        onegram.append(final_results[x][1])
    elif "bigram" in final_results[x][0]:
        bigram.append(final_results[x][1])
    else:
        trigram.append(final_results[x][1])

# Dictionary
#pickle_results["onegram"] = onegram
#pickle_results["bigram"] = bigram
#pickle_results["trigram"] = trigram
#for key, value in pickle_results.items():
#    if key == "onegram":
#        print(value)

for x in range(len(onegram)):
    total_one += onegram[x]
    total_bi += bigram[x]
    total_tri += trigram[x]
onegram_overall = total_one / len(onegram)
bigram_overall = total_bi / len(bigram)
trigram_overall = total_tri / len(trigram)

print("         onegram  bigram   trigram")
for x in range(len(onegram)):
    print("         %.3f   %.3f   %.3f  " % (onegram[x], bigram[x], trigram[x]))
print("\nOverall: %.3f   %.3f   %.3f  " % (onegram_overall, bigram_overall, trigram_overall))

         onegram  bigram   trigram
         93.168   94.634   94.412  
         92.742   93.189   93.100  
         92.200   93.038   92.019  
         91.606   92.088   92.011  
         91.209   91.671   91.369  
         90.792   91.465   90.559  
         90.475   90.701   90.522  
         89.402   90.465   89.769  
         89.376   89.698   89.157  
         89.197   89.369   89.127  
         88.193   88.775   63.228  
         62.388   63.148   57.719  

Overall: 88.396   89.020   86.083  


In [46]:
# See the difference between grams
pickle_files_by_gram = ['h_and_n_onegram_sr_nol_nopr.pkl', 'h_and_n_bigram_sr_nol_nopr.pkl', 'h_and_n_trigram_sr_nol_nopr.pkl',
                        'h_and_n_onegram_sr_l_nopr.pkl', 'h_and_n_bigram_sr_l_nopr.pkl', 'h_and_n_trigram_sr_l_nopr.pkl',
                        'hs_and_nr_onegram_sr_nol_nopr.pkl', 'hs_and_nr_bigram_sr_nol_nopr.pkl', 'hs_and_nr_trigram_sr_nol_nopr.pkl', 
                        'hs_and_nr_onegram_sr_l_nopr.pkl', 'hs_and_nr_bigram_sr_l_nopr.pkl', 'hs_and_nr_trigram_sr_l_nopr.pkl',
                        'hsr_and_nrw_onegram_sr_nol_nopr.pkl', 'hsr_and_nrw_bigram_sr_nol_nopr.pkl', 'hsr_and_nrw_trigram_sr_nol_nopr.pkl', 
                        'hsr_and_nrw_onegram_sr_l_nopr.pkl', 'hsr_and_nrw_bigram_sr_l_nopr.pkl', 'hsr_and_nrw_trigram_sr_l_nopr.pkl',
                        'h_and_n_onegram_sr_nol_pr.pkl', 'h_and_n_bigram_sr_nol_pr.pkl', 'h_and_n_trigram_sr_nol_pr.pkl', 
                        'h_and_n_onegram_sr_l_pr.pkl', 'h_and_n_bigram_sr_l_pr.pkl', 'h_and_n_trigram_sr_l_pr.pkl',
                        'hs_and_nr_onegram_sr_nol_pr.pkl', 'hs_and_nr_bigram_sr_nol_pr.pkl', 'hs_and_nr_trigram_sr_nol_pr.pkl', 
                        'hs_and_nr_onegram_sr_l_pr.pkl', 'hs_and_nr_bigram_sr_l_pr.pkl', 'hs_and_nr_trigram_sr_l_pr.pkl',
                        'hsr_and_nrw_onegram_sr_nol_pr.pkl', 'hsr_and_nrw_bigram_sr_nol_pr.pkl', 'hsr_and_nrw_trigram_sr_nol_pr.pkl', 
                        'hsr_and_nrw_onegram_sr_l_pr.pkl', 'hsr_and_nrw_bigram_sr_l_pr.pkl', 'hsr_and_nrw_trigram_sr_l_pr.pkl']

In [47]:
grams = []
one = 0
bi = 0
tri = 0
one_count = 0
bi_count = 0 
tri_count = 0
multiples = [2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 32, 35]

for param in pickle_files_by_gram:
    for x in range(len(final_results)):
        if param in final_results[x][0]:
            grams.append((final_results[x][0], final_results[x][1]))

for x in range(len(grams)):
    # Mutiples of 3...
    if (x % 3 == 0):
        if x == 0:
            one = grams[x][1]
        if one == 0:
            one = grams[x][1]
        print("\n%-35s   %.3f" % (grams[x][0], grams[x][1]))
    else:
        if bi == 0:
            bi = grams[x][1]
        if x in multiples:
            tri = grams[x][1]
        print("%-35s   %.3f" % (grams[x][0], grams[x][1]))
    
    # As x reaches a number in the list, clear variables and loop again
    if x in multiples:
        if (one > bi and one > tri):
            one_count += 1
        elif (bi > one and bi > tri):
            bi_count += 1
        else:
            tri_count += 1
        one = 0
        bi = 0
        tri = 0

print("\nOnegram > Bigram and Trigram accuracy:   %i / %i times" % (one_count, len(pickle_files_by_gram) / 3))
print("Bigram > Onegram and Bigram accuracy:    %i / %i times" % (bi_count, len(pickle_files_by_gram) / 3))
print("Trigram > Onegram and Trigram accuracy:  %i / %i times" % (tri_count, len(pickle_files_by_gram) / 3))


h_and_n_onegram_sr_nol_nopr.pkl       62.388
h_and_n_bigram_sr_nol_nopr.pkl        94.634
h_and_n_trigram_sr_nol_nopr.pkl       94.412

h_and_n_onegram_sr_l_nopr.pkl         93.168
h_and_n_bigram_sr_l_nopr.pkl          93.038
h_and_n_trigram_sr_l_nopr.pkl         63.228

hs_and_nr_onegram_sr_nol_nopr.pkl     92.742
hs_and_nr_bigram_sr_nol_nopr.pkl      93.189
hs_and_nr_trigram_sr_nol_nopr.pkl     93.100

hs_and_nr_onegram_sr_l_nopr.pkl       92.200
hs_and_nr_bigram_sr_l_nopr.pkl        92.088
hs_and_nr_trigram_sr_l_nopr.pkl       92.019

hsr_and_nrw_onegram_sr_nol_nopr.pkl   91.606
hsr_and_nrw_bigram_sr_nol_nopr.pkl    91.671
hsr_and_nrw_trigram_sr_nol_nopr.pkl   92.011

hsr_and_nrw_onegram_sr_l_nopr.pkl     91.209
hsr_and_nrw_bigram_sr_l_nopr.pkl      91.465
hsr_and_nrw_trigram_sr_l_nopr.pkl     91.369

h_and_n_onegram_sr_nol_pr.pkl         90.792
h_and_n_bigram_sr_nol_pr.pkl          90.465
h_and_n_trigram_sr_nol_pr.pkl         90.559

h_and_n_onegram_sr_l_pr.pkl           89.197
h_

In [48]:
# See the difference between lemmatization vs no lemmatization
pickle_files_by_lemmatization = ['h_and_n_onegram_sr_nol_nopr.pkl', 'h_and_n_onegram_sr_l_nopr.pkl',
                'h_and_n_bigram_sr_nol_nopr.pkl', 'h_and_n_bigram_sr_l_nopr.pkl',
                'h_and_n_trigram_sr_nol_nopr.pkl', 'h_and_n_trigram_sr_l_nopr.pkl',
                'hs_and_nr_onegram_sr_nol_nopr.pkl', 'hs_and_nr_onegram_sr_l_nopr.pkl',
                'hs_and_nr_bigram_sr_nol_nopr.pkl', 'hs_and_nr_bigram_sr_l_nopr.pkl',
                'hs_and_nr_trigram_sr_nol_nopr.pkl', 'hs_and_nr_trigram_sr_l_nopr.pkl',
                'hsr_and_nrw_onegram_sr_nol_nopr.pkl', 'hsr_and_nrw_onegram_sr_l_nopr.pkl',
                'hsr_and_nrw_bigram_sr_nol_nopr.pkl', 'hsr_and_nrw_bigram_sr_l_nopr.pkl',
                'hsr_and_nrw_trigram_sr_nol_nopr.pkl', 'hsr_and_nrw_trigram_sr_l_nopr.pkl',
                'h_and_n_onegram_sr_nol_pr.pkl', 'h_and_n_onegram_sr_l_pr.pkl',
                'h_and_n_bigram_sr_nol_pr.pkl', 'h_and_n_bigram_sr_l_pr.pkl',
                'h_and_n_trigram_sr_nol_pr.pkl', 'h_and_n_trigram_sr_l_pr.pkl',
                'hs_and_nr_onegram_sr_nol_pr.pkl', 'hs_and_nr_onegram_sr_l_pr.pkl',
                'hs_and_nr_bigram_sr_nol_pr.pkl', 'hs_and_nr_bigram_sr_l_pr.pkl',
                'hs_and_nr_trigram_sr_nol_pr.pkl', 'hs_and_nr_trigram_sr_l_pr.pkl',
                'hsr_and_nrw_onegram_sr_nol_pr.pkl', 'hsr_and_nrw_onegram_sr_l_pr.pkl',
                'hsr_and_nrw_bigram_sr_nol_pr.pkl', 'hsr_and_nrw_bigram_sr_l_pr.pkl',
                'hsr_and_nrw_trigram_sr_nol_pr.pkl', 'hsr_and_nrw_trigram_sr_l_pr.pkl']

In [49]:
lemmas = []
one = 0
bi = 0
one_count = 0
bi_count = 0 
multiples = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35]

for param in pickle_files_by_lemmatization:
    for x in range(len(final_results)):
        if param in final_results[x][0]:
            lemmas.append((final_results[x][0], final_results[x][1]))

for x in range(len(lemmas)):
    # Mutiples of 2...
    if (x % 2 == 0):
        if x == 0:
            one = lemmas[x][1]
        if one == 0:
            one = lemmas[x][1]
        print("\n%-35s   %.3f" % (lemmas[x][0], lemmas[x][1]))
    else:
        if bi == 0:
            bi = lemmas[x][1]
        print("%-35s   %.3f" % (lemmas[x][0], lemmas[x][1]))
        
    # As x reaches a number in the list, clear variables and loop again
    if x in multiples:
        if (one > bi):
            one_count += 1
        else:
            bi_count += 1
        one = 0
        bi = 0

print("\nNo Lemmatization > Lemmatization accuracy:   %i / %i times" % (one_count, len(pickle_files_by_lemmatization) / 2))
print("Lemmatization > No Lemmatization accuracy:   %i / %i times" % (bi_count, len(pickle_files_by_lemmatization) / 2))


h_and_n_onegram_sr_nol_nopr.pkl       62.388
h_and_n_onegram_sr_l_nopr.pkl         93.168

h_and_n_bigram_sr_nol_nopr.pkl        94.634
h_and_n_bigram_sr_l_nopr.pkl          93.038

h_and_n_trigram_sr_nol_nopr.pkl       94.412
h_and_n_trigram_sr_l_nopr.pkl         63.228

hs_and_nr_onegram_sr_nol_nopr.pkl     92.742
hs_and_nr_onegram_sr_l_nopr.pkl       92.200

hs_and_nr_bigram_sr_nol_nopr.pkl      93.189
hs_and_nr_bigram_sr_l_nopr.pkl        92.088

hs_and_nr_trigram_sr_nol_nopr.pkl     93.100
hs_and_nr_trigram_sr_l_nopr.pkl       92.019

hsr_and_nrw_onegram_sr_nol_nopr.pkl   91.606
hsr_and_nrw_onegram_sr_l_nopr.pkl     91.209

hsr_and_nrw_bigram_sr_nol_nopr.pkl    91.671
hsr_and_nrw_bigram_sr_l_nopr.pkl      91.465

hsr_and_nrw_trigram_sr_nol_nopr.pkl   92.011
hsr_and_nrw_trigram_sr_l_nopr.pkl     91.369

h_and_n_onegram_sr_nol_pr.pkl         90.792
h_and_n_onegram_sr_l_pr.pkl           89.197

h_and_n_bigram_sr_nol_pr.pkl          90.465
h_and_n_bigram_sr_l_pr.pkl            88.775

In [50]:
# See the difference between lemmatization vs no lemmatization
pickle_files_by_propernoun = ['h_and_n_onegram_sr_nol_nopr.pkl', 'h_and_n_onegram_sr_nol_pr.pkl',
                              'h_and_n_onegram_sr_l_nopr.pkl', 'h_and_n_onegram_sr_l_pr.pkl',
                              'h_and_n_bigram_sr_nol_nopr.pkl', 'h_and_n_bigram_sr_nol_pr.pkl',
                              'h_and_n_bigram_sr_l_nopr.pkl', 'h_and_n_bigram_sr_l_pr.pkl',
                              'h_and_n_trigram_sr_nol_nopr.pkl', 'h_and_n_trigram_sr_nol_pr.pkl',
                              'h_and_n_trigram_sr_l_nopr.pkl', 'h_and_n_trigram_sr_l_pr.pkl',
                              'hs_and_nr_onegram_sr_nol_nopr.pkl', 'hs_and_nr_onegram_sr_nol_pr.pkl',
                              'hs_and_nr_onegram_sr_l_nopr.pkl', 'hs_and_nr_onegram_sr_l_pr.pkl',
                              'hs_and_nr_bigram_sr_nol_nopr.pkl', 'hs_and_nr_bigram_sr_nol_pr.pkl',
                              'hs_and_nr_bigram_sr_l_nopr.pkl', 'hs_and_nr_bigram_sr_l_pr.pkl',
                              'hs_and_nr_trigram_sr_nol_nopr.pkl', 'hs_and_nr_trigram_sr_nol_pr.pkl',
                              'hs_and_nr_trigram_sr_l_nopr.pkl', 'hs_and_nr_trigram_sr_l_pr.pkl',
                              'hsr_and_nrw_onegram_sr_nol_nopr.pkl', 'hsr_and_nrw_onegram_sr_nol_pr.pkl',
                              'hsr_and_nrw_onegram_sr_l_nopr.pkl', 'hsr_and_nrw_onegram_sr_l_pr.pkl',
                              'hsr_and_nrw_bigram_sr_nol_nopr.pkl', 'hsr_and_nrw_bigram_sr_nol_pr.pkl',
                              'hsr_and_nrw_bigram_sr_l_nopr.pkl', 'hsr_and_nrw_bigram_sr_l_pr.pkl',
                              'hsr_and_nrw_trigram_sr_nol_nopr.pkl', 'hsr_and_nrw_trigram_sr_nol_pr.pkl',
                              'hsr_and_nrw_trigram_sr_l_nopr.pkl', 'hsr_and_nrw_trigram_sr_l_pr.pkl']         

In [51]:
propernoun = []
one = 0
bi = 0
one_count = 0
bi_count = 0 
multiples = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35]

for param in pickle_files_by_propernoun:
    for x in range(len(final_results)):
        if param in final_results[x][0]:
            propernoun.append((final_results[x][0], final_results[x][1]))

for x in range(len(propernoun)):
    # Mutiples of 2...
    if (x % 2 == 0):
        if x == 0:
            one = propernoun[x][1]
        if one == 0:
            one = propernoun[x][1]
        print("\n%-35s   %.3f" % (propernoun[x][0], propernoun[x][1]))
    else:
        if bi == 0:
            bi = propernoun[x][1]
        print("%-35s   %.3f" % (propernoun[x][0], propernoun[x][1]))
        
    # As x reaches a number in the list, clear variables and loop again
    if x in multiples:
        if (one > bi):
            one_count += 1
        else:
            bi_count += 1
        one = 0
        bi = 0

print("\nNo Proper Noun Removal > Proper Noun Removal accuracy:   %i / %i times" % (one_count, len(pickle_files_by_propernoun) / 2))
print("Proper Noun Removal > No Proper Noun Removal accuracy:   %i / %i times" % (bi_count, len(pickle_files_by_propernoun) / 2))


h_and_n_onegram_sr_nol_nopr.pkl       62.388
h_and_n_onegram_sr_nol_pr.pkl         90.792

h_and_n_onegram_sr_l_nopr.pkl         93.168
h_and_n_onegram_sr_l_pr.pkl           89.197

h_and_n_bigram_sr_nol_nopr.pkl        94.634
h_and_n_bigram_sr_nol_pr.pkl          90.465

h_and_n_bigram_sr_l_nopr.pkl          93.038
h_and_n_bigram_sr_l_pr.pkl            88.775

h_and_n_trigram_sr_nol_nopr.pkl       94.412
h_and_n_trigram_sr_nol_pr.pkl         90.559

h_and_n_trigram_sr_l_nopr.pkl         63.228
h_and_n_trigram_sr_l_pr.pkl           89.127

hs_and_nr_onegram_sr_nol_nopr.pkl     92.742
hs_and_nr_onegram_sr_nol_pr.pkl       90.475

hs_and_nr_onegram_sr_l_nopr.pkl       92.200
hs_and_nr_onegram_sr_l_pr.pkl         89.376

hs_and_nr_bigram_sr_nol_nopr.pkl      93.189
hs_and_nr_bigram_sr_nol_pr.pkl        90.701

hs_and_nr_bigram_sr_l_nopr.pkl        92.088
hs_and_nr_bigram_sr_l_pr.pkl          89.369

hs_and_nr_trigram_sr_nol_nopr.pkl     93.100
hs_and_nr_trigram_sr_nol_pr.pkl       90.522

In [52]:
# Access the classifier and CountVectorizer() in the Pipeline object from the .pkl file

for pickle in pickle_files_by_propernoun:
    text_clf = joblib.load(pickle)
    
    try:
        clf = text_clf.named_steps['clf']
        count_vect = text_clf.named_steps['vect']
        print(pickle)
        print(clf)
        print("%s \n" % (count_vect))
    except:
        clf = text_clf.named_steps['eclf']
        count_vect = text_clf.named_steps['vect']
        print(pickle)
        print(clf)
        print("%s \n" % (count_vect))

h_and_n_onegram_sr_nol_nopr.pkl
RandomForestClassifier(n_estimators=200, random_state=0)
CountVectorizer(max_df=0.75, max_features=5000, min_df=4, stop_words='english') 

h_and_n_onegram_sr_nol_pr.pkl
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
                              RandomForestClassifier(n_estimators=200,
                                                     random_state=0))],
                 voting='soft')
CountVectorizer(max_df=0.75, max_features=5000, min_df=4, stop_words='english') 

h_and_n_onegram_sr_l_nopr.pkl
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              Cal

hs_and_nr_onegram_sr_l_pr.pkl
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
                              RandomForestClassifier(n_estimators=200,
                                                     random_state=0))],
                 voting='soft')
CountVectorizer(max_df=0.75, max_features=5000, min_df=4, stop_words='english') 

hs_and_nr_bigram_sr_nol_nopr.pkl
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
                              RandomForestClassifier(n_estimator

hsr_and_nrw_bigram_sr_nol_pr.pkl
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
                              RandomForestClassifier(n_estimators=200,
                                                     random_state=0))],
                 voting='soft')
CountVectorizer(max_df=0.75, max_features=5000, min_df=4, ngram_range=(1, 2),
                stop_words='english') 

hsr_and_nrw_bigram_sr_l_nopr.pkl
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
                         

In [53]:
for x in range(len(final_results)):
    text_clf = joblib.load(final_results[x][0])
    
    try:
        clf = text_clf.named_steps['clf']
        count_vect = text_clf.named_steps['vect']
        print("%s: %.3f" % (final_results[x][0], final_results[x][1]))
        print(clf)
        print("%s \n" % (count_vect))
    except:
        clf = text_clf.named_steps['eclf']
        count_vect = text_clf.named_steps['vect']
        print("%s: %.3f" % (final_results[x][0], final_results[x][1]))
        print(clf)
        print("%s \n" % (count_vect))

h_and_n_bigram_sr_nol_nopr.pkl: 94.634
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
                              RandomForestClassifier(n_estimators=200,
                                                     random_state=0))],
                 voting='soft')
CountVectorizer(max_df=0.75, max_features=5000, min_df=4, ngram_range=(1, 2),
                stop_words='english') 

h_and_n_trigram_sr_nol_nopr.pkl: 94.412
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
            

hsr_and_nrw_bigram_sr_l_nopr.pkl: 91.465
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
                              RandomForestClassifier(n_estimators=200,
                                                     random_state=0))],
                 voting='soft')
CountVectorizer(max_df=0.75, max_features=5000, min_df=4, ngram_range=(1, 2),
                stop_words='english') 

hsr_and_nrw_trigram_sr_l_nopr.pkl: 91.369
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
        

h_and_n_onegram_sr_l_pr.pkl: 89.197
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
                              RandomForestClassifier(n_estimators=200,
                                                     random_state=0))],
                 voting='soft')
CountVectorizer(max_df=0.75, max_features=5000, min_df=4, stop_words='english') 

hs_and_nr_trigram_sr_l_pr.pkl: 89.157
VotingClassifier(estimators=[('svc',
                              CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0))),
                             ('sgd',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=0))),
                             ('rf',
                              RandomForestClassifier(

In [54]:
# Shows you the most frequently occurring words seen in text labelled as left or right
# This works on the most recent clf, but should relatively represent the same features used by each classifier
# so long as the parameters for min_df and max_df for each classifier are the same
def show_most_informative_features(vectorizer, clf, n=20):
    top_features = []
    feature_names = vectorizer.get_feature_names() # Get all the feature names that CountVectorizer() is using
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names)) # Put all feature names with their weights. Sort.
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1]) 
    print("\t\t%-15s\t\t\t%-15s" % ('Left', 'Right'))
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))    
        top_features.append([fn_1, coef_1, fn_2, coef_2])
    print()
    return top_features

In [55]:
# RandomForest and VotingClassifier - Have no coef_ function
# Therefore, you cannot see the frequency of each feature those classifiers are using
# You'll need to use the 2nd best classifier to see the frequency of the top features.
# Use LinearSVC classifier in Top_Features.ipynb (SGD - 3rd best)
text_clf = joblib.load(final_results[1][0])
clf = text_clf.named_steps['eclf']
count_vect = text_clf.named_steps['vect']
top_features = show_most_informative_features(count_vect, clf, 40)

AttributeError: 'VotingClassifier' object has no attribute 'coef_'