In [1]:
import pandas as pd
import numpy as np


# load dataset prepared in wrangle_news.py
news = pd.read_csv('/Users/daniel/Documents/Work/ProDev/Data Science/video_digest/text_classification_all/news.csv', index_col=0)
news.head()

Unnamed: 0,label,content
0,left,WASHINGTON — Jefferson Beauregard Sessions ...
1,left,"On Wednesday, the Senate Environment and Publi..."
2,left,Nina Falcone has given up on cash. Whenever an...
3,left,MOSCOW — A gunman stormed a police station ...
4,left,The great American consumer is very much alive...


In [2]:
## IF-IDF Logistic regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

In [3]:
# Split in train test (default 0.25 test)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(news['content'],news['label'])

In [50]:
# Vectorize raw test data
vectorizer = TfidfVectorizer(ngram_range = (1,2))
X_train = vectorizer.fit_transform(X_train_raw)

In [51]:
# Train LR classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [52]:
# Test model accuracy, 5x cross validation
x_test = vectorizer.transform(X_test_raw)
scores = cross_val_score(classifier, x_test, y_test, cv=5)
print(scores)

[0.82073704 0.811875   0.8225     0.82625    0.82363977]


In [53]:
# Do predictions
predictions = classifier.predict(x_test)

In [54]:
# report metrics
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

       left       0.87      0.86      0.87      3989
      right       0.86      0.88      0.87      4011

avg / total       0.87      0.87      0.87      8000



for vectorizer = TfidfVectorizer(ngram_range = (1,3)):

Scores
[0.81449094 0.80125    0.813125   0.824375   0.8217636 ]

Report
             precision    recall  f1-score   support

       left       0.82      0.89      0.85      3989
      right       0.88      0.80      0.84      4011

avg / total       0.85      0.84      0.84      8000

All defaults was 0.86, same with english stopwords. With max_features = 1000 it's .81
Best is ngrams 1-2 at .87, all else default

In [18]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [59]:
# Breitbart Coulter transcript
with open('transcript_bb_coulter.txt', 'r') as myfile:
    text=myfile.read().replace('\n', '')

In [61]:
# Single prediction
text = X_test_raw[20583]
exemplar = vectorizer.transform([text]) # example string
exemplar_cat = classifier.predict(exemplar) # returns category
exemplar_prob = classifier.predict_proba(exemplar) # returns probability

if exemplar_prob[0][0] > exemplar_prob[0][1]:
    class_prob = exemplar_prob[0][0]
else: class_prob = exemplar_prob[0][1]

print('This transcript was classified as politically ' + exemplar_cat + ', with a probability of ' + str(class_prob))
print(text[1:280])

['This transcript was classified as politically right, with a probability of 0.7226171835505644']
this is you know these Dreams by Masters they are idea for flipping an election to Trump is will take out these 13 Russians will put Facebook ads Annapolis way that will have will be putting our finger on on this election more than the entire media in the United States no match 


In [62]:
# Most important features
feature_array = np.array(vectorizer.get_feature_names())
tfidf_sorting = np.argsort(exemplar.toarray()).flatten()[::-1]

n = 3
top_n = feature_array[tfidf_sorting][:n]
print(top_n)

['the' 'on on' 'dominant on']


## Save/Load classifier

In [26]:
## Save trained LR model
import pickle

file_Name = "classifier_pol"

fileObject = open(file_Name,'wb') 
pickle.dump(classifier,fileObject)   
fileObject.close()

In [None]:
# To open the file for reading
fileObject = open(file_Name,'rb')
classifier_pol_bias = pickle.load(fileObject)  
fileObject.close()

In [30]:
indices = np.argsort(vectorizer.idf_)[::-1]
features = vectorizer.get_feature_names()
top_n = 10
top_features = [features[i] for i in indices[:top_n]]
print(top_features)

['ﬂed', 'keola', 'kentwood', 'kenyatta', 'kenzaburo', 'kenzie', 'kenzo', 'keohane', 'keokuk', 'keoni']


In [33]:
from collections import defaultdict

features_by_gram = defaultdict(list)
for f, w in zip(vectorizer.get_feature_names(), vectorizer.idf_):
    features_by_gram[len(f.split(' '))].append((f, w))
top_n = 2
for gram, features in features_by_gram.items():
    top_features = sorted(features, key=lambda x: x[1], reverse=True)[:top_n]
    top_features = [f[0] for f in top_features]
    #print '{}-gram top:'.format(gram), top_features