In [1]:
import pandas as pd
import numpy as np


# load dataset prepared in wrangle_news.py
news = pd.read_csv('news.csv', index_col=0)
news.head()

Unnamed: 0,label,content
0,left,WASHINGTON — Jefferson Beauregard Sessions ...
1,left,"On Wednesday, the Senate Environment and Publi..."
2,left,Nina Falcone has given up on cash. Whenever an...
3,left,MOSCOW — A gunman stormed a police station ...
4,left,The great American consumer is very much alive...


In [2]:
## IF-IDF Logistic regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

In [3]:
# Split in train test (default 0.25 test)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(news['content'],news['label'])

In [4]:
# Vectorize raw test data
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)

In [5]:
# Train LR classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
# Test model accuracy, 5x cross validation
x_test = vectorizer.transform(X_test_raw)
scores = cross_val_score(classifier, x_test, y_test, cv=5)
print(scores)

[0.82261087 0.798125   0.8325     0.824375   0.81801126]


In [7]:
# Do predictions
predictions = classifier.predict(x_test)

In [8]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

       left       0.88      0.84      0.86      3989
      right       0.85      0.88      0.86      4011

avg / total       0.86      0.86      0.86      8000



In [18]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [24]:
# Single prediction
text = X_test_raw[20583]
exemplar = vectorizer.transform([text]) # example string
exemplar_cat = classifier.predict(exemplar) # returns category
exemplar_prob = classifier.predict_proba(exemplar) # returns probability

if exemplar_prob[0][0] > exemplar_prob[0][1]:
    class_prob = exemplar_prob[0][0]
else: class_prob = exemplar_prob[0][1]

print('This transcript was classified as ' + exemplar_cat + ', with a probability of ' + str(class_prob))
print(text[1:280])

['This transcript was classified as right, with a probability of 0.9207456226331542']
The images are horrific. Who can forget the video of masked Islamic jihadists on a beach beheading Christians    executing them   for the world to see? And while there has been condemnation of the massacre of Christians and other religious minorities in the Middle East, there’s 


In [25]:
# Most important features
feature_array = np.array(vectorizer.get_feature_names())
tfidf_sorting = np.argsort(exemplar.toarray()).flatten()[::-1]

n = 3
top_n = feature_array[tfidf_sorting][:n]
print(top_n)

['genocide' 'the' 'christians']


In [64]:
## Save trained LR model
import pickle

file_Name = "classifier_pol"

fileObject = open(file_Name,'wb') 
pickle.dump(classifier,fileObject)   
fileObject.close()

In [None]:
# To open the file for reading
fileObject = open(file_Name,'rb')
classifier_pol_bias = pickle.load(fileObject)  
fileObject.close()