In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV

import os
import random
random.seed(1984)

Using TensorFlow backend.


In [2]:
data_dir = "../newspaper-data/"

In [3]:
train_df = pd.read_csv(data_dir + 'us_news_train.csv')
test_df = pd.read_csv(data_dir + 'us_news_test.csv')
validation_df = pd.read_csv(data_dir +'us_news_validation.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
train_df.head()

Unnamed: 0,outlet,outlet_url,datetime,url_orig,headline,description,author,domain,topic_tags,text,section,news_keywords,subsection,paywall,provider,ideology
0,cnbc,https://www.cnbc.com/,2019-03-15T09:00:46Z,https://www.cnbc.com/2019/03/15/iea-report-ven...,Venezuela's electricity crisis could trigger '...,A nationwide power failure in crisis-stricken ...,Sam Meredith,Oil,"WTI Crude (Apr'19), ICE Brent Crude (May'19), ...",A nationwide power failure in crisis-stricken ...,,"Venezuela, oil, electricity, IEA, report, OPEC...",,,,center
1,cnbc,https://www.cnbc.com/,2018-10-01T10:30:36Z,https://www.cnbc.com/2018/10/01/canadas-husky-...,Canada's Husky Energy offers to buy MEG Energy...,Canadian oil and gas producer Husky Energy sai...,CNBC,Energy,"Oil and Gas, Business, Energy, Canada, US: New...",Canadian oil and gas producer Husky Energy sai...,,"Husky Energy, MEG, mergers and acquisitions, M...",,,,center
2,cnbc,https://www.cnbc.com/,2018-08-16T13:15:32Z,https://www.cnbc.com/2018/08/16/travel-apps-to...,3 innovative new travel apps that save time an...,From the cheapest rideshare to 30 percent off ...,Jimmy Im,Make It - Life,"Tourism, Travel, Career advice, Entrepreneursh...",Millennials are more likely to increase vacati...,,"make it, hoteltonight, daily drop, jetlag, tim...",,,,center
3,thehill,https://www.thehill.com/,2018-05-03T19:09:42Z,http://thehill.com/homenews/administration/386...,EXCLUSIVE: Giuliani calls for Sessions to 'ste...,“I am waiting for the Attorney General to step...,Niall Stanage,Administration,"Russia Investigation, Rudy Giuliani, Michael C...",Rudy Giuliani called for Attorney General Jeff...,Homenews,,,,,center
4,bbcnews,https://www.bbc.co.uk/news,2018-07-16T14:36:28Z,https://www.bbc.co.uk/sport/golf/44850075,The Open 2018: Tiger Woods with late tee time ...,Tiger Woods receives a tee time of 15:21 BST f...,,Golf,,Three-time champion Tiger Woods has a late sta...,Sport,,,,,center


In [5]:
# simple label encoding of "ideology" variable 
train_df["ideology_code"] = train_df["ideology"].astype('category')
train_df.dtypes

outlet             object
outlet_url         object
datetime           object
url_orig           object
headline           object
description        object
author             object
domain             object
topic_tags         object
text               object
section            object
news_keywords      object
subsection         object
paywall            object
provider           object
ideology           object
ideology_code    category
dtype: object

In [6]:
train_df["ideology_code"] = train_df["ideology_code"].cat.codes

In [7]:
sentences = train_df['headline'].values
y = train_df['ideology_code'].values

In [8]:
sentences = sentences.astype('U')

In [9]:
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1984)

In [10]:
vectorizer = CountVectorizer(stop_words="english")
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)

In [11]:
X_train

<60000x32280 sparse matrix of type '<class 'numpy.int64'>'
	with 465974 stored elements in Compressed Sparse Row format>

---

In [20]:
# Make a decision tree and train
tree = DecisionTreeClassifier()

In [23]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

---

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Create the model with 20 trees
model = RandomForestClassifier(n_estimators=20,
                               bootstrap = True,
                               max_features = 'sqrt')

In [14]:
# Fit on training data
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [15]:
# Actual class predictions
rf_predictions = model.predict(X_test)
# Probabilities for each class
rf_probs = model.predict_proba(X_test)[:, 1]

In [20]:
n_nodes = []
max_depths = []

# Stats about the trees in random forest
for ind_tree in model.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)

In [21]:
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

Average number of nodes 64468
Average maximum depth 2824


In [22]:
# Training predictions (to demonstrate overfitting)
train_rf_predictions = model.predict(X_train)
train_rf_probs = model.predict_proba(X_train)[:, 1]

# Testing predictions (to determine performance)
rf_predictions = model.predict(X_test)
rf_probs = model.predict_proba(X_test)[:, 1]

In [23]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve

In [24]:
rf_predictions

array([0, 0, 0, ..., 1, 0, 2], dtype=int8)

In [25]:
y_test

array([1, 3, 4, ..., 3, 0, 4], dtype=int8)

In [26]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [27]:
print(confusion_matrix(y_test,rf_predictions))

[[2056  640  499  461  363]
 [ 737 1324  855  605  446]
 [ 707  924 1482  388  451]
 [ 532  629  552 1957  410]
 [ 451  441  412  377 2301]]


In [28]:
print(classification_report(y_test,rf_predictions,digits=4))

              precision    recall  f1-score   support

           0     0.4586    0.5116    0.4837      4019
           1     0.3345    0.3338    0.3341      3967
           2     0.3900    0.3750    0.3824      3952
           3     0.5166    0.4797    0.4975      4080
           4     0.5795    0.5779    0.5786      3982

   micro avg     0.4560    0.4560    0.4560     20000
   macro avg     0.4558    0.4556    0.4552     20000
weighted avg     0.4563    0.4560    0.4557     20000



In [30]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest
tree = model.estimators_[5]
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest
tree = model.estimators_[5]
# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')

ModuleNotFoundError: No module named 'pydot'

In [32]:
# Get numerical feature importances
importances = list(model.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

NameError: name 'feature_list' is not defined

---

In [56]:
def evaluate_model(predictions, probs, train_predictions, train_probs):
    """Compare machine learning model to baseline performance.
    Computes statistics and shows ROC curve."""
    
    baseline = {}
    
    baseline['recall'] = recall_score(y_test, 
                                     [1 for _ in range(len(y_test))], average="macro")
    baseline['precision'] = precision_score(y_test, 
                                      [1 for _ in range(len(y_test))], average="macro")
    baseline['roc'] = 0.5
    
    results = {}
    
    results['recall'] = recall_score(y_test, predictions, average="macro")
    results['precision'] = precision_score(y_test, predictions, average="macro")
    results['roc'] = roc_auc_score(y_test, probs, average="macro")
    
    train_results = {}
    train_results['recall'] = recall_score(y_train, train_predictions, average="macro")
    train_results['precision'] = precision_score(y_train, train_predictions, average="macro")
    train_results['roc'] = roc_auc_score(y_train, train_probs, average="macro")
    
    for metric in ['recall', 'precision', 'roc']:
        print(f'{metric.capitalize()} Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}')
    
    # Calculate false positive rates and true positive rates
    base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(test_labels))], average="macro")
    model_fpr, model_tpr, _ = roc_curve(y_test, probs, average="macro")

    plt.figure(figsize = (8, 6))
    plt.rcParams['font.size'] = 16
    
    # Plot both curves
    plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
    plt.plot(model_fpr, model_tpr, 'r', label = 'model')
    plt.legend();
    plt.xlabel('False Positive Rate'); 
    plt.ylabel('True Positive Rate'); plt.title('ROC Curves');
    plt.show();

In [57]:
evaluate_model(rf_predictions, rf_probs, train_rf_predictions, train_rf_probs)

  'precision', 'predicted', average, warn_for)


ValueError: multiclass format is not supported