# Box plots of error rates over 100 samples

In [13]:
import numpy as np
import pandas as pd
import csv
import os
import pprint
from timeit import timeit
from functools import partial
import seaborn as sbn
np.random.seed(12345)
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import RidgeCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
from random import random


In [14]:
#--- constants(do not change) ---
kFold = 10
iters = 5
cpus = 4

In [15]:
#--- data preparation utilities ---

pp = pprint.PrettyPrinter(indent=4)

# popularity detector
def popular(s):
    return 1 if s>1400 else 0

#drop url and time, standardize everything else
def prepare(data):
    data_no_url_time = data.drop(['url',' timedelta'],axis = 1)
    scaled_data = data_no_url_time.copy()
    for column in data_no_url_time.columns:
        if column!=' shares':
            scaled_data[column] = preprocessing.scale(data_no_url_time[column])
    scaled_data[' popularity'] = scaled_data[' shares'].apply(popular)
    std_data = scaled_data.drop([' shares'],axis=1)
    return std_data

#takes dataframes containing both predictor and the output and separates them
def make_train_test(train, test, lastColumnTrain, columnOutput):
    x_train = train.loc[:,:lastColumnTrain]
    y_train = train.loc[:,columnOutput]
    x_test = test.loc[:,:lastColumnTrain]
    y_test = test.loc[:,columnOutput]
    return x_train, y_train, x_test, y_test


In [16]:
# I/O utilities
def summary(cl, y_pred, y_test):
    print ("coefficients = ", cl.coef_)
    print ("intercepts = ", cl.intercept_)
    print ("iterations = ", cl.n_iter_)

def plot_misclassification(m):
    labels, data = [*zip(*m.items())]  # 'transpose' items to parallel key, value lists
    plt.boxplot(data)
    plt.xticks(range(1, len(labels) + 1), labels)
    plt.show()


    

In [17]:
#--- classifiers ---
alphas = np.logspace(-5,5,100)

def logistic_plain_cv(x_train, y_train, x_test):
    clf = LogisticRegressionCV(cv=kFold, 
                               max_iter=100000,
                               n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred_test = [1 if p > 0.5 else 0 for p in clf.predict(x_test)]
    y_pred_train = [1 if p > 0.5 else 0 for p in clf.predict(x_train)]
    return clf, y_pred_train, y_pred_test

def random_forest(x_train, y_train, x_test):
    clf = RandomForestClassifier(n_estimators=100,
                                n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred_test = [1 if p > 0.5 else 0 for p in clf.predict(x_test)]
    y_pred_train = [1 if p > 0.5 else 0 for p in clf.predict(x_train)]
    return clf, y_pred_train, y_pred_test

def lasso_cv(x_train, y_train, x_test):
    clf = LassoCV(alphas=alphas,
                  cv=kFold,
                  max_iter=100000)
    clf.fit(x_train, y_train)
    y_pred_test = [1 if p > 0.5 else 0 for p in clf.predict(x_test)]
    y_pred_train = [1 if p > 0.5 else 0 for p in clf.predict(x_train)]
    return clf, y_pred_train, y_pred_test

def elastic_net_cv(x_train, y_train, x_test):
    clf = ElasticNetCV(alphas=alphas,
                       cv=kFold,
                       max_iter=100000,
                       n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred_test = [1 if p > 0.5 else 0 for p in clf.predict(x_test)]
    y_pred_train = [1 if p > 0.5 else 0 for p in clf.predict(x_train)]
    return clf, y_pred_train, y_pred_test

def ridge_cv(x_train, y_train, x_test):
    clf = RidgeCV(alphas=alphas,
                  cv=kFold)
    clf.fit(x_train, y_train)
    y_pred_test = [1 if p > 0.5 else 0 for p in clf.predict(x_test)]
    y_pred_train = [1 if p > 0.5 else 0 for p in clf.predict(x_train)]
    return clf, y_pred_train, y_pred_test

def svc(x_train, y_train, x_test):
    parameter_space = [
        {'kernel': ['rbf'],
         'gamma': np.logspace(-1,2,10),
         'C': np.logspace(-1,2,10)}
    ]
    clf = GridSearchCV(SVC(),
                       parameter_space,
                       cv=kFold,
                       iid=True,
                       scoring='accuracy',
                       n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred_test = [1 if p > 0.5 else 0 for p in clf.predict(x_test)]
    y_pred_train = [1 if p > 0.5 else 0 for p in clf.predict(x_train)]
    return clf, y_pred_train, y_pred_test

In [18]:
#--- test models against fake data ---

def make_data_up(noise):
    data = {'predictor': [], 'output': []}
    for i in range(1000):
        r = random()
        v = True if r > 0.5 else False
        if random() < noise :
            v = not v
        data['predictor'].append(r)
        data['output'].append(1 if v else 0)
    made_up_data = pd.DataFrame(data)
    made_up_data['predictor'] = preprocessing.scale(made_up_data['predictor'])
    return made_up_data

def test_with_fake_data(data, classifier):
    p = data.loc[:,:'predictor'].columns.size
    n = len(data.index)
    learning_set_sizes = {'10p': 10*p, '50p': 50*p}
    misclassification_rates = {'10p': [], '50p': []}
    for iteration in range(iters):
        for size_case in learning_set_sizes:
            #print('-'*80)
            #print ("iteration: ", iteration+1, " of ", iters)
            #print("case: ", size_case)
            size = learning_set_sizes[size_case]
            #print("learning set size =", size)
            train_set,test_set = train_test_split(data, test_size= 1.0*size/n)
            x_train, y_train, x_test, y_test = make_train_test(train_set, test_set, 'predictor', 'output')
            clf, y_pred = classifier(x_train, y_train, x_test)
            #summary(clf, y_pred, y_test)
            misclassification_rates[size_case].append(1.0 - accuracy_score(y_test, y_pred))
            #print('-'*80)
    #print("Misclassification rates:", misclassification_rates)
    plot_misclassification(misclassification_rates)
    
fake_data = make_data_up(0.1)

# print("---RANDOM FOREST---")
# test_with_fake_data(fake_data, random_forest)
# print("---RADIAL SVM---")
# test_with_fake_data(fake_data, svc)
# print("---LOGISTIC CV---")
# test_with_fake_data(fake_data, logistic_plain_cv)
# print("---LOGISTIC LASSO CV---")
# test_with_fake_data(fake_data, lasso_cv)
# print("---LOGISTIC ELASTIC NET CV---")
# test_with_fake_data(fake_data, elastic_net_cv)
# print("---LOGISTIC RIDGE CV---")
# test_with_fake_data(fake_data, ridge_cv)

In [19]:
#--- engine to run model ---
results = {}
def run_model(data, name, classifier):
    lastPredictor = ' abs_title_sentiment_polarity'
    columnOutput = ' popularity'
    p = data.loc[:,:lastPredictor].columns.size
    n = len(data.index)
    learning_set_sizes = {'2p': 2*p, '10p': 10*p}
    misclassification_rates = {
        'train': {
            '2p': [],
            '10p': []
        },
        'test': {
            '2p': [],
            '10p': []
        }
    }
    optimal_params = {'2p': [], '10p': []}
    for iteration in range(iters):
        print(iteration)
        for size_case in learning_set_sizes:
            #print('-'*80)
            #print ("iteration: ", iteration+1, " of ", iters)
            #print("case: ", size_case)
            size = learning_set_sizes[size_case]
            train_set,test_set = train_test_split(data, test_size= 1.0*size/n)
            x_train, y_train, x_test, y_test = make_train_test(train_set, test_set, lastPredictor, columnOutput)
            clf, y_pred_train, y_pred_test = classifier(x_train, y_train, x_test)
            misclassification_rates['train'][size_case].append(1.0 - accuracy_score(y_train, y_pred_train))
            misclassification_rates['test'][size_case].append(1.0 - accuracy_score(y_test, y_pred_test))
            #print('-'*80)
    results[name] = misclassification_rates



In [20]:
data = pd.read_csv('OnlineNewsPopularity.csv') 
std_data=prepare(data.iloc[0:700])



In [21]:

config = {
         'RANDOM FOREST': random_forest,
         'LOGISTIC': logistic_plain_cv,
         'LASSO': lasso_cv,
         'ELASTIC NET': elastic_net_cv,
         'RIDGE': ridge_cv,
         'RADIAL SVM': svc,
   }

def run_with_timer(label, data, classifierCallable):
    print("\n---" + label + "---")
    r = partial(run_model, data, label, classifierCallable)
    print("time elapsed = ", timeit(r, number=1), "s")

for l in config:
   run_with_timer(l, std_data, config[l])
    
pp.pprint(results)

# restructured_results = {
# '2p':{
#     'train': {},
#     'test':{}
# },
#     '10p': {
#         'train': {},
#         'test': {}
#     }
# }

# for k in results:
#     restructured_results['2p']['train'][k] = results[k]['train']['2p']
#     restructured_results['2p']['test'][k] = results[k]['test']['2p']
#     restructured_results['10p']['train'][k] = results[k]['train']['10p']
#     restructured_results['10p']['test'][k] = results[k]['test']['10p']
    
# pp.pprint(restructured_results)



---RANDOM FOREST---
0
1
2
3
4
time elapsed =  3.6995514589999914 s

---LOGISTIC---
0
1
2
3
4
time elapsed =  4.211847442999996 s

---LASSO---
0
1
2
3
4
time elapsed =  2.809018715999997 s

---ELASTIC NET---
0
1
2
3
4
time elapsed =  2.1248623510000044 s

---RIDGE---
0




1




2




3




4




time elapsed =  28.11048624 s

---RADIAL SVM---
0
1
2
3
4
time elapsed =  53.71903432300002 s
{   'ELASTIC NET': {   'test': {   '10p': [   0.5241379310344827,
                                              0.4482758620689655,
                                              0.5189655172413793,
                                              0.4948275862068966,
                                              0.5103448275862069],
                                   '2p': [   0.4051724137931034,
                                             0.4482758620689655,
                                             0.3706896551724138,
                                             0.4482758620689655,
                                             0.5258620689655172]},
                       'train': {   '10p': [   0.44166666666666665,
                                               0.25,
                                               0.4666666666666667,
                                               0.46666666666

In [None]:
# err_2p = restructured_results['2p']
# err_10p = restructured_results['10p']

# def make_plot(errs,label, p):
#     #x
#     X = list(errs['train'].keys())
#     #y
#     Y = {k:errs[k].values() for k in errs}
#     #col
#     COL = list(errs.keys())
#     fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(9, 6))
#     # rectangular box plot
#     bplot1 = axes[0].boxplot(list(Y['train']),
#                          vert=True,   # vertical box aligmnent
#                          patch_artist=True)   # fill with color
#     axes[0].set_xlabel("Training errors", fontsize=14)
#     axes[0].set_ylabel('Misclassification rate', fontsize=14)

#     # notch shape box plot
#     bplot2 = axes[1].boxplot(list(Y['test']),
#                          vert=True,   # vertical box aligmnent
#                          patch_artist=True)   # fill with color
#     axes[1].set_xlabel("Test errors", fontsize=14)

#     # fill with colors
#     colors = ['pink', 'lightblue', 'lightgreen', 'red', 'orange', 'yellow']
#     for bplot in (bplot1, bplot2):
#         for patch, color in zip(bplot['boxes'], colors):
#             patch.set_facecolor(color)

#     # adding horizontal grid lines
#     for ax in axes:
#         ax.yaxis.grid(True)
#         ax.set_xticklabels(X, rotation = 90)
#     plt.tight_layout(rect=[0, 0.03, 1, 0.95])
#     fig.suptitle("$n_{learn} = "+ label +" = "+str(p)+"$", fontsize=20)

#     plt.savefig(label+'.png', dpi=1200)
#     plt.show()
    
# make_plot(err_2p, '2p', 118)
# make_plot(err_10p, '10p', 580)


