Andrew Carr

## Imports

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

plt.style.use("ggplot")

import statsmodels.api as sm

from sklearn import datasets
digits = datasets.load_digits()

  from pandas.core import datetools


# Problem A

## Create Train Test Split

In [2]:
data = np.array([(x,y) for x,y in zip(digits.data,digits.target)])
train, test = train_test_split(data, train_size=0.7)
train_x, train_y = list(train[:,0]), list(train[:,1])
test_x, test_y = list(test[:,0]), list(test[:,1])

## Helper functions

In [3]:
def get_droppable_features(reg, accuracy, coefs):
    """Determine which features can be dropped based on their coefficient value"""
    # get the best k values
    best_k = np.argmax(accuracy)

    # determine the mean of the coefficient values
    mean_coef = np.mean(coefs[best_k], axis=0)

    # if any of the coefficients are 0, we know they can be dropped
    features_to_drop = np.where(mean_coef==0)
    print("With {} we can drop the following features {}".format(reg, features_to_drop[0]))
    return features_to_drop[0]
    
def run_l2():
    """Run L2 regularized logistic regression on the current test/train set in memory"""
    coefs = []
    accuracy = []
    for k in range(-10,11):
        c = 10**k

        clf = LogisticRegression(C=c)
        clf.fit(train_x, train_y)

        coefs.append(clf.coef_)

        acc = clf.score(test_x, test_y)
        accuracy.append(acc)

        print("Accuracy for k = {} is {}".format(k,acc))

    get_droppable_features("L2", accuracy, coefs)
    
def run_l1():
    """Run L1 regularized logistic regression on the current test/train set in memory"""
    coefs = []
    accuracy = []
    for k in range(-10,11):
        c = 10**k

        clf = LogisticRegression(penalty='l1',C=c)
        clf.fit(train_x, train_y)

        coefs.append(clf.coef_)

        acc = clf.score(test_x, test_y)
        accuracy.append(acc)

        print("Accuracy for k = {} is {}".format(k,acc))

    get_droppable_features("L1", accuracy, coefs)
    

## L2 regularization

In [4]:
# L2 on the digits dataset
run_l2()

Accuracy for k = -10 is 0.8648148148148148
Accuracy for k = -9 is 0.8648148148148148
Accuracy for k = -8 is 0.8648148148148148
Accuracy for k = -7 is 0.8759259259259259
Accuracy for k = -6 is 0.8777777777777778
Accuracy for k = -5 is 0.8796296296296297
Accuracy for k = -4 is 0.9166666666666666
Accuracy for k = -3 is 0.9481481481481482
Accuracy for k = -2 is 0.9666666666666667
Accuracy for k = -1 is 0.9648148148148148
Accuracy for k = 0 is 0.9537037037037037
Accuracy for k = 1 is 0.9444444444444444
Accuracy for k = 2 is 0.9351851851851852
Accuracy for k = 3 is 0.9296296296296296
Accuracy for k = 4 is 0.924074074074074
Accuracy for k = 5 is 0.9259259259259259
Accuracy for k = 6 is 0.9277777777777778
Accuracy for k = 7 is 0.9259259259259259
Accuracy for k = 8 is 0.924074074074074
Accuracy for k = 9 is 0.9259259259259259
Accuracy for k = 10 is 0.924074074074074
With L2 we can drop the following features [ 0 24 32 39]


This means that k = -1,0 have the highest accuracy. Which is quite interesting because it means $\lambda \in [\frac{1}{10}, 1]$ gives us good accuracy.

## L1 regularization

In [5]:
# L1 on the digits dataset
run_l1()

Accuracy for k = -10 is 0.09814814814814815
Accuracy for k = -9 is 0.09814814814814815
Accuracy for k = -8 is 0.09814814814814815
Accuracy for k = -7 is 0.09814814814814815
Accuracy for k = -6 is 0.09814814814814815
Accuracy for k = -5 is 0.09814814814814815
Accuracy for k = -4 is 0.09814814814814815
Accuracy for k = -3 is 0.7388888888888889
Accuracy for k = -2 is 0.9333333333333333
Accuracy for k = -1 is 0.9592592592592593
Accuracy for k = 0 is 0.9555555555555556
Accuracy for k = 1 is 0.9444444444444444
Accuracy for k = 2 is 0.9296296296296296
Accuracy for k = 3 is 0.937037037037037
Accuracy for k = 4 is 0.9314814814814815
Accuracy for k = 5 is 0.9296296296296296
Accuracy for k = 6 is 0.9314814814814815
Accuracy for k = 7 is 0.9296296296296296
Accuracy for k = 8 is 0.9314814814814815
Accuracy for k = 9 is 0.9314814814814815
Accuracy for k = 10 is 0.9296296296296296
With L1 we can drop the following features [ 0  1  7  8 15 16 23 24 31 32 39 40 47 48 55 56 57]


Here we notice that large negative values give terrible accuracy. This suggests that there are some independent features that can be dropped to increase sparsity which is seen above

Very large values of k give us the term that is closest to unregularized values.

You can also see that L1 allows us to drop more features than L2 since is promotes sparsity. 

# Problem B (my data)

I've added some of the cleaning code, because I haven't saved it yet, I will.. don't worry.

In [3]:
def replace(piece):
    platplus[piece] = platplus[piece].str.replace("%", "").astype("float")
    bronze[piece] = bronze[piece].str.replace("%", "").astype("float")
    silver[piece] = silver[piece].str.replace("%", "").astype("float")
    gold[piece] = gold[piece].str.replace("%", "").astype("float")
    plat[piece] = plat[piece].str.replace("%", "").astype("float")

    

In [4]:
platplus = pd.read_html("platplus.html")[0]
bronze = pd.read_html("bronze.html")[0]
silver = pd.read_html("silver.html")[0]
gold = pd.read_html("gold.html")[0]
plat = pd.read_html("plat.html")[0]


# clean the data
replace('Win Percent')
replace('Ban Rate')
replace('Play Percent')

platplus.dropna(inplace=True)
bronze.dropna(inplace=True)
silver.dropna(inplace=True)
gold.dropna(inplace=True)
plat.dropna(inplace=True)

In [8]:
platplus.sample(5)

Unnamed: 0,Rank,Champion,Role,Win Percent,Play Percent,Ban Rate,Playerbase Avg. Games,Kills,Deaths,Assists,Largest Killing Spree,Damage Dealt,Damage Taken,Total Healing,Minions Killed,Enemy Jungle CS,Team Jungle CS,Gold Earned,Role Position,Position Change
96,97,Malzahar,Middle,53.94,6.1,3.25,7.51,5.12,5.35,7.52,9,19860,15639,1068,191.4,1.93,8.45,11800,4,2
71,72,Karma,Middle,50.02,0.92,0.14,10.4,4.49,5.0,9.01,8,18092,16609,2589,155.2,1.02,5.43,10671,45,8
117,118,Orianna,Middle,50.47,7.22,0.19,6.83,5.37,5.23,8.57,9,18816,16050,1673,178.1,1.82,9.3,11531,19,6
184,185,Volibear,Top,47.34,0.38,0.02,5.03,5.46,5.55,6.06,7,15908,31812,2527,153.5,1.6,2.62,10955,49,4
114,115,Nunu,Support,39.11,0.18,0.02,7.14,2.71,6.96,11.43,6,10217,22988,4844,39.6,0.98,3.47,9348,36,2


In [5]:
platplus['Role_Code'] = platplus['Role'].astype("category").cat.codes
bronze['Role_Code'] = bronze['Role'].astype("category").cat.codes
silver['Role_Code'] = silver['Role'].astype("category").cat.codes
gold['Role_Code'] = gold['Role'].astype("category").cat.codes
plat['Role_Code'] = plat['Role'].astype("category").cat.codes

This is for column role reference

In [10]:
for i in range(0,5):
    print(i, platplus.loc[platplus['Role_Code']==i, 'Role'].unique()[0])

0 ADC
1 Jungle
2 Middle
3 Support
4 Top


In [6]:
platplus.to_csv("platplus.csv", index=False)
bronze.to_csv("bronze.csv", index=False)
silver.to_csv("silver.csv", index=False)
gold.to_csv("gold.csv", index=False)
plat.to_csv("plat.csv", index=False)

In [11]:
train, test = train_test_split(platplus, train_size=0.7)

Columns for reference

In [12]:
train.columns

Index(['Rank', 'Champion', 'Role', 'Win Percent', 'Play Percent', 'Ban Rate',
       'Playerbase Avg. Games', 'Kills', 'Deaths', 'Assists',
       'Largest Killing Spree', 'Damage Dealt', 'Damage Taken',
       'Total Healing', 'Minions Killed', 'Enemy Jungle CS', 'Team Jungle CS',
       'Gold Earned', 'Role Position', 'Position Change', 'Role_Code'],
      dtype='object')

## Using All Features

In [13]:
train_y = train['Role_Code']

li = ['Win Percent', 'Play Percent', 'Ban Rate',
       'Playerbase Avg. Games', 'Kills', 'Deaths', 'Assists',
       'Largest Killing Spree', 'Damage Dealt', 'Damage Taken',
       'Total Healing', 'Minions Killed', 'Enemy Jungle CS', 'Team Jungle CS',
       'Gold Earned']

train_x = train[li]

test_y = test['Role_Code']
test_x = test[li]

In [14]:
run_l2()

Accuracy for k = -10 is 0.32786885245901637
Accuracy for k = -9 is 0.4262295081967213
Accuracy for k = -8 is 0.5081967213114754
Accuracy for k = -7 is 0.5245901639344263
Accuracy for k = -6 is 0.5901639344262295
Accuracy for k = -5 is 0.7540983606557377
Accuracy for k = -4 is 0.8524590163934426
Accuracy for k = -3 is 0.8688524590163934
Accuracy for k = -2 is 0.9180327868852459
Accuracy for k = -1 is 0.9180327868852459
Accuracy for k = 0 is 0.9016393442622951
Accuracy for k = 1 is 0.9016393442622951
Accuracy for k = 2 is 0.9180327868852459
Accuracy for k = 3 is 0.9180327868852459
Accuracy for k = 4 is 0.8852459016393442
Accuracy for k = 5 is 0.9016393442622951
Accuracy for k = 6 is 0.9016393442622951
Accuracy for k = 7 is 0.9016393442622951
Accuracy for k = 8 is 0.9180327868852459
Accuracy for k = 9 is 0.9344262295081968
Accuracy for k = 10 is 0.9016393442622951
With L2 we can drop the following features []


In [15]:
run_l1()

Accuracy for k = -10 is 0.11475409836065574
Accuracy for k = -9 is 0.11475409836065574
Accuracy for k = -8 is 0.11475409836065574
Accuracy for k = -7 is 0.11475409836065574
Accuracy for k = -6 is 0.16393442622950818
Accuracy for k = -5 is 0.36065573770491804
Accuracy for k = -4 is 0.5081967213114754
Accuracy for k = -3 is 0.639344262295082
Accuracy for k = -2 is 0.8688524590163934
Accuracy for k = -1 is 0.8688524590163934
Accuracy for k = 0 is 0.9180327868852459
Accuracy for k = 1 is 0.9344262295081968
Accuracy for k = 2 is 0.9672131147540983
Accuracy for k = 3 is 0.9672131147540983
Accuracy for k = 4 is 0.9672131147540983
Accuracy for k = 5 is 0.9672131147540983
Accuracy for k = 6 is 0.9672131147540983
Accuracy for k = 7 is 0.9672131147540983
Accuracy for k = 8 is 0.9836065573770492
Accuracy for k = 9 is 0.9672131147540983
Accuracy for k = 10 is 0.9672131147540983
With L1 we can drop the following features []


So with all of the features, we can still get 95% accuracy, which is great, but not consistently. Which is interesting. We really would like to reduce the number of features we want to use. But with my data, since I bet it's not linearly nicely behaved. However, using domain knowledge, I can choose the features. 

## Using Smart Features Chosen Through OLS ETC

In [16]:
train_y = train['Role_Code']
li = ['Win Percent','Minions Killed', 'Total Healing', 'Team Jungle CS','Kills', 'Assists', 'Deaths', 'Damage Dealt', 'Damage Taken', 'Gold Earned']

train_x = train[li]

test_y = test['Role_Code']
test_x = test[li]

## L2 regularization

In [17]:
# predict champion roles based on certain critera
run_l2()

Accuracy for k = -10 is 0.32786885245901637
Accuracy for k = -9 is 0.4262295081967213
Accuracy for k = -8 is 0.5081967213114754
Accuracy for k = -7 is 0.5245901639344263
Accuracy for k = -6 is 0.5901639344262295
Accuracy for k = -5 is 0.7540983606557377
Accuracy for k = -4 is 0.8524590163934426
Accuracy for k = -3 is 0.8688524590163934
Accuracy for k = -2 is 0.9180327868852459
Accuracy for k = -1 is 0.8852459016393442
Accuracy for k = 0 is 0.8852459016393442
Accuracy for k = 1 is 0.8688524590163934
Accuracy for k = 2 is 0.8852459016393442
Accuracy for k = 3 is 0.8688524590163934
Accuracy for k = 4 is 0.8688524590163934
Accuracy for k = 5 is 0.8688524590163934
Accuracy for k = 6 is 0.8688524590163934
Accuracy for k = 7 is 0.8688524590163934
Accuracy for k = 8 is 0.8688524590163934
Accuracy for k = 9 is 0.9016393442622951
Accuracy for k = 10 is 0.8688524590163934
With L2 we can drop the following features []


This means I can predict role with ~95% accuracy! This is a huge thing because I am not using a very large number of features. Look at k=5. It depends on the train and test split, but this would definitely be my prefered method for choosing my C value. Since my dataset is so small, it is easy enough to do this since I don't have to worry too much about computational latency.

## L1 regularization

In [18]:
run_l1()

Accuracy for k = -10 is 0.11475409836065574
Accuracy for k = -9 is 0.11475409836065574
Accuracy for k = -8 is 0.11475409836065574
Accuracy for k = -7 is 0.11475409836065574
Accuracy for k = -6 is 0.16393442622950818
Accuracy for k = -5 is 0.36065573770491804
Accuracy for k = -4 is 0.5081967213114754
Accuracy for k = -3 is 0.639344262295082
Accuracy for k = -2 is 0.8688524590163934
Accuracy for k = -1 is 0.8852459016393442
Accuracy for k = 0 is 0.9180327868852459
Accuracy for k = 1 is 0.8852459016393442
Accuracy for k = 2 is 0.9016393442622951
Accuracy for k = 3 is 0.8852459016393442
Accuracy for k = 4 is 0.9180327868852459
Accuracy for k = 5 is 0.9180327868852459
Accuracy for k = 6 is 0.9016393442622951
Accuracy for k = 7 is 0.8852459016393442
Accuracy for k = 8 is 0.8852459016393442
Accuracy for k = 9 is 0.8852459016393442
Accuracy for k = 10 is 0.9180327868852459
With L1 we can drop the following features []


By using this method of regularization we can get accuracy of ~95% which is also great. With k=0. It seems that L1 gives us better accuracy in general. 

So, regularization doesn't suggest we can drop features, but after working around (and using domain knowledge) it is obvious that there are improvements that can be made (not using all the features). So anyway, here ya go. Also I can't seem to get stats models to converge. I've tried everything.

In [19]:
model = sm.MNLogit(train_y, train_x)

In [20]:
model.fit?

In [21]:
model = sm.MNLogit(train_y, train_x)
result= model.fit?
print(result.summary())

NameError: name 'result' is not defined

In [None]:
result= model.fit

In [None]:
model = sm.OLS(train_y, train_x)

In [None]:
result = model.fit()

In [None]:
print(result.summary())

In [None]:
print(result.aic, result.bic)

I prefer L1 regularized LogisticRegression for my personal data. This is because regardless of the regularization term, it seems to perform very well. It is also odd that if you look at the OLS version, it suggests that Gold Earned is not statistically significant, but if we don't include this feature in the analysis the accuracy on both L1 and L2 goes down (not shown).  However, we have been able to use the OLS to see and prune features. Even though both L1 and L2 don't suggest features to prune in the full model.