In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.linear_model import LogisticRegression,RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
%matplotlib inline

In [2]:
rawdf=pd.read_pickle('bigtrainingdataset.pickle')
df=rawdf.drop(['key','myteam','oppteam','gamedate','myteam_curscore','oppteam_curscore'],axis=1)

In [3]:
df.head(2)

Unnamed: 0,myteam_prvwins,oppteam_prvwins,homegame,result,myteam_scores,oppteam_scores,myteam_trnovr,oppteam_trnovr,myteam_top,oppteam_top,myteam_pyds,oppteam_pyds,myteam_ryds,oppteam_ryds,myteam_penyds,oppteam_penyds,myteam_ptyds,oppteam_ptyds
138,"[0, 0, 0, 0]","[0, 1, 1, 1]",False,0,"[17, 3, 14, 16]","[27, 19, 30, 23]","[2, 1, 2, 2]","[1, 0, 0, 3]","[26.5, 28.733333333333334, 35.083333333333336,...","[32.583333333333336, 32.71666666666667, 32.916...","[286, 258, 230, 150]","[408, 330, 246, 241]","[54, 7, 126, 182]","[46, 123, 146, 45]","[57, 51, 82, 67]","[53, 15, 40, 25]","[344, 331, 234, 259]","[140, 80, 52, 211]"
140,"[0, 0, 1, 1]","[0, 1, 0, 0]",True,1,"[22, 20, 22, 13]","[13, 31, 28, 23]","[2, 4, 3, 3]","[4, 1, 1, 1]","[32.333333333333336, 25.683333333333334, 28.93...","[34.31666666666667, 22.183333333333334, 35.45,...","[204, 294, 160, 240]","[265, 273, 299, 399]","[30, 109, 90, 56]","[27, 18, 64, 58]","[22, 38, 30, 90]","[55, 101, 46, 55]","[197, 218, 199, 320]","[183, 192, 99, 137]"


In [4]:
def GetAdjustedDF(df,adj_factor):
    
    def AdjustedStats (statlist,adj_factor) :
        if len(statlist)==3:
            return np.matmul(np.array(statlist),np.array([1+adj_factor,1,1-adj_factor]))
        elif len(statlist)==4:
            return np.matmul(np.array(statlist),np.array([1+adj_factor,1+adj_factor/2,1-adj_factor/2,1-adj_factor]))
    
    cols_to_transform=['myteam_prvwins', 'oppteam_prvwins','myteam_scores', 'oppteam_scores', 'myteam_trnovr', 'oppteam_trnovr',
       'myteam_top', 'oppteam_top', 'myteam_pyds', 'oppteam_pyds',
       'myteam_ryds', 'oppteam_ryds', 'myteam_penyds', 'oppteam_penyds',
       'myteam_ptyds', 'oppteam_ptyds']

    adjusteddf=df.copy()
    for col in cols_to_transform:
        adjusteddf[col]=df[col].apply(AdjustedStats,adj_factor=adj_factor)
        
    return adjusteddf

In [5]:
def Get_traintest(df):
    X,y=df.drop(['result','myteam_penyds','oppteam_penyds',
             'myteam_prvwins','oppteam_scores',
#              'myteam_trnovr','oppteam_trnovr','myteam_top','oppteam_top'
            ],axis=1),df[['result']]
    X=pd.DataFrame(StandardScaler().fit_transform(X),columns=X.columns)
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.35,shuffle=True)
    return (X_train,X_test,y_train,y_test)

In [6]:
mydictofmodels=[
    {
        'estimator':LogisticRegression(),
        'name':'LogisticRegression',
        'parameters':{
            'solver':['lbfgs','liblinear'],
            'C':[1.0,0.8,0.5,0.3]
        }
    },
        {
        'estimator':RidgeClassifier(),
        'name':'Ridge Classifier',
        'parameters':{
            'alpha':[1.0,2.0,10.0,0.5],
            'solver' : ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag']
        }
    },
        {
        'estimator':GaussianNB(),
        'name':'Gaussian Naive Bayes',
        'parameters':{
#             'solver':['lbfgs','linear']
        }
    },
            {
        'estimator':DecisionTreeClassifier(),
        'name':'Decision Tree',
        'parameters':{
            'min_samples_split':[2,5,10,20,40],
            'min_samples_leaf' :[20,40,60,100],
            'max_depth':[3,5,10,25]
        }
    },
            {
        'estimator':RandomForestClassifier(),
        'name':'Random Forest',
        'parameters':{
            'min_samples_split':[2,5,10,20,40],
            'min_samples_leaf' :[20,40,60,100],
            'n_estimators':[100]
        }
    },
    {
        'estimator':SVC(),
        'name':'SVC',
        'parameters':{
            'gamma':['auto','scale']
        }
    }
    
]

In [7]:
modelnames=[]
modelscores=[]
modelbestparams=[]
adj_factor_list=[]

for adj_factor in [0,0.1,0.2,0.3,0.4,0.5]:
    adjdf=GetAdjustedDF(df=df,adj_factor=adj_factor)
    X_train,X_test,y_train,y_test=Get_traintest(adjdf)
    for model in mydictofmodels:
        gsmodel=GridSearchCV(model['estimator'],param_grid=model['parameters'],cv=5,verbose=True)
        gsmodel.fit(X_train,np.array(y_train).ravel())
        modelnames.append(model['name'])
        modelscores.append(gsmodel.score(X_test,y_test))
        modelbestparams.append(gsmodel.best_params_)
        adj_factor_list.append(adj_factor)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   11.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    2.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   10.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    1.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   10.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    1.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   11.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   11.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   11.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished


In [8]:
resultsdf=pd.DataFrame({'modelname':modelnames,
              'scores':modelscores,
              'bestparams':modelbestparams,
              'adj_factor':adj_factor_list})
resultsdf.sort_values(['scores'],ascending=False).head(10)

Unnamed: 0,modelname,scores,bestparams,adj_factor
4,Random Forest,0.665962,"{'min_samples_leaf': 20, 'min_samples_split': ...",0.0
2,Gaussian Naive Bayes,0.651163,{},0.0
34,Random Forest,0.651163,"{'min_samples_leaf': 60, 'min_samples_split': ...",0.5
31,Ridge Classifier,0.651163,"{'alpha': 10.0, 'solver': 'auto'}",0.5
30,LogisticRegression,0.651163,"{'C': 1.0, 'solver': 'liblinear'}",0.5
18,LogisticRegression,0.646934,"{'C': 1.0, 'solver': 'lbfgs'}",0.3
19,Ridge Classifier,0.646934,"{'alpha': 10.0, 'solver': 'auto'}",0.3
8,Gaussian Naive Bayes,0.64482,{},0.1
10,Random Forest,0.640592,"{'min_samples_leaf': 40, 'min_samples_split': ...",0.1
1,Ridge Classifier,0.640592,"{'alpha': 1.0, 'solver': 'sparse_cg'}",0.0


In [None]:
# mlpmodel=MLPClassifier(solver='adam', alpha=1e-5,
#                        hidden_layer_sizes=(5,10,5), early_stopping=False,random_state=1,max_iter=1000,verbose=True,learning_rate_init=0.001)
# mlpmodel.fit(X_train,np.array(y_train).ravel())
# mlpmodel.score(X_train,y_train),mlpmodel.score(X_test,y_test)