In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import time
import operator

### Initialize Strategies

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import strategies

# Build one Strategy per (name, feature-selection function) pair, each with its
# own fresh LinearRegression instance.
# NOTE: this assignment rebinds the name `strategies` from the imported module
# to the list below. The right-hand side is fully evaluated first, so the module
# attributes are still reachable while the list is being built.
strategies = [
    strategies.Strategy(strategy_name, select_func, LinearRegression())
    for strategy_name, select_func in (
        ('top_5_linreg', strategies.select_top_5_gp),
        ('top_10_linreg', strategies.select_top_10_gp),
    )
]


### Run

In [3]:
import random
test_fraction = 0.3
# Fixed seed so the train/test split is identical on every Restart & Run All.
split_seed = 0

years = [2015, 2016, 2017]
stats = ['pts_per_min', 'trb_per_min', 'ast_per_min', 'blk_per_min', 'stl_per_min']

# Maps (year, stat, strategy.name) -> [test_y, test_y_pred].
results_dict = {}

# Dedicated generator: seeding it does not touch the global `random` state.
split_rng = random.Random(split_seed)


def _build_xy(df, graph, strategy, stat):
    """Build the feature matrix and target column for one dataframe.

    For every row of `df`, `strategy.param_select_func(graph, defense, player,
    stat)` is called. Rows for which it does not return a numpy array are
    skipped; for the others the returned array becomes one row of X and the
    row's `stat` value becomes the matching entry of y.

    Progress (row counter and `time.time()`) is printed every 500 rows.

    Returns:
        (x, y): `x` is the row-wise stack of the returned feature arrays and
        `y` has shape (n_rows_kept, 1). Returns (None, None) when no row
        produced a feature array, so the caller can skip the run instead of
        hitting an opaque `np.vstack` error on an empty list.
    """
    x_rows = []
    y_values = []

    # TODO(review): the original author noted "some problem with indexes - not
    # sure what yet" at this point; the nature of that problem is not recorded.
    for count, (_, row) in enumerate(df.iterrows()):
        if count % 500 == 0:
            print(count)
            print(time.time())

        x = strategy.param_select_func(graph, row['defense'], row['player'], stat)

        # only keep (x, y) if the selection function actually returned features
        if isinstance(x, np.ndarray):
            x_rows.append(x)
            y_values.append(row[stat])

    if not x_rows:
        return None, None

    return np.vstack(x_rows), np.array(y_values).reshape(-1, 1)


for year in years:
    df_full = pd.read_csv('df_actuals/actual_df_{}.csv'.format(year))

    # split df: mark a reproducible random `test_fraction` of the rows as hidden
    num_hidden = int(test_fraction * len(df_full))
    hidden_indices = split_rng.sample(range(len(df_full)), num_hidden)

    df_full['hidden'] = 'False'
    df_full.loc[hidden_indices, 'hidden'] = 'True'

    df_train = df_full[df_full['hidden'] == 'False']
    df_test = df_full[df_full['hidden'] == 'True']

    # create graphs from respective datasets
    # (from_pandas_edgelist already returns an undirected nx.Graph by default,
    # so no extra .to_undirected() copy is needed)
    edge_attrs = stats + ['times_played']

    G_train = nx.from_pandas_edgelist(df_train, 'defense', 'player', edge_attrs)
    G_full = nx.from_pandas_edgelist(df_full, 'defense', 'player', edge_attrs)

    for stat in stats:
        for strategy in strategies:
            print('NEW RUN')
            print(year)
            print(stat)
            print(strategy.name)

            # TRAIN: features come from the graph built on training rows only
            print('TRAIN')
            print('num of rows: {}'.format(len(df_train)))
            train_x, train_y = _build_xy(df_train, G_train, strategy, stat)

            # TEST: features come from the full graph.
            # NOTE(review): G_full contains the held-out edges themselves
            # (including their stat attributes). Whether param_select_func
            # excludes the (defense, player) edge being predicted is not
            # visible here - confirm there is no target leakage.
            print('TEST')
            print('num of rows: {}'.format(len(df_test)))
            test_x, test_y = _build_xy(df_test, G_full, strategy, stat)

            if train_x is None or test_x is None:
                print('skipping run: no rows produced features')
                continue

            # train model and use to predict test_y
            print('fitting model')
            print(time.time())

            strategy.model.fit(train_x, train_y)

            print(time.time())

            test_y_pred = strategy.model.predict(test_x)

            # save results
            key = (year, stat, strategy.name)
            results_dict[key] = [test_y, test_y_pred]


NEW RUN
2015
pts_per_min
top_5_linreg
TRAIN
num of rows: 13160
0
1523408968.591612
500
1523408971.7743013
1000
1523408975.6811287
1500
1523408979.238056
2000
1523408982.5361507
2500
1523408986.7572048
3000
1523408990.0522215
3500
1523408994.117048
4000
1523408996.056888
4500
1523409000.8967113
5000
1523409004.013381
5500
1523409008.7373605
6000
1523409012.9306254
6500
1523409017.0956244


KeyboardInterrupt: 

In [None]:
import pickle

# Persist the collected results to disk so they can be reloaded later
# without re-running the experiment loop.
with open('results1.pickle', mode='wb') as results_file:
    pickle.dump(results_dict, file=results_file)

### View results

Each entry of `results_dict` has the form `results_dict[(year, stat, strategy_name)] = [actual_test_y, pred_test_y]`.

In [None]:
# Show every stored entry: keys are (year, stat, strategy.name) tuples and
# values are [test_y, test_y_pred] lists as filled in by the run cell.
# NOTE(review): this dumps the full arrays; the output can be very large.
results_dict