In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import time
import operator

# holds different selection methods and triangle generator
import fitting

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


lin_reg = LinearRegression()
random_forest = RandomForestRegressor()

### Run

In [2]:
import random
test_fraction = 0.05
num_iterations = 3

#years = [2015, 2016, 2017]
years = [2015]

#stats = ['pts_per_min', 'trb_per_min', 'ast_per_min', 'blk_per_min', 'stl_per_min']
stats = ['pts_per_min']



for year in years:
    df_full = pd.read_csv('df_actuals/df_actual_{}.csv'.format(year))
    

    for i in range(num_iterations):
        # split df
        indices = list(range(len(df_full)))
        num_hidden = int(test_fraction * len(df_full))
        hidden_indices = random.sample(indices, num_hidden)
    
        df_full['hidden'] = 'False'
        for index in hidden_indices:
            df_full.at[index, 'hidden'] = 'True'
        
            df_train = df_full[df_full['hidden'] == 'False']
            df_test = df_full[df_full['hidden'] == 'True']
    
        #print(df_full.index.values.tolist())
        #print(df_train.index.values.tolist())
        #print(df_test.index.values.tolist())


        # create graphs from respective datasets
        edge_attrs = stats + ['times_played']

        G_train = nx.from_pandas_edgelist(df_train, 'defense', 'player', edge_attrs).to_undirected()
        G_full = nx.from_pandas_edgelist(df_full, 'defense', 'player', edge_attrs).to_undirected()
        
        for stat in stats:
            
            print('NEW RUN')
            print(year)
            print(stat)
            
            # do for all treatments at once
            
            # TRAIN
            
            train_x_lists = {}
            train_y_lists = {}
            
            train_x_actual = {}
            train_y_actual = {}
            
            for treatment in fitting.treatments:
                train_x_lists[treatment] = []
                train_y_lists[treatment] = []
                
            
            print('TRAIN')
            print('num of rows: {}'.format(len(df_train)))
                
            # some problem with indexes - not sure what yet
            count = 0
            for index, row in df_train.iterrows():
                if count % 500 == 0:
                    print(count)
                    print(time.time())
                count += 1
                
                # base arr is same for all treatments
                triangle_list = fitting.generate_triangles(G_train, row['defense'], row['player'], stat)
                
                # if <10 (because of diffs between G_full and G_train) -> leave out, test data will still be consistent
                if len(triangle_list) < 10:
                    continue
                
                triangle_arr = fitting.sort_triangles(triangle_list)
                
                for treatment in fitting.treatments:
                    x = fitting.route_treatment(triangle_arr, treatment)
                    
                    #if x == 'NAN':
                    #    print(row['defense'])
                    #    print(row['player'])
                    #    print('tlist with G_train: {}'.format(triangle_list))
                    #    withfull = fitting.generate_triangles(G_full, row['defense'], row['player'], stat)
                    #    print('tlist with G_full: {}'.format(withfull))
                
                    # only add x,y to list if x actually returned value
                    if type(x) == np.ndarray:
                        train_x_lists[treatment].append(x)
                        
                        y = np.array([row[stat]])
                        train_y_lists[treatment].append(y)
             
            for treatment in fitting.treatments:
                train_x_actual[treatment] = np.vstack(train_x_lists[treatment])
                train_y_actual[treatment] = np.vstack([arr.reshape((1, 1)) for arr in train_y_lists[treatment]])
            
            # TEST
            
            test_x_lists = {}
            test_y_lists = {}
            
            test_x_actual = {}
            test_y_actual = {}
            
            for treatment in fitting.treatments:
                test_x_lists[treatment] = []
                test_y_lists[treatment] = []
                  
            print('TEST')
            print('num of rows: {}'.format(len(df_test)))
            
            # build labels (for error checking later)
            test_y_labels = []
            
            # some problem with indexes - not sure what yet
            count = 0
            for index, row in df_test.iterrows():
                if count % 500 == 0:
                    print(count)
                    print(time.time())
                count += 1
                
                test_y_labels.append('{}_{}'.format(row['defense'], row['player']))
                
                # base arr is same for all treatments
                triangle_list = fitting.generate_triangles(G_full, row['defense'], row['player'], stat)
                triangle_arr = fitting.sort_triangles(triangle_list)
                
                for treatment in fitting.treatments:
                    x = fitting.route_treatment(triangle_arr, treatment)
                
                    # only add x,y to list if x actually returned value
                    if type(x) == np.ndarray:
                        test_x_lists[treatment].append(x)
                        
                        y = np.array([row[stat]])
                        test_y_lists[treatment].append(y)
             
            for treatment in fitting.treatments:
                test_x_actual[treatment] = np.vstack(test_x_lists[treatment])
                test_y_actual[treatment] = np.vstack([arr.reshape((1, 1)) for arr in test_y_lists[treatment]])
              
            
            # pred dict to store
            lin_reg_y_pred = {}
            random_forest_y_pred = {}
            
                    
            # train models and use to predict test_y
            for treatment in fitting.treatments:
                print('fitting linreg {}'.format(treatment))
                print(time.time())
            
                lin_reg.fit(train_x_actual[treatment], train_y_actual[treatment])
            
                print(time.time())
            
                lin_reg_y_pred[treatment] = lin_reg.predict(test_x_actual[treatment])
                
                print('fitting rf')
                print(time.time())
            
                random_forest.fit(train_x_actual[treatment], train_y_actual[treatment].ravel())
            
                print(time.time())
            
                random_forest_y_pred[treatment] = random_forest.predict(test_x_actual[treatment])
                
            
            # SAVE RESULTS
            # build dict to populate df
            data_dict = {}
            data_dict['label'] = test_y_labels
            
            for treatment in fitting.treatments:
                data_dict['linreg_{}'.format(treatment)] = lin_reg_y_pred[treatment].ravel()
                data_dict['rf_{}'.format(treatment)] = random_forest_y_pred[treatment]
                data_dict['actual_{}'.format(treatment)] = test_y_actual[treatment].ravel()

            
            # build df
            results_df = pd.DataFrame(data=data_dict)
            
            # save df
            file_path = 'results/{}{}_{}.csv'.format(stat, year, i)
            
            results_df.to_csv(file_path, index=False)
            
            print('results saved to {}'.format(file_path))


NEW RUN
2015
pts_per_min
TRAIN
num of rows: 12718
0
1525805035.179634
500
1525805044.7561643
1000
1525805054.8681405
1500
1525805064.3405414
2000
1525805074.1235137
2500
1525805083.8204327
3000
1525805096.5439203
3500
1525805111.0361266
4000
1525805123.295448
4500
1525805136.3570035
5000
1525805151.270362
5500
1525805164.2925963
6000
1525805180.1226914
6500
1525805196.222108
7000
1525805212.6280837
7500
1525805228.5673401
8000
1525805245.1868582
8500
1525805260.5602036
9000
1525805272.0590153
9500
1525805283.1585577
10000
1525805294.3278987
10500
1525805305.9073224
11000
1525805320.0279992
11500
1525805333.72302
12000
1525805347.6338391
12500
1525805361.4101546
TEST
num of rows: 669
0
1525805367.6698353
500
1525805382.2267852
fitting linreg null
1525805387.451362
1525805387.478181
fitting rf
1525805387.4794302
1525805387.8294842
fitting linreg median_5
1525805387.8331146
1525805387.8357954
fitting rf
1525805387.8364189
1525805388.771363
fitting linreg median_10
1525805388.7748835
15258

In [3]:
for d in data_dict:
    print(d)
    try:
        print(data_dict[d].shape)
    except:
        print('not arr')

label
not arr
linreg_null
(669,)
rf_null
(669,)
actual_null
(669,)
linreg_median_5
(669,)
rf_median_5
(669,)
actual_median_5
(669,)
linreg_median_10
(669,)
rf_median_10
(669,)
actual_median_10
(669,)
linreg_mean_5
(669,)
rf_mean_5
(669,)
actual_mean_5
(669,)
linreg_mean_10
(669,)
rf_mean_10
(669,)
actual_mean_10
(669,)


In [4]:
fitting.treatments

['null', 'median_5', 'median_10', 'mean_5', 'mean_10']

In [5]:
train_x_actual['null']

array([[ 0.32313947,  0.2127445 ],
       [ 0.21408559,  0.14037801],
       [ 0.41839237,  0.3066049 ],
       ..., 
       [ 0.78472379,  0.51642457],
       [ 0.69758628,  0.40740741],
       [ 0.        ,  0.        ]])

In [6]:
np.isnan(np.min(train_x_actual['null']))

False

In [7]:
train_y_actual['null']

array([[ 0.30555556],
       [ 0.46153846],
       [ 0.44230769],
       ..., 
       [ 0.5       ],
       [ 0.13333333],
       [ 0.2       ]])

In [8]:
np.isnan(np.min(train_y_actual['null']))

False

In [9]:
df = pd.DataFrame(data=train_x_actual['null'])

df.head()

Unnamed: 0,0,1
0,0.323139,0.212744
1,0.214086,0.140378
2,0.418392,0.306605
3,0.303485,0.166234
4,0.606874,0.485374


In [10]:
nans = lambda df: df[df.isnull().any(axis=1)]

nans(df)

Unnamed: 0,0,1


In [11]:


fitting.generate_triangles(G, 'Defense_paok', 'diogenis-gorgonis-1', 'pts_per_min')

NameError: name 'G' is not defined