In [1]:
import os.path
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

In [2]:
def _six_decimals(value):
    """Format a number with six digits after the decimal point."""
    return '%.6f' % value


# Use the fixed six-decimal formatter for every float shown in DataFrame output.
pd.set_option('display.float_format', _six_decimals)

## Load data

In [3]:
def load_data(dirname, filename):
    """Build the absolute path of `filename` inside `dirname`.

    Despite its name, this helper does not read anything: it only resolves
    `dirname` against the current working directory and appends `filename`.

    Parameters
    ----------
    dirname : str
        Directory, taken relative to the current working directory.
    filename : str
        File name (or relative path) inside `dirname`.

    Returns
    -------
    str
        Normalized absolute path to the file.
    """
    base_dir = os.path.abspath(os.path.join(os.getcwd(), dirname))
    full_path = os.path.join(base_dir, filename)
    return os.path.abspath(full_path)

In [4]:
# Input location: directory and file name of the player weight table.
datadir = 'outputs/data/'
datafile = 'players-retained-sorted-nmf_weights-norm.csv'

# Read the table into `df`; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer=os.path.join(datadir, datafile))

In [5]:
# Drop the raw basis-weight columns (`b1`, `b2`, ...) and keep everything else,
# i.e. the identifier columns and the `b*_norm` columns.
# The columns are selected by name instead of by column-count arithmetic, so the
# cell does not depend on how many identifier columns the input file has.
df_norm = df.drop(columns=df.filter(regex=r'^b\d+$').columns)
df_norm

Unnamed: 0,p_id_retained,player,team,b1_norm,b2_norm,b3_norm,b4_norm,b5_norm
0,0,A. Pasaol,UE,0.473871,0.272449,0.113510,0.000002,0.140167
1,1,A. Melecio,DLSU,0.175075,0.274263,0.204652,0.122724,0.223286
2,2,R. Subido,UST,0.083098,0.414100,0.115078,0.102124,0.285599
3,3,D. Ildefonso,NU,0.308939,0.264366,0.110524,0.094568,0.221604
4,4,J. Ahanmisi,ADU,0.043061,0.321832,0.163748,0.355735,0.115624
...,...,...,...,...,...,...,...,...
77,77,F. Jaboneta,UP,0.405793,0.000006,0.000000,0.000000,0.594201
78,78,I. Batalier,UST,0.253072,0.242180,0.138704,0.366045,0.000000
79,79,C. Vito,UP,0.374604,0.299968,0.000000,0.000028,0.325400
80,80,S. Akomo,UST,0.970731,0.000000,0.029269,0.000000,0.000000


## Nearest neighbor

### League-wide

#### Run NN
- convert player basis weights to numpy array
- compute distances and indices of k nearest neighbors for each player

In [33]:
# League-wide feature matrix: every df_norm column except the three identifier
# columns, as a NumPy array with one row per player.
weights_nn_league = df_norm.drop(['p_id_retained', 'player', 'team'], axis='columns').to_numpy()

In [36]:
# `neighbors` counts the query player itself plus its nearest neighbors, so each
# player ends up with `neighbors - 1` neighbor columns (nn_1.., dist_1..).
neighbors = 6

# Minkowski metric with scikit-learn's default power parameter.
nn_league = NearestNeighbors(n_neighbors=neighbors, metric='minkowski')
nn_league.fit(weights_nn_league)

# Query every fitted player in one batched call. Calling kneighbors() without X
# returns the neighbors of each indexed point and never counts a point as its own
# neighbor. This replaces the per-row loop followed by dropping column 0, which
# assumed the self-match always comes first (not guaranteed when two players have
# identical weights).
nn_distances, nn_indices = nn_league.kneighbors(n_neighbors=neighbors - 1)

neighbor_ranks = range(1, neighbors)
df_nn_distances = pd.DataFrame(nn_distances, columns=['dist_{}'.format(rank) for rank in neighbor_ranks])
df_nn_indices = pd.DataFrame(nn_indices, columns=['nn_{}'.format(rank) for rank in neighbor_ranks])

# Combined table: neighbor indices first, then the matching distances.
df_nn_di = pd.concat([df_nn_indices, df_nn_distances], axis=1)

# df_nn_distances.to_csv(
#     'outputs/data/nn_distances-league-retained.csv', index_label='p_id_retained'
#     )
# df_nn_indices.to_csv(
#     'outputs/data/nn_indices-league-retained.csv', index_label='p_id_retained'
#     )
# df_nn_di.to_csv(
#     'outputs/data/nn_distances-indices-league-retained.csv', index_label='p_id_retained'
#     )

In [37]:
# Display neighbor indices and distances side by side (column-wise concatenation).
pd.concat(objs=[df_nn_indices, df_nn_distances], axis='columns')

Unnamed: 0,nn_1,nn_2,nn_3,nn_4,nn_5,dist_1,dist_2,dist_3,dist_4,dist_5
0,65,15,43,28,34,0.159469,0.162773,0.166038,0.170956,0.185395
1,61,5,22,18,3,0.102886,0.126595,0.153864,0.164777,0.166353
2,49,1,22,60,73,0.147784,0.200860,0.205314,0.217924,0.242726
3,7,5,43,62,9,0.071823,0.134705,0.143489,0.152946,0.156262
4,19,17,42,78,46,0.206130,0.213493,0.228941,0.254070,0.257300
...,...,...,...,...,...,...,...,...,...,...
77,47,21,40,34,31,0.304197,0.331367,0.342498,0.383619,0.388350
78,17,67,11,64,62,0.183797,0.186920,0.242228,0.248484,0.249032
79,40,7,73,10,3,0.087361,0.172641,0.173163,0.183176,0.193665
80,52,6,54,68,72,0.027690,0.038756,0.041392,0.069060,0.209611


In [30]:
# Label every league-wide result table with the player's name and team.
# p_id_retained is deliberately not added as a column here; the tables are saved
# with index_label='p_id_retained', so the row index is written under that name.
for nn_table in (df_nn_indices, df_nn_distances, df_nn_di):
    nn_table['player'] = df_norm.player
    nn_table['team'] = df_norm.team

In [26]:
# Save the league-wide neighbor distances; the row index is written to the file
# under the column name `p_id_retained`.
df_nn_distances.to_csv(
    path_or_buf='outputs/data/nn_distances-league-retained-with-pt.csv',
    index_label='p_id_retained',
)
df_nn_distances

Unnamed: 0,dist_1,dist_2,dist_3,dist_4,dist_5,player,team
0,0.159469,0.162773,0.166038,0.170956,0.185395,A. Pasaol,UE
1,0.102886,0.126595,0.153864,0.164777,0.166353,A. Melecio,DLSU
2,0.147784,0.200860,0.205314,0.217924,0.242726,R. Subido,UST
3,0.071823,0.134705,0.143489,0.152946,0.156262,D. Ildefonso,NU
4,0.206130,0.213493,0.228941,0.254070,0.257300,J. Ahanmisi,ADU
...,...,...,...,...,...,...,...
77,0.304197,0.331367,0.342498,0.383619,0.388350,F. Jaboneta,UP
78,0.183797,0.186920,0.242228,0.248484,0.249032,I. Batalier,UST
79,0.087361,0.172641,0.173163,0.183176,0.193665,C. Vito,UP
80,0.027690,0.038756,0.041392,0.069060,0.209611,S. Akomo,UST


In [27]:
# Save the league-wide neighbor indices; the row index is written to the file
# under the column name `p_id_retained`.
df_nn_indices.to_csv(
    path_or_buf='outputs/data/nn_indices-league-retained-with-pt.csv',
    index_label='p_id_retained',
)
df_nn_indices

Unnamed: 0,nn_1,nn_2,nn_3,nn_4,nn_5,player,team
0,65,15,43,28,34,A. Pasaol,UE
1,61,5,22,18,3,A. Melecio,DLSU
2,49,1,22,60,73,R. Subido,UST
3,7,5,43,62,9,D. Ildefonso,NU
4,19,17,42,78,46,J. Ahanmisi,ADU
...,...,...,...,...,...,...,...
77,47,21,40,34,31,F. Jaboneta,UP
78,17,67,11,64,62,I. Batalier,UST
79,40,7,73,10,3,C. Vito,UP
80,52,6,54,68,72,S. Akomo,UST


In [31]:
# Save the combined indices + distances table; the row index is written to the
# file under the column name `p_id_retained`.
df_nn_di.to_csv(
    path_or_buf='outputs/data/nn_distances-indices-league-retained-with-pt.csv',
    index_label='p_id_retained',
)
df_nn_di

Unnamed: 0,nn_1,nn_2,nn_3,nn_4,nn_5,dist_1,dist_2,dist_3,dist_4,dist_5,player,team
0,65,15,43,28,34,0.159469,0.162773,0.166038,0.170956,0.185395,A. Pasaol,UE
1,61,5,22,18,3,0.102886,0.126595,0.153864,0.164777,0.166353,A. Melecio,DLSU
2,49,1,22,60,73,0.147784,0.200860,0.205314,0.217924,0.242726,R. Subido,UST
3,7,5,43,62,9,0.071823,0.134705,0.143489,0.152946,0.156262,D. Ildefonso,NU
4,19,17,42,78,46,0.206130,0.213493,0.228941,0.254070,0.257300,J. Ahanmisi,ADU
...,...,...,...,...,...,...,...,...,...,...,...,...
77,47,21,40,34,31,0.304197,0.331367,0.342498,0.383619,0.388350,F. Jaboneta,UP
78,17,67,11,64,62,0.183797,0.186920,0.242228,0.248484,0.249032,I. Batalier,UST
79,40,7,73,10,3,0.087361,0.172641,0.173163,0.183176,0.193665,C. Vito,UP
80,52,6,54,68,72,0.027690,0.038756,0.041392,0.069060,0.209611,S. Akomo,UST


#### Enforce symmetry
- if player k is a neighbor of player j then player j should be a neighbor of player k
- get the list of neighbors for each player k
    - start from k's original nearest neighbors
    - for each other player j, check whether k appears among j's nearest neighbors
        - if yes, add j to k's neighbors

In [11]:
# Build, for every player, the symmetric neighbor set:
#   own nearest neighbors  +  every player that lists this player as a neighbor.
#
# The neighbor columns are selected by name. Deriving them from the column count
# (`range(1, num_cols - 2)`) silently skips the last nn_* column whenever the
# number of label columns appended to df_nn_indices differs from what that
# arithmetic expects.
nn_cols = [col for col in df_nn_indices.columns if str(col).startswith('nn_')]
num_players, num_cols = df_nn_indices.shape

# Row i holds player i's own neighbors. Rows are addressed by position rather
# than by matching on the player name, so two players sharing a name cannot be
# confused with each other.
own_neighbors = df_nn_indices[nn_cols].to_numpy().tolist()
players = df_nn_indices['player'].tolist()
teams = df_nn_indices['team'].tolist()

# reverse_neighbors[k] = every player j that has k among its own neighbors.
# One pass over the neighbor table replaces the nested player-by-player scan.
reverse_neighbors = [set() for _ in range(num_players)]
for j, row in enumerate(own_neighbors):
    for k in row:
        reverse_neighbors[k].add(j)

# One record per player: [id, name, team, neighbor ids]. The ids are sorted so the
# saved list has a stable, reproducible order.
p_neighbors = [
    [i, players[i], teams[i], sorted(set(own_neighbors[i]) | reverse_neighbors[i])]
    for i in range(num_players)
]

In [12]:
# Tabulate the per-player neighbor records and save them without the row index.
df_p_neighbors = pd.DataFrame(
    data=p_neighbors,
    columns=['p_id_retained', 'player', 'team', 'neighbors'],
)

df_p_neighbors.to_csv(path_or_buf='outputs/data/nn_players.csv', index=False)

In [13]:
# Per-player neighbor statistics, built as a new frame so df_p_neighbors itself
# is left unchanged.
# num_neighbors is the length of each player's neighbor list, computed directly
# on the `neighbors` column instead of with a row-wise apply.
df_p_neighbors_stats = df_p_neighbors.assign(
    num_neighbors=df_p_neighbors['neighbors'].map(len)
)

In [14]:
# Display each player's neighbor list together with its size (num_neighbors).
df_p_neighbors_stats

Unnamed: 0,p_id_retained,player,team,neighbors,num_neighbors
0,0,A. Pasaol,UE,"[65, 34, 43, 14, 15, 25, 27, 28]",8
1,1,A. Melecio,DLSU,"[2, 3, 5, 42, 17, 18, 22, 61]",8
2,2,R. Subido,UST,"[1, 37, 73, 42, 46, 49, 22, 60]",8
3,3,D. Ildefonso,NU,"[1, 33, 34, 5, 7, 40, 9, 43, 79, 62, 31]",11
4,4,J. Ahanmisi,ADU,"[42, 44, 46, 78, 17, 19]",6
...,...,...,...,...,...
77,77,F. Jaboneta,UP,"[34, 40, 47, 21, 31]",5
78,78,I. Batalier,UST,"[64, 67, 4, 11, 44, 17, 59, 62]",8
79,79,C. Vito,UP,"[3, 7, 40, 73, 10]",5
80,80,S. Akomo,UST,"[68, 6, 72, 52, 54]",5


### Per-team

#### Initialize variables

In [15]:
# Distinct team labels; np.unique returns them sorted.
team_names = np.unique(df_norm.team.values)

# Normalized weight columns, selected by name (`b1_norm`, `b2_norm`, ...) rather
# than derived from the total column count, so the selection does not depend on
# how many identifier columns df_norm carries.
b_cols = df_norm.filter(regex=r'^b\d+_norm$').columns.tolist()

# For every team: save its rows of df_norm to a per-team CSV and keep its weight
# columns for the per-team nearest-neighbor search. weights_teams follows the
# order of team_names.
weights_teams = []
for t in team_names:
    team_rows = df_norm[df_norm.team == t]  # filter once, reuse for both outputs
    team_rows.to_csv('outputs/data/nmf_weights-per-team-{}.csv'.format(t), index=False)
    weights_teams.append(team_rows[b_cols])

#### Run NN
- convert player basis weights to numpy array
- compute distances and indices to all teammates

In [16]:
# One entry per player, in the order of weights_teams (team by team).
nn_distances_t = []
nn_indices_t = []

# Size of the largest team; it fixes how many dist_*/nn_* columns are created.
max_p = len(max(weights_teams, key=len))

for t in weights_teams:
    p = len(t)
    wt = t.to_numpy()

    # Fit and query on the same NumPy array. Fitting on the DataFrame and then
    # querying with bare arrays mixes inputs with and without feature names.
    nn_team = NearestNeighbors(n_neighbors=p, metric='minkowski')
    nn_team.fit(wt)

    # Ask for all p team members at once (one batched call instead of one call
    # per player). Each result row covers the whole team, the player included.
    team_distances, team_indices = nn_team.kneighbors(wt)
    nn_distances_t.extend(team_distances)
    nn_indices_t.extend(team_indices)

# Rows of smaller teams are shorter than max_p and are padded with missing values.
# Column 0 (expected to be the player's own zero-distance match) is dropped.
# NOTE: nn_* values are row positions inside the team's own weight matrix
# (0 .. p-1), not league-wide p_id_retained values.
df_nn_distances_team = pd.DataFrame(nn_distances_t, columns=['dist_{}'.format(str(x)) for x in range(max_p)]).drop(columns=['dist_0'])
df_nn_indices_team = pd.DataFrame(nn_indices_t, columns=['nn_{}'.format(str(x)) for x in range(max_p)]).drop(columns=['nn_0'])

In [17]:
# Identifier columns reordered by team and then by p_id_retained, with a fresh
# 0..n-1 index so they align row by row with the per-team result tables.
team_sorted_ids = (
    df_norm
    .sort_values(by=["team", "p_id_retained"])
    [["p_id_retained", "player", "team"]]
    .reset_index(drop=True)
)

for id_col in ("p_id_retained", "player", "team"):
    df_nn_distances_team[id_col] = team_sorted_ids[id_col]

for id_col in ("p_id_retained", "player", "team"):
    df_nn_indices_team[id_col] = team_sorted_ids[id_col]

In [18]:
# Save both per-team tables; the row index is not written.
df_nn_distances_team.to_csv(path_or_buf='outputs/data/nn_distances-team-retained.csv', index=False)
df_nn_indices_team.to_csv(path_or_buf='outputs/data/nn_indices-team-retained.csv', index=False)

In [19]:
# Display the per-team distance table; teams smaller than the largest one have
# missing values in their trailing dist_* columns.
df_nn_distances_team

Unnamed: 0,dist_1,dist_2,dist_3,dist_4,dist_5,dist_6,dist_7,dist_8,dist_9,dist_10,dist_11,dist_12,p_id_retained,player,team
0,0.220222,0.242207,0.381296,0.409307,0.432588,0.460949,0.464577,0.488132,0.557755,0.558102,0.670129,0.876758,14,T. Ravena,ADMU
1,0.464577,0.517320,0.550279,0.560422,0.625280,0.728684,0.848458,0.870386,0.877289,0.889056,0.905759,0.939592,16,A. Kouame,ADMU
2,0.215066,0.333451,0.380895,0.397504,0.432588,0.467731,0.522221,0.531182,0.560422,0.561430,0.578839,0.727168,24,R. Verano,ADMU
3,0.262166,0.338693,0.422446,0.490967,0.505491,0.522221,0.582219,0.606249,0.670129,0.682090,0.766937,0.870386,30,A. Asistio,ADMU
4,0.187180,0.220222,0.260977,0.288104,0.348081,0.352945,0.380895,0.382971,0.385106,0.505491,0.625280,0.783132,39,I. Go,ADMU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,0.278036,0.485832,0.503798,0.622424,0.674117,0.776059,0.791164,0.905241,0.961949,,,,48,G. Mahinay,UST
78,0.250105,0.289068,0.559706,0.565341,0.674117,0.681850,0.749249,0.762162,0.799075,,,,59,J. Marcos,UST
79,0.278036,0.639996,0.680849,0.749249,0.763868,0.768439,0.771601,0.889350,0.967677,,,,63,E. Caunan,UST
80,0.250105,0.326712,0.458505,0.622424,0.632141,0.639996,0.646182,0.681520,0.848323,,,,78,I. Batalier,UST
