In [1]:
import os.path
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

In [2]:
pd.set_option('display.float_format', lambda x: '%.6f' % x)

## Load data

In [3]:
def load_data(dirname, filename):
    currdir = os.getcwd()
    datadir = os.path.abspath(os.path.join(currdir, dirname))
    return os.path.abspath(os.path.join(datadir, filename))

In [4]:
datadir = '../../../outputs/spatial-basis'
datafile = 'nmf_weights-players-retained.csv'
outdir = '../../../outputs/nearest-neighbor'

df = pd.read_csv(os.path.join(datadir, datafile))

In [5]:
df_norm = df.drop(columns=['b{}'.format(str(x)) for x in range(1, df.shape[1]//2)])
df_norm
# print(range(1, df.shape[1]//2))

Unnamed: 0,p_id_retained,player,team,b1_norm,b2_norm,b3_norm,b4_norm,b5_norm
0,0,A. Pasaol,UE,0.473871,0.272447,0.113510,0.000002,0.140169
1,1,A. Melecio,DLSU,0.175074,0.274262,0.204652,0.122723,0.223288
2,2,R. Subido,UST,0.083097,0.414097,0.115078,0.102125,0.285603
3,3,D. Ildefonso,NU,0.308937,0.264363,0.110523,0.094570,0.221607
4,4,J. Ahanmisi,ADU,0.043059,0.321835,0.163749,0.355734,0.115624
...,...,...,...,...,...,...,...,...
77,77,F. Jaboneta,UP,0.405794,0.000006,0.000000,0.000000,0.594200
78,78,I. Batalier,UST,0.253076,0.242191,0.138705,0.366028,0.000000
79,79,C. Vito,UP,0.374604,0.299957,0.000000,0.000028,0.325411
80,80,S. Akomo,UST,0.970732,0.000000,0.029268,0.000000,0.000000


## Nearest neighbor

### League-wide

#### Run NN
- convert player basis weights to numpy array
- compute distances and indices of k nearest neighbors for each player

In [6]:
weights_nn_league = df_norm.drop(columns=['p_id_retained', 'player', 'team']).to_numpy()

In [7]:
neighbors = 6
nn_distances = []
nn_indices = []

nn_league = NearestNeighbors(n_neighbors=neighbors, metric='minkowski')
nn_league.fit(weights_nn_league)

for i in range(weights_nn_league.shape[0]):
    nn_league_neighbors = nn_league.kneighbors([weights_nn_league[i]], neighbors)
    nn_distances.append(nn_league_neighbors[0][0])
    nn_indices.append(nn_league_neighbors[1][0])
    
df_nn_distances = pd.DataFrame(nn_distances, columns=['dist_{}'.format(str(x)) for x in range(neighbors)]).drop(columns=['dist_0'])
df_nn_indices = pd.DataFrame(nn_indices, columns=['nn_{}'.format(str(x)) for x in range(neighbors)]).drop(columns=['nn_0'])
df_nn_di = pd.concat([df_nn_indices, df_nn_distances], axis=1)

# df_nn_distances.to_csv(
#     'outputs/data/nn_distances-league-retained.csv', index_label='p_id_retained'
#     )
# df_nn_indices.to_csv(
#     'outputs/data/nn_indices-league-retained.csv', index_label='p_id_retained'
#     )
# df_nn_di.to_csv(
#     'outputs/data/nn_distances-indices-league-retained.csv', index_label='p_id_retained'
#     )

In [8]:
pd.concat([df_nn_indices, df_nn_distances], axis=1)

Unnamed: 0,nn_1,nn_2,nn_3,nn_4,nn_5,dist_1,dist_2,dist_3,dist_4,dist_5
0,65,15,43,28,34,0.159468,0.162775,0.166025,0.170956,0.185396
1,61,5,22,18,3,0.102868,0.126595,0.153864,0.164776,0.166352
2,49,1,22,60,73,0.147785,0.200859,0.205315,0.217921,0.242728
3,7,5,43,62,9,0.071823,0.134706,0.143501,0.152941,0.156262
4,19,17,42,78,46,0.206130,0.213493,0.228939,0.254071,0.257300
...,...,...,...,...,...,...,...,...,...,...
77,47,21,40,34,31,0.304197,0.331368,0.342489,0.383615,0.388351
78,17,67,11,64,62,0.183788,0.186913,0.242226,0.248489,0.249027
79,40,7,73,10,3,0.087356,0.172641,0.173161,0.183175,0.193670
80,52,6,54,68,72,0.027691,0.038756,0.041392,0.069060,0.209610


In [9]:
# df_nn_indices['p_id_retained'] = df_norm.p_id_retained
df_nn_indices['player'] = df_norm.player
df_nn_indices['team'] = df_norm.team

# df_nn_distances['p_id_retained'] = df_norm.p_id_retained
df_nn_distances['player'] = df_norm.player
df_nn_distances['team'] = df_norm.team

df_nn_di['player'] = df_norm.player
df_nn_di['team'] = df_norm.team

In [10]:
df_nn_distances.to_csv(
    '{}/nn_distances-league-retained-with-pt.csv'.format(outdir), index_label='p_id_retained'
    )
df_nn_distances

Unnamed: 0,dist_1,dist_2,dist_3,dist_4,dist_5,player,team
0,0.159468,0.162775,0.166025,0.170956,0.185396,A. Pasaol,UE
1,0.102868,0.126595,0.153864,0.164776,0.166352,A. Melecio,DLSU
2,0.147785,0.200859,0.205315,0.217921,0.242728,R. Subido,UST
3,0.071823,0.134706,0.143501,0.152941,0.156262,D. Ildefonso,NU
4,0.206130,0.213493,0.228939,0.254071,0.257300,J. Ahanmisi,ADU
...,...,...,...,...,...,...,...
77,0.304197,0.331368,0.342489,0.383615,0.388351,F. Jaboneta,UP
78,0.183788,0.186913,0.242226,0.248489,0.249027,I. Batalier,UST
79,0.087356,0.172641,0.173161,0.183175,0.193670,C. Vito,UP
80,0.027691,0.038756,0.041392,0.069060,0.209610,S. Akomo,UST


In [11]:
df_nn_indices.to_csv(
    '{}/nn_indices-league-retained-with-pt.csv'.format(outdir), index_label='p_id_retained'
    )
df_nn_indices

Unnamed: 0,nn_1,nn_2,nn_3,nn_4,nn_5,player,team
0,65,15,43,28,34,A. Pasaol,UE
1,61,5,22,18,3,A. Melecio,DLSU
2,49,1,22,60,73,R. Subido,UST
3,7,5,43,62,9,D. Ildefonso,NU
4,19,17,42,78,46,J. Ahanmisi,ADU
...,...,...,...,...,...,...,...
77,47,21,40,34,31,F. Jaboneta,UP
78,17,67,11,64,62,I. Batalier,UST
79,40,7,73,10,3,C. Vito,UP
80,52,6,54,68,72,S. Akomo,UST


In [12]:
df_nn_di.to_csv(
    '{}/nn_distances-indices-league-retained-with-pt.csv'.format(outdir), index_label='p_id_retained'
    )
df_nn_di

Unnamed: 0,nn_1,nn_2,nn_3,nn_4,nn_5,dist_1,dist_2,dist_3,dist_4,dist_5,player,team
0,65,15,43,28,34,0.159468,0.162775,0.166025,0.170956,0.185396,A. Pasaol,UE
1,61,5,22,18,3,0.102868,0.126595,0.153864,0.164776,0.166352,A. Melecio,DLSU
2,49,1,22,60,73,0.147785,0.200859,0.205315,0.217921,0.242728,R. Subido,UST
3,7,5,43,62,9,0.071823,0.134706,0.143501,0.152941,0.156262,D. Ildefonso,NU
4,19,17,42,78,46,0.206130,0.213493,0.228939,0.254071,0.257300,J. Ahanmisi,ADU
...,...,...,...,...,...,...,...,...,...,...,...,...
77,47,21,40,34,31,0.304197,0.331368,0.342489,0.383615,0.388351,F. Jaboneta,UP
78,17,67,11,64,62,0.183788,0.186913,0.242226,0.248489,0.249027,I. Batalier,UST
79,40,7,73,10,3,0.087356,0.172641,0.173161,0.183175,0.193670,C. Vito,UP
80,52,6,54,68,72,0.027691,0.038756,0.041392,0.069060,0.209610,S. Akomo,UST


#### Enforce symmetry
- if player k is a neighbor of player j then player j should be a neighbor of player k
- get list of neighbors
    - get original k neighbors
    - for each player j, check if k is a neighbor
        - if yes, add j to k's neighbors 

In [13]:
p_neighbors = []
num_players, num_cols = df_nn_indices.shape
nn_cols = ['nn_{}'.format(str(x)) for x in range (1, num_cols-2)]

for i in range(num_players):
    p_neighbors_list = df_nn_indices[df_nn_indices.player==df_nn_indices['player'][i]][nn_cols].values.tolist()[0]
    p = df_nn_indices[df_nn_indices.player==df_nn_indices['player'][i]].player.values[0]
    t = df_nn_indices[df_nn_indices.team==df_nn_indices['team'][i]].team.values[0]
    for j in range(num_players):
        if i in df_nn_indices[df_nn_indices.player==df_nn_indices['player'][j]][nn_cols].values:
            p_neighbors_list.append(j)
    p_neighbors.append([i, p, t, list(set(p_neighbors_list))])

In [14]:
df_p_neighbors = pd.DataFrame(p_neighbors, columns=['p_id_retained', 'player', 'team', 'neighbors'])

df_p_neighbors.to_csv(
    '{}/nn_players.csv'.format(outdir), index=False
)

In [15]:
df_p_neighbors_stats = df_p_neighbors.copy(deep=True)

# get number of neighbors
df_p_neighbors_stats['num_neighbors'] = df_p_neighbors_stats.apply(lambda row: len(row.neighbors), axis=1)
df_p_neighbors_stats.to_csv(
    '{}/nn_players_2.csv'.format(outdir), index=False
)

In [16]:
df_p_neighbors_stats

Unnamed: 0,p_id_retained,player,team,neighbors,num_neighbors
0,0,A. Pasaol,UE,"[65, 34, 43, 14, 15, 25, 27, 28]",8
1,1,A. Melecio,DLSU,"[2, 5, 17, 18, 22, 61]",6
2,2,R. Subido,UST,"[1, 37, 42, 49, 22, 60]",6
3,3,D. Ildefonso,NU,"[33, 34, 5, 7, 9, 43, 62]",7
4,4,J. Ahanmisi,ADU,"[42, 44, 78, 17, 19]",5
...,...,...,...,...,...
77,77,F. Jaboneta,UP,"[40, 34, 21, 47]",4
78,78,I. Batalier,UST,"[64, 67, 4, 11, 17, 59]",6
79,79,C. Vito,UP,"[40, 73, 10, 7]",4
80,80,S. Akomo,UST,"[54, 52, 6, 68]",4


### Per-team

#### Initialize variables

In [17]:
team_names = np.unique(df_norm.team.values)
weights_teams = []
b_cols = ['b{}_norm'.format(str(x)) for x in range (1, df_norm.shape[1] - 2)]

# weights_team = [df_nn_distances[df_nn_distances.team==t][dist_cols] for t in team_names]
for t in team_names:
    df_norm[df_norm.team==t].to_csv('{}/nmf_weights-per-team-{}.csv'.format(outdir, t), index=False)
    weights_teams.append(df_norm[df_norm.team==t][b_cols])

#### Run NN
- convert player basis weights to numpy array
- compute distances and indices to all teammates

In [18]:
nn_distances_t = []
nn_indices_t = []

max_p = len(max(weights_teams, key=len))

for t in weights_teams:
    p = len(t)
    nn_team = NearestNeighbors(n_neighbors=p, metric='minkowski')
    nn_team.fit(t)

#     wt = t
#     for i in wt.index:
#         nn_team_neighbors = nn_team.kneighbors([wt[wt.index==i]][0], p)
#         nn_distances_t.append(nn_team_neighbors[0][0])
#         nn_indices_t.append(nn_team_neighbors[1][0])
        
    wt = t.to_numpy()
    for i in range(p):
        nn_team_neighbors = nn_team.kneighbors([wt[i]], p)
        nn_distances_t.append(nn_team_neighbors[0][0])
        nn_indices_t.append(nn_team_neighbors[1][0])

# df_nn_distances_team = pd.DataFrame(nn_distances_t)

df_nn_distances_team = pd.DataFrame(nn_distances_t, columns=['dist_{}'.format(str(x)) for x in range(max_p)]).drop(columns=['dist_0'])
df_nn_indices_team = pd.DataFrame(nn_indices_t, columns=['nn_{}'.format(str(x)) for x in range(max_p)]).drop(columns=['nn_0'])

In [19]:
df_nn_distances_team['p_id_retained'] = df_norm.sort_values(by=["team", "p_id_retained"]).p_id_retained.reset_index(drop=True)
df_nn_distances_team['player'] = df_norm.sort_values(by=["team", "p_id_retained"]).player.reset_index(drop=True)
df_nn_distances_team['team'] = df_norm.sort_values(by=["team", "p_id_retained"]).team.reset_index(drop=True)

df_nn_indices_team['p_id_retained'] = df_norm.sort_values(by=["team", "p_id_retained"]).p_id_retained.reset_index(drop=True)
df_nn_indices_team['player'] = df_norm.sort_values(by=["team", "p_id_retained"]).player.reset_index(drop=True)
df_nn_indices_team['team'] = df_norm.sort_values(by=["team", "p_id_retained"]).team.reset_index(drop=True)

In [20]:
df_nn_distances_team.to_csv(
    '{}/nn_distances-team-retained.csv'.format(outdir), index=False
    )
df_nn_indices_team.to_csv(
    '{}/nn_indices-team-retained.csv'.format(outdir), index=False
    )

In [21]:
df_nn_distances_team

Unnamed: 0,dist_1,dist_2,dist_3,dist_4,dist_5,dist_6,dist_7,dist_8,dist_9,dist_10,dist_11,dist_12,p_id_retained,player,team
0,0.220214,0.242194,0.381298,0.409315,0.432584,0.460951,0.464572,0.488133,0.557757,0.558106,0.670134,0.876765,14,T. Ravena,ADMU
1,0.464572,0.517326,0.550284,0.560439,0.625285,0.728686,0.848459,0.870397,0.877329,0.889061,0.905776,0.939593,16,A. Kouame,ADMU
2,0.215080,0.333443,0.380905,0.397503,0.432584,0.467734,0.522222,0.531178,0.560439,0.561423,0.578850,0.727167,24,R. Verano,ADMU
3,0.262162,0.338692,0.422441,0.490974,0.505501,0.522222,0.582220,0.606256,0.670134,0.682097,0.766945,0.870397,30,A. Asistio,ADMU
4,0.187180,0.220214,0.260977,0.288103,0.348088,0.352951,0.380905,0.382968,0.385127,0.505501,0.625285,0.783162,39,I. Go,ADMU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,0.278036,0.485833,0.503799,0.622415,0.674120,0.776057,0.791164,0.905243,0.961947,,,,48,G. Mahinay,UST
78,0.250115,0.289063,0.559718,0.565340,0.674120,0.681851,0.749251,0.762159,0.799078,,,,59,J. Marcos,UST
79,0.278036,0.639989,0.680852,0.749251,0.763869,0.768440,0.771599,0.889352,0.967675,,,,63,E. Caunan,UST
80,0.250115,0.326726,0.458494,0.622415,0.632140,0.639989,0.646166,0.681517,0.848316,,,,78,I. Batalier,UST
