In [1]:
import pandas as pd
import math
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm_notebook

from recommender_system import BaseModel

In [116]:
def build_graph(path) -> nx.Graph:
    """
    Create graph representation of dataset
    Parameters
    ----------
    path : str
        Relative path to csv file
    """
    df = pd.read_csv(path, usecols=['uid1', 'uid2', 'target'])
    df.columns = ['uid1', 'uid2', 'weight']
    df['weight'] = 1 / df['weight']
    g = nx.from_pandas_edgelist(df, source='uid1', target='uid2', edge_attr=['weight'], create_using=nx.DiGraph())
    return g

In [117]:
train_df = pd.read_csv(f'data/validate/train_df.csv')
users = pd.read_csv(f'data/validate/train_index_map.csv', index_col=0, header=None)[1].values
test_df = pd.read_csv(f'data/validate/test_df.csv')
train_graph = build_graph(f'data/validate/train_df.csv')

In [3]:
train_df

Unnamed: 0,uid1,uid2,time,intensity,target
0,632788,2592653,1.000000,2.306888e-01,1.230689
1,3073342,6016278,1.000000,1.740404e-04,1.000174
2,13413071,5605047,1.000000,1.364075e-01,1.136408
3,3063103,15617763,1.000000,2.005034e-01,1.200503
4,10680843,6803928,1.000000,7.213303e-12,1.000000
...,...,...,...,...,...
725067,9606946,5388999,0.362229,5.489187e-03,1.005489
725068,15508128,5388999,0.362228,8.768662e-05,1.000088
725069,5388999,15508128,0.362228,8.765214e-05,1.000088
725070,1344176,13588536,0.362228,3.839385e-02,1.038394


In [4]:
test_df

Unnamed: 0,uid1,uid2,time,intensity,target
0,5453330,6643757,3.622275e-01,0.000123,1.000123
1,6643757,5453330,3.622275e-01,0.000132,1.000132
2,13433776,6326944,3.622274e-01,0.057462,1.057462
3,1179249,8900442,3.622274e-01,0.000029,1.000029
4,8900442,1179249,3.622274e-01,0.000018,1.000018
...,...,...,...,...,...
310741,16334984,14883929,4.134741e-05,0.525210,1.525210
310742,6237941,5731372,3.166315e-05,0.532890,1.532890
310743,5731372,6237941,3.166315e-05,0.485696,1.485696
310744,7775398,1319381,3.275974e-07,0.719708,1.719708


In [5]:
def skip_errors(f):
    def _arg_wrapper(uid1, uid2, g):
        try: 
            return f(uid1, uid2, g)
        except (KeyError, nx.NetworkXError):
            return np.nan
    return _arg_wrapper

@skip_errors
def CommonNeighbors(u, v, g):
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    return len(u_neighbors.intersection(v_neighbors))

@skip_errors
def AdamicAdar(u, v, g):
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    aa = 0
    for i in u_neighbors.intersection(v_neighbors):
        aa += 1 / math.log(len(list(g.neighbors(i))))
    return aa

@skip_errors
def ResourceAllocation(u, v, g):
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    ra = 0
    for i in u_neighbors.intersection(v_neighbors):
        ra += 1 / float(len(list(g.neighbors(i))))
    return ra

@skip_errors
def JaccardCoefficent(u, v, g):
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    return len(u_neighbors.intersection(v_neighbors)) / float(len(u_neighbors.union(v_neighbors)))

@skip_errors
def PreferentialAttachment(u, v, g):
    return len(list(g.neighbors(u))) * len(list(g.neighbors(v)))

In [6]:
feature_set = {'common_neighbors': CommonNeighbors, 
              'adamic_adar': AdamicAdar,
              'resourse_allocation': ResourceAllocation,
              'jaccard_coef': JaccardCoefficent,
              'pref_attachment': PreferentialAttachment,
              }


In [7]:
for feature in feature_set:
    test_df[feature] = test_df.apply(lambda x: feature_set[feature](x['uid1'], x['uid2'], train_graph), axis=1)

In [8]:
for feature in feature_set:
    train_df[feature] = train_df.apply(lambda x: feature_set[feature](x['uid1'], x['uid2'], train_graph), axis=1)

In [9]:
X_train, y_train = train_df.drop(columns=['uid1', 'uid2', 'time', 'intensity', 'target']), train_df['target']
X_test, y_test = train_df.drop(columns=['uid1', 'uid2', 'time', 'intensity', 'target']), train_df['target']

In [10]:
import lightgbm as lgb

In [11]:
tr = lgb.Dataset(np.array(X_train), np.array(y_train))
te = lgb.Dataset(np.array(X_train), np.array(y_train))

In [12]:
param_lgb = {
    'random_state': 0,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'boost': 'gbdt',
    'feature_fraction': 0.8,
    'learning_rate': 0.01,
    'metric':'auc',
    'num_leaves': 31,
    'num_threads': 8,
    'objective': 'regression',
    'metric': 'rmse'
}

In [13]:
bst = lgb.train(param_lgb, tr, num_boost_round=5000, 
          valid_sets=te, early_stopping_rounds=int(5 / param_lgb['learning_rate']), verbose_eval=100)

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's rmse: 0.0888325
[200]	valid_0's rmse: 0.088495
[300]	valid_0's rmse: 0.0883827
[400]	valid_0's rmse: 0.0883143
[500]	valid_0's rmse: 0.0882592
[600]	valid_0's rmse: 0.0882124
[700]	valid_0's rmse: 0.0881681
[800]	valid_0's rmse: 0.0881267
[900]	valid_0's rmse: 0.0880864
[1000]	valid_0's rmse: 0.0880479
[1100]	valid_0's rmse: 0.0880095
[1200]	valid_0's rmse: 0.0879726
[1300]	valid_0's rmse: 0.0879379
[1400]	valid_0's rmse: 0.0879021
[1500]	valid_0's rmse: 0.0878683
[1600]	valid_0's rmse: 0.0878351
[1700]	valid_0's rmse: 0.0878038
[1800]	valid_0's rmse: 0.0877708
[1900]	valid_0's rmse: 0.0877392
[2000]	valid_0's rmse: 0.0877074
[2100]	valid_0's rmse: 0.0876758
[2200]	valid_0's rmse: 0.0876453
[2300]	valid_0's rmse: 0.0876153
[2400]	valid_0's rmse: 0.0875843
[2500]	valid_0's rmse: 0.0875535
[2600]	valid_0's rmse: 0.0875232
[2700]	valid_0's rmse: 0.0874931
[2800]	valid_0's rmse: 0.0874644
[2900]	valid_0's rmse: 

In [14]:
def predict(user):
    user_friends = list(train_graph.neighbors(user)) 
    new_df = pd.DataFrame({'uid1': user, 'uid2': list(set(users) - set(user_friends) - {user})})
    for feature in feature_set:
        new_df[feature] = new_df.apply(lambda x: feature_set[feature](x['uid1'], x['uid2'], train_graph), axis=1)
    pred = bst.predict(new_df.drop(columns=['uid1', 'uid2']))
    return pd.Series(pred, index=new_df['uid2']).sort_values(ascending=False)

In [15]:
predict(6643757)

uid2
6516347     1.195799
11835379    1.149712
14001360    1.149712
9602549     1.149712
3442162     1.149712
              ...   
7274553     1.039421
7976454     1.039411
5201231     1.039363
1586393     1.039196
16543998    1.038130
Length: 38987, dtype: float64

In [16]:
test_df[test_df['uid1'] == 6643757]

Unnamed: 0,uid1,uid2,time,intensity,target,common_neighbors,adamic_adar,resourse_allocation,jaccard_coef,pref_attachment
1,6643757,5453330,0.362227,0.000132,1.000132,1.0,0.378923,0.071429,0.125,20.0


In [17]:
bst.feature_importance(), X_train.columns

(array([18260, 29566, 35745, 31935, 34494]),
 Index(['common_neighbors', 'adamic_adar', 'resourse_allocation',
        'jaccard_coef', 'pref_attachment'],
       dtype='object'))

In [107]:
def predict(user):
    if user in train_graph.nodes:
        new_df = pd.DataFrame({'uid1': user, 
                               'uid2': list(set(np.unique([y for x in users[user] for y in x])))})
        if not new_df.empty:
            for feature in feature_set:
                new_df[feature] = new_df.apply(lambda x: feature_set[feature](x['uid1'], x['uid2'], train_graph), axis=1)
            pred = bst.predict(new_df.drop(columns=['uid1', 'uid2']))
            pred = new_df['uid2'].iloc[pred.argsort()[-5:]].values[::-1]
            if len(pred) == 5:
                return pred
            else:
                for i in range(5 - len(pred)):
                    pred = np.append(pred, 0)
                return pred
        else:
            return np.array([18260, 29566, 35745, 31935, 34494])
    else:
        return np.array([18260, 29566, 35745, 31935, 34494])

In [59]:
def get_second_level_neighbors(graph) -> dict:
    neighbors_dict = {}
    for node in sorted(graph.nodes):
        bfs = nx.bfs_successors(graph, node)
        neighbors = next(bfs)[1]
        second_level = []
        for i in neighbors:
            try:
                neigh = next(bfs)[1]
                second_level.append(neigh) 
            except StopIteration:
                break
        neighbors_dict[node] = second_level
    return neighbors_dict

In [60]:
users = get_second_level_neighbors(train_graph)

In [81]:
test = test_df.groupby('uid1').groups

In [113]:
predictions = []
for user in tqdm_notebook(list(test.keys())[:10000]):
    predictions.append(predict(user))

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

In [76]:
from metrics import mapk

In [115]:
mapk([list(x) for x in list(test.values())[:10000]], predictions, k=5)

0.0