In [351]:
import pandas as pd
import math
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm_notebook

from recommender_system import BaseModel

In [352]:
def build_graph(path) -> nx.Graph:
    """
    Create graph representation of dataset
    Parameters
    ----------
    path : str
        Relative path to csv file
    """
    df = pd.read_csv(path, usecols=['uid1', 'uid2', 'target'])
    df.columns = ['uid1', 'uid2', 'weight']
    df['weight'] = 1 / df['weight']
    g = nx.from_pandas_edgelist(df, source='uid1', target='uid2', edge_attr=['weight'], create_using=nx.DiGraph())
    return g

In [353]:
train_df = pd.read_csv(f'data/validate/train_df.csv')
users = pd.read_csv(f'data/validate/train_index_map.csv', index_col=0, header=None)[1].values
test_df = pd.read_csv(f'data/validate/test_df.csv')
train_graph = build_graph(f'data/validate/train_df.csv')

In [354]:
import numpy as np
np.random.seed(0)

In [355]:
train_df['target'] = 1
train_df = train_df.drop(columns=['intensity', 'time'])

In [356]:
test_df['target'] = 1
test_df = test_df.drop(columns=['intensity', 'time'])

In [357]:
p = train_df[['uid1', 'uid2']].stack().value_counts(normalize=True)
train_neg = pd.DataFrame({'uid1': np.random.choice(a=p.index.values, size=300000, p=p.values), 
                          'uid2': np.random.choice(a=p.index.values, size=300000, p=p.values),
                          'target': [0] * 300000})

In [358]:
p = test_df[['uid1', 'uid2']].stack().value_counts(normalize=True)
test_neg = pd.DataFrame({'uid1': np.random.choice(a=p.index.values, size=100000, p=p.values), 
                         'uid2': np.random.choice(a=p.index.values, size=100000, p=p.values),
                         'target': [0] * 100000})

In [359]:
def skip_errors(f):
    def _arg_wrapper(uid1, uid2, g):
        try: 
            return f(uid1, uid2, g)
        except (KeyError, nx.NetworkXError, ValueError, ZeroDivisionError, nx.NodeNotFound):
            return np.nan
    return _arg_wrapper

@skip_errors
def CommonNeighbors(u, v, g):
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    return len(u_neighbors.intersection(v_neighbors))

@skip_errors
def AdamicAdar(u, v, g):
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    aa = 0
    for i in u_neighbors.intersection(v_neighbors):
        aa += 1 / math.log(len(list(g.neighbors(i))))
    return aa

@skip_errors
def ResourceAllocation(u, v, g):
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    ra = 0
    for i in u_neighbors.intersection(v_neighbors):
        ra += 1 / float(len(list(g.neighbors(i))))
    return ra

@skip_errors
def JaccardCoefficent(u, v, g):
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    return len(u_neighbors.intersection(v_neighbors)) / float(len(u_neighbors.union(v_neighbors)))

@skip_errors
def PreferentialAttachment(u, v, g):
    return len(list(g.neighbors(u))) * len(list(g.neighbors(v)))

In [360]:
feature_set = {
    'common_neighbors': CommonNeighbors, 
    'adamic_adar': AdamicAdar,
    'resourse_allocation': ResourceAllocation,
    'jaccard_coef': JaccardCoefficent,
    'pref_attachment': PreferentialAttachment,
}

In [361]:
train_df = pd.concat([train_df, train_neg], axis=0).sample(frac=1)
test_df = pd.concat([test_df, test_neg], axis=0).sample(frac=1)

In [362]:
train_graph = nx.from_pandas_edgelist(train_df, source='uid1', target='uid2', create_using=nx.DiGraph())

In [363]:
for feature in feature_set:
    test_df[feature] = test_df.apply(lambda x: feature_set[feature](x['uid1'], x['uid2'], train_graph), axis=1)

In [364]:
for feature in feature_set:
    train_df[feature] = train_df.apply(lambda x: feature_set[feature](x['uid1'], x['uid2'], train_graph), axis=1)

In [365]:
X_train, y_train = train_df.drop(columns=['uid1', 'uid2', 'target']), train_df['target']
X_test, y_test = test_df.drop(columns=['uid1', 'uid2', 'target']), test_df['target']

In [366]:
import lightgbm as lgb

In [367]:
tr = lgb.Dataset(np.array(X_train), np.array(y_train))
te = lgb.Dataset(np.array(X_test), np.array(y_test))

In [368]:
param_lgb = {
    'random_state': 0,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'boost': 'gbdt',
    'feature_fraction': 0.8,
    'learning_rate': 0.01,
    'metric':'auc',
    'num_leaves': 31,
    'num_threads': 8,
    'objective': 'binary',
    'metric': 'auc'
}

In [369]:
bst = lgb.train(param_lgb, tr, num_boost_round=5000, 
          valid_sets=te, early_stopping_rounds=int(5 / param_lgb['learning_rate']), verbose_eval=100)

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's auc: 0.610555
[200]	valid_0's auc: 0.607249
[300]	valid_0's auc: 0.604816
[400]	valid_0's auc: 0.599304
[500]	valid_0's auc: 0.594563
Early stopping, best iteration is:
[6]	valid_0's auc: 0.618067


In [370]:
def get_second_level_neighbors(graph) -> dict:
    neighbors_dict = {}
    for node in tqdm_notebook(sorted(graph.nodes)):
        neighbors = set(graph.neighbors(node))
        second_level = []
        for i in neighbors:
            second_level += list(graph.neighbors(i))
        second_level = Counter(second_level)
        del second_level[node] 
        for word in list(second_level):
            if word in neighbors:
                del second_level[word] 
        neighbors_dict[node] = second_level
    return neighbors_dict

In [371]:
users = get_second_level_neighbors(train_graph)

HBox(children=(IntProgress(value=0, max=38993), HTML(value='')))

In [372]:
popular = train_df[['uid1', 'uid2']].stack().value_counts()
popular = list(popular.index)[:5]

In [378]:
def predict(user):
    if user in train_graph.nodes:
        pred = np.array(users[user].most_common()[:5])
        if len(pred) > 0:
            pred = pred[:, 0]
        if len(pred) == 5:
            return pred
        else:
            for i in range(5 - len(pred)):
                pred = np.append(pred, popular[i])
            return pred
    else:
        return popular

In [379]:
test_graph = BaseModel().build_graph(f'data/validate/test_df.csv')
test_users = BaseModel().get_neighbors_dict(test_graph)

In [380]:
predictions = []
for user in tqdm_notebook(list(test_users.keys())):
    predictions.append(predict(user))

HBox(children=(IntProgress(value=0, max=26956), HTML(value='')))

In [376]:
from metrics import mapk

In [381]:
mapk([list(x) for x in list(test_users.values())], predictions, k=5)

0.019787983943326235