In [1]:
import pandas as pd
import math
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm_notebook
from scipy.sparse import csr_matrix, save_npz
from sklearn.metrics.pairwise import linear_kernel

from recommender_system import BaseModel

In [2]:
def build_graph(df) -> nx.Graph:
    g = nx.from_pandas_edgelist(df, source='uid1', target='uid2', create_using=nx.DiGraph())
    return g

def create_interaction_matrix(df, target_col='target') -> pd.DataFrame:
    interaction_matrix = df[['uid1', 'uid2', target_col]].pivot(index='uid1', columns='uid2')
    interaction_matrix.columns = interaction_matrix.columns.droplevel(0)
    full_index = interaction_matrix.index.union(interaction_matrix.columns).sort_values()
    interaction_matrix = interaction_matrix.reindex(index=full_index, columns=full_index)
    interaction_matrix.update(interaction_matrix.T)
    return interaction_matrix.fillna(0)

In [3]:
df = pd.read_csv(f'data/prod/df.csv')

In [4]:
ldf = len(df)
train_df = df.iloc[:int(0.6 * ldf)]
val_df = df.iloc[int(0.6 * ldf):int(0.8 * ldf)]
test_df = df.iloc[int(0.8 * ldf):]

In [5]:
train_graph = build_graph(train_df)
val_graph = build_graph(val_df)
test_graph = build_graph(test_df)

In [6]:
train_matrix = create_interaction_matrix(train_df)
map_index = train_matrix.index.to_series()
train_csr = csr_matrix(train_matrix)

In [7]:
pred = linear_kernel(train_csr, train_csr)

In [10]:
neighbors = {node: list(train_graph.neighbors(node)) for node in sorted(train_graph.nodes)}

In [16]:
int2uid = map_index.reset_index(drop=True)
uid2int = pd.Series(int2uid.index.values, index=int2uid.values)

In [21]:
for ind, uid in enumerate(int2uid):
    pred[ind, ind] = -1
    if uid in neighbors:
        friends_uids = neighbors[uid]
        friends_idx = uid2int.loc[friends_uids].dropna().astype(int).values
        pred[ind, friends_idx] = -1
sorted_recs = pred.argsort()[:, ::-1]

In [144]:
train_generated = pd.DataFrame()
for i, uid_list in enumerate(sorted_recs[:, :5]):
    for uid2 in uid_list:
        train_generated = train_generated.append(pd.Series({'uid1': int(int2uid.loc[i]), 
                                                            'uid2': int(int2uid.loc[uid2])}), 
                                                 ignore_index=True)
train_generated['target'] = 0

In [145]:
val_df = val_df[['uid1', 'uid2']].sort_values('uid1').reset_index(drop=True)

In [147]:
val_df

Unnamed: 0,uid1,uid2
0,1074,621625
1,1074,9741441
2,1074,12847947
3,1074,8669044
4,1074,550524
...,...,...
207159,16776095,12484165
207160,16776618,7834800
207161,16776618,3015439
207162,16776618,6909990


In [146]:
train_generated[train_generated.isin(val_df)].dropna()

Unnamed: 0,uid1,uid2,target


In [None]:
p = test_df[['uid1', 'uid2']].stack().value_counts(normalize=True)
test_generated = pd.DataFrame({'uid1': np.random.choice(a=p.index.values, size=100000, p=p.values), 
                         'uid2': np.random.choice(a=p.index.values, size=100000, p=p.values),
                         'target': [0] * 100000})
test_generated['target'] = 0

In [177]:
train_df = train_df[['uid1', 'uid2', 'target']]
train_df['target'] = 1

test_df = test_df[['uid1', 'uid2', 'target']]
test_df['target'] = 1

In [54]:
def skip_errors(f):
    def _arg_wrapper(uid1, uid2, g):
        try: 
            return f(uid1, uid2, g)
        except (KeyError, nx.NetworkXError, ValueError, ZeroDivisionError):
            return np.nan
    return _arg_wrapper

@skip_errors
def CommonNeighbors(u, v, g):
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    return len(u_neighbors.intersection(v_neighbors))

@skip_errors
def AdamicAdar(u, v, g):
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    aa = 0
    for i in u_neighbors.intersection(v_neighbors):
        aa += 1 / math.log(len(list(g.neighbors(i))))
    return aa

@skip_errors
def ResourceAllocation(u, v, g):
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    ra = 0
    for i in u_neighbors.intersection(v_neighbors):
        ra += 1 / float(len(list(g.neighbors(i))))
    return ra

@skip_errors
def JaccardCoefficent(u, v, g):
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    return len(u_neighbors.intersection(v_neighbors)) / float(len(u_neighbors.union(v_neighbors)))

@skip_errors
def PreferentialAttachment(u, v, g):
    return len(list(g.neighbors(u))) * len(list(g.neighbors(v)))

In [178]:
feature_set = {
    'common_neighbors': CommonNeighbors, 
    'adamic_adar': AdamicAdar,
    'resourse_allocation': ResourceAllocation,
    'jaccard_coef': JaccardCoefficent,
    'pref_attachment': PreferentialAttachment,
}

In [179]:
train_full = pd.concat([train_df, train_generated], axis=0).sample(frac=1)
test_full = pd.concat([test_df, test_generated], axis=0).sample(frac=1)

In [180]:
train_graph = nx.from_pandas_edgelist(train_full, source='uid1', target='uid2', create_using=nx.DiGraph())

In [181]:
for feature in feature_set:
    test_full[feature] = test_full.apply(lambda x: feature_set[feature](x['uid1'], x['uid2'], train_graph), axis=1)

In [182]:
for feature in feature_set:
    train_full[feature] = train_full.apply(lambda x: feature_set[feature](x['uid1'], x['uid2'], train_graph), axis=1)

In [183]:
X_train, y_train = train_full.drop(columns=['uid1', 'uid2', 'target']), train_full['target']
X_test, y_test = test_full.drop(columns=['uid1', 'uid2', 'target']), test_full['target']

In [157]:
train_full[y_train == 1]

Unnamed: 0,uid1,uid2,target,common_neighbors,adamic_adar,resourse_allocation,jaccard_coef,pref_attachment
374530,7937320.0,5247210.0,1,1,0.225091,0.011765,0.014286,1258
143163,11547619.0,12434929.0,1,16,5.287764,0.860761,0.432432,696
41297,16463184.0,7630807.0,1,19,5.161692,0.633008,0.117284,8034
228731,14457819.0,10492165.0,1,3,1.075844,0.224826,0.044118,330
176199,6916732.0,2761337.0,1,1,0.352956,0.058824,0.020833,598
...,...,...,...,...,...,...,...,...
441584,3689998.0,11251119.0,1,5,1.522909,0.194045,0.227273,126
487866,15102428.0,1932411.0,1,27,7.697645,0.899554,0.675000,1116
77038,16613271.0,1827564.0,1,16,4.427774,0.437103,0.275862,1113
365735,582827.0,9700184.0,1,0,0.000000,0.000000,0.000000,551


In [160]:
test_full[y_test == 0]

Unnamed: 0,uid1,uid2,target,common_neighbors,adamic_adar,resourse_allocation,jaccard_coef,pref_attachment
10329,5149054,1174369,0,0.0,0.00000,0.000000,0.000000,640.0
26507,2245293,10847127,0,0.0,0.00000,0.000000,0.000000,246.0
15327,1780978,13447800,0,0.0,0.00000,0.000000,0.000000,42.0
17830,14995786,7724782,0,,,,,
9895,663493,3450747,0,1.0,0.16345,0.002203,0.010753,840.0
...,...,...,...,...,...,...,...,...
76651,3145889,1193907,0,,,,,
70468,3016773,13344986,0,0.0,0.00000,0.000000,0.000000,104.0
68219,14531303,10229461,0,,,,,
62161,3421751,2321420,0,0.0,0.00000,0.000000,0.000000,128.0


In [161]:
import lightgbm as lgb

In [184]:
tr = lgb.Dataset(np.array(X_train), np.array(y_train))
te = lgb.Dataset(np.array(X_test), np.array(y_test))

In [171]:
param_lgb = {
    'random_state': 0,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'boost': 'gbdt',
    'feature_fraction': 0.8,
    'learning_rate': 0.01,
    'metric':'auc',
    'num_leaves': 31,
    'num_threads': 8,
    'objective': 'binary',
    'metric': 'auc'
}

In [185]:
bst = lgb.train(param_lgb, tr, num_boost_round=5000, 
          valid_sets=te, early_stopping_rounds=int(5 / param_lgb['learning_rate']), verbose_eval=100)

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's auc: 0.217663
[200]	valid_0's auc: 0.219152
[300]	valid_0's auc: 0.219147
[400]	valid_0's auc: 0.218759
[500]	valid_0's auc: 0.218344
Early stopping, best iteration is:
[6]	valid_0's auc: 0.224607


In [186]:
test_full

Unnamed: 0,uid1,uid2,target,common_neighbors,adamic_adar,resourse_allocation,jaccard_coef,pref_attachment
1001167,8981246.0,8896527.0,1,,,,,
1002219,3178047.0,13840364.0,1,,,,,
910233,10619989.0,13567508.0,1,2.0,0.346083,0.006321,0.016949,2156.0
48864,7363235.0,1248131.0,0,6.0,2.210397,0.416517,0.046154,3948.0
832801,12496248.0,14098553.0,1,5.0,1.104617,0.060473,0.028736,3588.0
...,...,...,...,...,...,...,...,...
16130,2397484.0,8794846.0,0,2.0,0.769989,0.149733,0.090909,135.0
838509,8743485.0,12142359.0,1,1.0,0.254335,0.019608,0.012346,1425.0
59619,9062839.0,12885852.0,0,8.0,2.388924,0.286497,0.266667,312.0
92186,14028300.0,3229032.0,0,8.0,1.888481,0.139370,0.109589,1638.0


In [20]:
def predict(user):
    user_friends = list(train_graph.neighbors(user)) 
    new_df = pd.DataFrame({'uid1': user, 'uid2': list(set(users) - set(user_friends) - {user})})
    for feature in feature_set:
        new_df[feature] = new_df.apply(lambda x: feature_set[feature](x['uid1'], x['uid2'], train_graph), axis=1)
    pred = bst.predict(new_df.drop(columns=['uid1', 'uid2']))
    return pd.Series(pred, index=new_df['uid2']).sort_values(ascending=False)

In [126]:
def get_second_level_neighbors(graph) -> dict:
    neighbors_dict = {}
    for node in tqdm_notebook(sorted(graph.nodes)):
        neighbors = set(graph.neighbors(node))
        second_level = []
        for i in neighbors:
            second_level += list(graph.neighbors(i))
        second_level = Counter(second_level)
        del second_level[node] 
        for word in list(second_level):
            if word in neighbors:
                del second_level[word] 
        neighbors_dict[node] = second_level
    return neighbors_dict

In [127]:
users = get_second_level_neighbors(train_graph)

HBox(children=(IntProgress(value=0, max=38993), HTML(value='')))

In [178]:
popular = train_df[['uid1', 'uid2']].stack().value_counts()
popular = list(popular.index)[:5]

In [189]:
def predict(user):
    if user in train_graph.nodes:
        neighbors = np.array(users[user].most_common(50))
        if len(neighbors) > 0:
            neighbors = neighbors[:, 0]
        new_df = pd.DataFrame({'uid1': user, 
                               'uid2': neighbors})
        if not new_df.empty:
            for feature in feature_set:
                new_df[feature] = new_df.apply(lambda x: feature_set[feature](x['uid1'], x['uid2'], train_graph), axis=1)
            pred = bst.predict(new_df.drop(columns=['uid1', 'uid2']))
            pred = new_df['uid2'].iloc[pred.argsort()[-5:]].values[::-1]
            if len(pred) == 5:
                return pred
            else:
                for i in range(5 - len(pred)):
                    pred = np.append(pred, popular[i])
                return pred
        else:
            return popular
    else:
        return popular

In [184]:
def predict(user):
    if user in train_graph.nodes:
        pred = np.array(users[user].most_common()[:5])
        if len(pred) > 0:
            pred = pred[:, 0]
        if len(pred) == 5:
            return pred
        else:
            for i in range(5 - len(pred)):
                pred = np.append(pred, popular[i])
            return pred
    else:
        return popular

In [153]:
test_graph = BaseModel().build_graph(f'data/validate/test_df.csv')
test_users = BaseModel().get_neighbors_dict(test_graph)

In [190]:
predictions = []
for user in tqdm_notebook(list(test_users.keys())):
    predictions.append(predict(user))

HBox(children=(IntProgress(value=0, max=26956), HTML(value='')))

In [182]:
from metrics import mapk

In [191]:
mapk([list(x) for x in list(test_users.values())], predictions, k=5)

0.020846402816278975