In [173]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [174]:
import os
import sys

sys.path.append(os.path.join(os.getcwd(), '..', '..', '..'))

import pandas as pd
import numpy as np
import scipy
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import ndcg_score

In [175]:
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k

In [176]:
from src.defaults import REPO_DS, TRAIN_DS, TEST_DS

In [195]:
repo_features = pd.read_csv(REPO_DS)
train = pd.read_csv(TRAIN_DS)
test = pd.read_csv(TEST_DS)

In [249]:
FEATURES = ['n_all_issues', 'n_branches', 'n_closed_issues', 'n_forks',
            'n_milestones_all', 'n_milestones_closed', 'n_milestones_open',
            'n_open_issues', 'n_pr_all', 'n_pr_closed', 'n_pr_open', 'n_stars']

In [250]:
train.head()

Unnamed: 0,full_name,repo_requirements
0,strizhechenko/netutils-linux,ipaddress
1,strizhechenko/netutils-linux,pytest
2,strizhechenko/netutils-linux,Pygments
3,strizhechenko/netutils-linux,pytest-cov
4,strizhechenko/netutils-linux,pyyaml


## Create the dataset

In [251]:
dataset = Dataset()
dataset.fit(train['full_name'].unique(),
            train['repo_requirements'].unique())

In [252]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 1016, num_items 401.


In [253]:
train_interactions, weights_matrix = dataset.build_interactions(
    [x for x in train.itertuples(index=False, name=None)])
test_interactions, weights_matrix = dataset.build_interactions(
    [x for x in test.itertuples(index=False, name=None)])

In [254]:
print(repr(train_interactions))

<1016x401 sparse matrix of type '<class 'numpy.int32'>'
	with 14372 stored elements in COOrdinate format>


## Train CF model

In [255]:
from lightfm import LightFM

model = LightFM(loss='warp')
model.fit(train_interactions)

<lightfm.lightfm.LightFM at 0x7f30b9972e50>

In [256]:
test_recall = recall_at_k(model,
                          test_interactions,
                          train_interactions,
                          k=10).mean()
test_recall

0.3779527559055118

In [257]:
test_precisions = precision_at_k(model,
                                 test_interactions,
                                 train_interactions,
                                 k=10).mean()
test_precisions

0.037795275

## Hybrid approach

In [261]:
for f in FEATURES:
    dataset.fit_partial(users=repo_features['full_name'].values,
                        user_features=repo_features[f].values)

In [262]:
data = ((x.full_name, [getattr(x, col) for col in FEATURES]) for x in repo_features.itertuples())

user_features = dataset.build_user_features(
    data=data
)

In [263]:
model = LightFM(loss='warp')
model.fit(train_interactions, user_features=user_features)

<lightfm.lightfm.LightFM at 0x7f30b9972040>

In [266]:
test_recall = recall_at_k(model,
                          test_interactions,
                          train_interactions,
                          user_features=user_features,
                          k=10).mean()
test_recall

0.33267716535433073

In [267]:
test_precisions = precision_at_k(model,
                                 test_interactions,
                                 train_interactions,
                                 user_features=user_features,
                                 k=10).mean()
test_precisions

0.033267718

## Use MLB instead of a Dataset

In [278]:
train = train.groupby('full_name').agg({'repo_requirements': lambda x: x.to_list()}).reset_index()
train.head()

Unnamed: 0,full_name,repo_requirements
0,01ly/TTBot,"[urllib3, selenium, w3lib, pymongo, requests, ..."
1,0xInfection/TIDoS-Framework,"[ptyprocess, pexpect, shodan, tld, idna, six, ..."
2,3b1b/manim,"[argparse, numpy, Pillow, scipy, sympy, matplo..."
3,521xueweihan/hellogithub.com,"[PyMySQL, uWSGI, gevent, Flask, Flask-WTF, req..."
4,82Flex/DCRM,"[appdirs, certifi, chardet, Click, Django, dja..."


In [279]:
test = test.groupby('full_name').agg({'repo_requirements': lambda x: x.to_list()}).reset_index()
test.head()

Unnamed: 0,full_name,repo_requirements
0,01ly/TTBot,[opencv-python]
1,0xInfection/TIDoS-Framework,[bs4]
2,3b1b/manim,[tqdm]
3,521xueweihan/hellogithub.com,[werkzeug]
4,82Flex/DCRM,[idna]


In [280]:
mlb = MultiLabelBinarizer()

In [281]:
train_interactions = mlb.fit_transform(train['repo_requirements'].values)
train_interactions = scipy.sparse.csr_matrix(train_interactions)
train_interactions.shape

(1016, 401)

In [282]:
test_interactions = mlb.transform(test['repo_requirements'].values)
test_interactions = scipy.sparse.csr_matrix(test_interactions)
test_interactions.shape

(1016, 401)

### Train the model

In [157]:
model = LightFM(no_components=25, loss='warp', learning_schedule='adadelta')

In [158]:
model.fit(interactions=train_interactions,
          epochs=15, num_threads=2)

<lightfm.lightfm.LightFM at 0x7f31142103a0>

### Metrics

In [159]:
K = 10

In [160]:
test_precision = precision_at_k(model,
                                test_interactions,
                                train_interactions,
                                k=K).mean()
test_precision

0.051968507

In [161]:
test_recall = recall_at_k(model,
                          test_interactions,
                          train_interactions,
                          k=K).mean()
test_recall

0.5196850393700787

## NDCG

In [164]:
preds = []
for i in range(len(test)):
    preds.append(model.predict(i, np.arange(401)))
preds = np.vstack(preds)

In [165]:
ndcg_score(y_true=test_interactions.toarray() + train_interactions.toarray(),
           y_score=preds,
           k=10)

0.7357356192762378

## Predict Top N

In [484]:
scores = model.predict(0, np.arange(401))
top_items = np.argsort(-scores)
# top_items

In [426]:
mlb.classes_[top_items][:10]

array(['numpy', 'requests', 'tqdm', 'six', 'scipy', 'matplotlib',
       'Pillow', 'torchvision', 'pytz', 'python-dateutil'], dtype=object)

In [427]:
mlb.classes_[train_interactions[0].indices]

array(['pymongo', 'requests', 'selenium', 'urllib3', 'validators',
       'w3lib'], dtype=object)

In [428]:
mlb.classes_[test_interactions[0].indices]

array(['opencv-python'], dtype=object)