# NonNegative ALS with just Stars

https://spark.apache.org/docs/latest/ml-collaborative-filtering.html

In [1]:
import pandas as pd
import numpy as np

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS

# Setup a SparkSession
spark = SparkSession.builder.getOrCreate()

In [2]:
# import src_2.evaluate as evaluate

# # Convert a Pandas DF to a Spark DF
# spark_df = spark.createDataFrame(pandas_df) 

# # Convert a Spark DF to a Pandas DF
# pandas_df = spark_df.toPandas()

In [2]:
train_starred = pd.read_pickle('data/pickled/train_starred.pkl')
test_starred = pd.read_pickle('data/pickled/test_starred.pkl')

In [3]:
spark_train_stars_df = spark.createDataFrame(train_starred)
spark_test_stars_df = spark.createDataFrame(test_starred)

#### check spark dataframes

In [4]:
spark_train_stars_df.show()

+--------+-------+-------------------+---------------+----+--------------------+--------+-------+
| repo_id|user_id|         created_at|          login|type|                 url|language|starred|
+--------+-------+-------------------+---------------+----+--------------------+--------+-------+
|   12031| 889397|1202439272000000000|       cdcarter| USR|https://api.githu...|    Ruby|      1|
|52574448| 889397|1202442872000000000|       cdcarter| USR|https://api.githu...|      \N|      1|
|   12031|  17898|1203008090000000000|        gnufied| USR|https://api.githu...|    Ruby|      1|
|   12031|  25789|1203568157000000000|          Sutto| USR|https://api.githu...|    Ruby|      1|
|   12031|  10942|1203910346000000000|       tarcieri| USR|https://api.githu...|    Ruby|      1|
|   12031|  35126|1204050466000000000|         cypher| USR|https://api.githu...|    Ruby|      1|
|40916741| 401456|1204169012000000000|           dsrw| USR|https://api.githu...|   Shell|      1|
|   12031|  60568|12

In [5]:
spark_test_stars_df.show()

+--------+--------+-------------------+-----------------+----+--------------------+----------+-------+
| repo_id| user_id|         created_at|            login|type|                 url|  language|starred|
+--------+--------+-------------------+-----------------+----+--------------------+----------+-------+
|55567488| 6694284|1485492399000000000|          CaiJiJi| USR|https://api.githu...|    Python|      1|
|    1282|13665131|1485492672000000000|         tarvos21| USR|https://api.githu...|    Python|      1|
|   16356| 6806774|1485492818000000000|       acro5piano| USR|https://api.githu...|       PHP|      1|
|12651428|   11997|1485492868000000000|           alias1| USR|https://api.githu...|    Python|      1|
|  394885| 9279104|1485493232000000000|        nilsdeppe| USR|https://api.githu...|    Python|      1|
|29629404| 2060726|1485493578000000000|           Patola| USR|https://api.githu...|         C|      1|
|   38701|12092442|1485493937000000000|         buhuipao| USR|https://api

#### create ALS recommender model

In [9]:
als_model = ALS(
    itemCol='repo_id',
    userCol='user_id',
    ratingCol='starred',
    nonnegative=True,    
    regParam=0.1,
    rank=10
    ) 

In [10]:
recommender = als_model.fit(spark_train_stars_df)
predictions = recommender.transform(spark_test_stars_df)

In [8]:
predictions.show()

+-------+--------+-------------------+-------------+----+--------------------+--------+-------+----------+
|repo_id| user_id|         created_at|        login|type|                 url|language|starred|prediction|
+-------+--------+-------------------+-------------+----+--------------------+--------+-------+----------+
|   1088| 1966345|1490671147000000000|    craigwong| USR|https://api.githu...|       C|      1|0.89964926|
|   1088| 6504088|1487783348000000000|ghostwriternr| USR|https://api.githu...|       C|      1|0.89964825|
|   1088|10452600|1489503628000000000|     mike168m| USR|https://api.githu...|       C|      1| 0.8996487|
|   1088|26464620|1488393840000000000|  amitbansal7| USR|https://api.githu...|       C|      1|       NaN|
|   1088|31125242|1493660697000000000|     rmccorm4| USR|https://api.githu...|       C|      1|       NaN|
|   1088| 1383890|1492655096000000000|       Xwoder| USR|https://api.githu...|       C|      1|  0.899648|
|   1088| 8635446|1489621495000000000

# NonNegative ALS with Stars & Forks (implicit weighted actions)

https://spark.apache.org/docs/latest/ml-collaborative-filtering.html

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
# from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

# Setup a SparkSession
spark = SparkSession.builder.getOrCreate()

In [92]:
forks_and_stars = pd.read_pickle('data/pickled/forks_and_stars.pkl')
forks_and_stars.columns

Index([u'user_id', u'repo_id', u'forked', u'created_at', u'starred',
       u'interest_rank'],
      dtype='object')

#### since I didn't pickle train and test data for this run using both Star and Forks, create my timestamp ordered train-test split

In [93]:
def ordered_train_test_split(df, order_by, test_size=0.33):
    '''
    Takes a pandas dataframe and returns a train and test dataset ordered by the 
    column name passed to order_by and sliced up to the test_size
    INPUT:
    df: pandas df
    order_by: list or str of column names
    test_size: float percentage of df to use as test set
    OUTPUT:
    train: pandas dataframe
    test: pandas dataframe
    '''
    upto = int(df.shape[0] * (1 - test_size))
    df = df.sort_values(by=order_by, axis=0)
    train = df.iloc[:upto,:]
    test = df.iloc[upto:,:]
    return train, test

In [94]:
train_forks_stars, test_forks_stars = ordered_train_test_split(
    forks_and_stars, order_by=['created_at'], test_size=0.33)

In [95]:
print train_forks_stars.shape
print test_forks_stars.shape

(681076, 6)
(335456, 6)


#### create spark dataframes for train/test data

In [96]:
sp_train_forks_stars = spark.createDataFrame(train_forks_stars)
sp_test_forks_stars = spark.createDataFrame(test_forks_stars)

In [97]:
model = ALS(
    itemCol='repo_id',
    userCol='user_id',
    ratingCol='interest_rank',
    implicitPrefs=True,
    nonnegative=True,    
    maxIter=10, 
    alpha=0.01
    ) 

In [98]:
recommender = model.fit(sp_train_forks_stars)
predictions = recommender.transform(sp_test_forks_stars)

In [99]:
pred_df = predictions.toPandas()

In [100]:
pred_df.sort_values(['user_id','prediction'], ascending=False)

Unnamed: 0,user_id,repo_id,forked,created_at,starred,interest_rank,prediction
17451,36791462,68425726,2.0,1498863052000000000,0.0,2.0,
3042,36791293,20711751,2.0,1498861756000000000,1.0,3.0,
93307,36791293,20711251,0.0,1498861751000000000,1.0,1.0,
24524,36791076,38393576,2.0,1498860785000000000,0.0,2.0,
141628,36790822,24958152,2.0,1498858442000000000,0.0,2.0,
324068,36790741,63547354,2.0,1498860145000000000,0.0,2.0,
320981,36790642,51568068,2.0,1498857388000000000,0.0,2.0,
296748,36790336,13057459,2.0,1498855585000000000,0.0,2.0,
255195,36789893,30334789,2.0,1498853132000000000,0.0,2.0,
183068,36789460,27683242,2.0,1498850815000000000,0.0,2.0,


# NonNegative ALS with Stars & Owned Repos (implicit weighted actions)

In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
# from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

# Setup a SparkSession
spark = SparkSession.builder.getOrCreate()

In [101]:
own_and_star = pd.read_pickle('data/pickled/own_and_star.pkl')
own_and_star.columns

Index([u'repo_id', u'user_id', u'forked_from_repo_id', u'owned', u'created_at',
       u'starred', u'interest_rank'],
      dtype='object')

#### since I didn't pickle train and test data for this run using both Star and Forks, create my timestamp ordered train-test split

In [102]:
def ordered_train_test_split(df, order_by, test_size=0.33):
    '''
    Takes a pandas dataframe and returns a train and test dataset ordered by the 
    column name passed to order_by and sliced up to the test_size
    INPUT:
    df: pandas df
    order_by: list or str of column names
    test_size: float percentage of df to use as test set
    OUTPUT:
    train: pandas dataframe
    test: pandas dataframe
    '''
    upto = int(df.shape[0] * (1 - test_size))
    df = df.sort_values(by=order_by, axis=0)
    train = df.iloc[:upto,:]
    test = df.iloc[upto:,:]
    return train, test

In [103]:
train_owns_stars, test_owns_stars = ordered_train_test_split(
    own_and_star, order_by=['created_at'], test_size=0.33)

In [104]:
print train_owns_stars.shape
print test_owns_stars.shape

(743788, 7)
(366344, 7)


#### create spark dataframes for train/test data

In [106]:
sp_train_owns_stars = spark.createDataFrame(train_owns_stars)
sp_test_owns_stars = spark.createDataFrame(test_owns_stars)

In [107]:
model = ALS(
    itemCol='repo_id',
    userCol='user_id',
    ratingCol='interest_rank',
    implicitPrefs=True,
    nonnegative=True,    
    maxIter=10, 
    alpha=0.01
    ) 

In [108]:
recommender = model.fit(sp_train_owns_stars)
predictions = recommender.transform(sp_test_owns_stars)

In [109]:
pred_df = predictions.toPandas()

In [116]:
pred_df.sort_values(['prediction'], ascending=False).head(1000)

Unnamed: 0,repo_id,user_id,forked_from_repo_id,owned,created_at,starred,interest_rank,prediction
282462,28557277,7505241,0,0.0,1480606331000000000,1.0,1.0,0.631018
280951,28557277,1167967,0,0.0,1487261828000000000,1.0,1.0,0.607801
256745,390,298983,0,0.0,1492947411000000000,1.0,1.0,0.589153
280187,28557277,1169947,0,0.0,1484784289000000000,1.0,1.0,0.572772
317790,4708601,1653184,0,0.0,1494488748000000000,1.0,1.0,0.560179
87547,3231,17118,0,0.0,1483289169000000000,1.0,1.0,0.542329
321467,6876142,11830767,0,0.0,1485087352000000000,1.0,1.0,0.541163
317879,4708601,3903779,0,0.0,1483942209000000000,1.0,1.0,0.539302
247445,9664377,1733408,0,0.0,1497235274000000000,1.0,1.0,0.538258
281864,28557277,2825561,0,0.0,1487201684000000000,1.0,1.0,0.534949


In [120]:
# Predictions where repo-user has just a OWNS relationship
(pred_df.loc[(pred_df.interest_rank == 4) & (~pred_df.prediction.isnull())]).sort_values(
    'prediction', ascending=False)

Unnamed: 0,repo_id,user_id,forked_from_repo_id,owned,created_at,starred,interest_rank,prediction
245113,52878664,2216159,0,4.0,1481755233000000000,0.0,4.0,0.01324694
123447,53531959,599582,0,4.0,1482796812000000000,0.0,4.0,0.0008534327
28325,56322797,10433,0,4.0,1486246603000000000,0.0,4.0,0.0007693606
64988,52668800,265517,0,4.0,1481575228000000000,0.0,4.0,0.0002259616
182690,56341200,2606940,0,4.0,1486310145000000000,0.0,4.0,0.0001769078
285944,53389749,9652,0,4.0,1482540899000000000,0.0,4.0,0.0001645454
75564,52688610,596995,0,4.0,1481277537000000000,0.0,4.0,3.492491e-05
21771,56725833,10269343,0,4.0,1486185254000000000,0.0,4.0,2.203709e-05
270617,52815179,9539832,0,4.0,1481581418000000000,0.0,4.0,6.324926e-06
114402,56507651,7830654,0,4.0,1486218712000000000,0.0,4.0,4.412424e-06


In [124]:
(pred_df.loc[pred_df.user_id == 2216159]).sort_values('prediction')

Unnamed: 0,repo_id,user_id,forked_from_repo_id,owned,created_at,starred,interest_rank,prediction
341667,1888252,2216159,0,0.0,1494020921000000000,1.0,1.0,0.009268
263425,48707609,2216159,0,0.0,1492552689000000000,1.0,1.0,0.010063
301943,448145,2216159,0,0.0,1488875850000000000,1.0,1.0,0.010882
245113,52878664,2216159,0,4.0,1481755233000000000,0.0,4.0,0.013247
208757,22062556,2216159,0,0.0,1493975952000000000,1.0,1.0,0.014508
300289,11938248,2216159,0,0.0,1496414680000000000,1.0,1.0,0.016269
17485,38904,2216159,0,0.0,1486028889000000000,1.0,1.0,0.018419
18109,23166017,2216159,0,0.0,1495119413000000000,1.0,1.0,0.022898
350106,8351622,2216159,0,0.0,1485420106000000000,1.0,1.0,0.024398
246063,18155280,2216159,0,0.0,1481019716000000000,1.0,1.0,0.026036
