In [1]:
import pandas as pd
import numpy as np
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    IntegerType, StringType, IntegerType, FloatType, 
    StructField, StructType, DoubleType
)
from pyspark.ml.recommendation import ALS
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from pyspark.sql.functions import udf, col, isnan
import psycopg2
from pandas.io import sql
from sklearn.model_selection import train_test_split
from pyspark.ml.evaluation import RegressionEvaluator
from itertools import product

In [2]:
conn = psycopg2.connect(dbname='mixmaker2', host='localhost')
cur = conn.cursor()

In [3]:
query = 'SELECT * FROM artists;'
artists = sql.read_sql_query(query, conn)

In [4]:
artists.sort_values('id').head()

Unnamed: 0,id,name,url,scraped,scraped_spotify
4080,1,James Brown,https://www.whosampled.com/James-Brown/,1,0.0
3,2,Traditional Folk,https://www.whosampled.com/Traditional-Folk/,1,0.0
777,3,Kool & the Gang,https://www.whosampled.com/Kool-%26-the-Gang/,1,0.0
778,4,Funkadelic,https://www.whosampled.com/Funkadelic/,1,0.0
779,5,The Beatles,https://www.whosampled.com/The-Beatles/,1,0.0


In [5]:
query = 'SELECT * FROM songs;'
songs = sql.read_sql_query(query, conn)

In [6]:
len(songs)

42879

In [7]:
query = 'SELECT * FROM connections;'
connections = sql.read_sql_query(query, conn)

In [8]:
connections.head()

Unnamed: 0,song_id,sampled_by_song_id,is_connected
0,1,272,1
1,1,273,1
2,1,274,1
3,1,275,1
4,1,276,1


In [9]:
spark_df = spark.createDataFrame(connections)

In [10]:
train, test = spark_df.randomSplit([0.8, 0.2], seed=216)

In [None]:
als_model = ALS(
    itemCol='song_id',
    userCol='sampled_by_song_id',
    ratingCol='is_connected',
    nonnegative=True,
    regParam=0.01,
    rank=10)

recommender = als_model.fit(train)

test_predictions = recommender.transform(test)

In [None]:
test_predictions.show()

In [None]:
preds_df = test_predictins.toPandas()

In [None]:
preds_df.head()

In [None]:
check = preds_df.sort_values('prediction', ascending=False).head(10)

In [None]:
lookup_id = 612
lookup_df = preds_df.loc[preds_df['sampled_by_song_id']==lookup_id,:]

In [None]:
lookup_df.sort_values('prediction', ascending=False).head()

In [None]:
songs[songs['name'] == 'All Night']

In [None]:
artists.loc[artists['name'].apply(lambda x: x.startswith('Beyonc'))]

In [None]:
songs[songs['artist_id'] == 6908]

In [None]:
preds_df.sort_values('prediction', ascending=False).head(10)

In [12]:
als_model = ALS(
    itemCol='song_id',
    userCol='sampled_by_song_id',
    ratingCol='is_connected',
    nonnegative=True,
    alpha = 0.1,
    regParam=0.01,
    rank=50)

recommender = als_model.fit(train)

In [None]:
train_preds = recommender.transform(train)

In [None]:
evaluator = RegressionEvaluator(metricName='rmse', 
                                labelCol='is_connected',
                                predictionCol='prediction')

In [98]:
rmse_train = evaluator.evaluate(train_preds)

In [99]:
rmse_train

0.012385041139706692

In [100]:
lookup_id = 627
lookup_df = preds_df.loc[preds_df['sampled_by_song_id']==lookup_id,:]
songs[songs['id'] == lookup_id]

NameError: name 'preds_df' is not defined

In [101]:
test_predictions = recommender.transform(test)

In [102]:
rmse = evaluator.evaluate(test_predictions)

In [103]:
rmse_test = evaluator.evaluate(
    test_predictions.filter("prediction + 1 > prediction"))

In [104]:
rmse_test

0.23225296325049155

In [None]:
test_predictions.filter("prediction + 1 > prediction").count()

In [None]:
test_predictions.count()

In [None]:
test_predictions.persist().count()

In [None]:
7138 / 12387

In [13]:
cd /Users/brettashley/galvanize/dsi/00-Capstone/mix-maker/mixmaker

/Users/brettashley/galvanize/dsi/00-Capstone/mix-maker/mixmaker


In [14]:
import model

In [15]:
sr = model.SongRecommender()

In [None]:
df.columns

In [None]:
sr.generate_negative_targets(test,
                             'song_id',
                             'sampled_by_song_id',
                             'is_connected',
                             seed=406)

In [None]:
test_with_neg = _

In [None]:
test_with_neg.limit(10).show()

In [None]:
test_with_neg.count()

In [None]:
preds = recommender.transform(test_with_neg)

In [None]:
sr.RMSE(preds, has_nan_values=True)

In [None]:
preds_df = preds.toPandas()

In [None]:
preds_df.columns

In [None]:
df = preds_df.loc[preds_df['is_connected'] == 0,:]
checks = df.sort_values('prediction', ascending=False).head(100)

In [None]:
checks.head()

In [None]:
check_merge_song1 = (checks.merge(songs, left_on='song_id', right_on='id')
                     .drop(columns=['artist_id', 'id', 'scraped', 'index'], axis=1))

In [None]:
check_merge_songs = (check_merge_song1.merge(songs, left_on='sampled_by_song_id', right_on='id')
    .drop(columns=['artist_id', 'id', 'scraped', 'is_connected',
                   'scraped_features_x', 'scraped_features_y'], axis=1))

In [None]:
check_merge_songs

In [None]:
check_merge_songs.columns = ['sampled_by_song_id', 'song_id',
                             'prediction', 'sample_name',
                             'sample_url', 'song_name',
                             'song_url']

In [None]:
check_merge_songs

In [20]:
df = train.toPandas()

In [21]:
df.columns

Index(['song_id', 'sampled_by_song_id', 'is_connected'], dtype='object')

In [77]:
len(df[['song_id', 'sampled_by_song_id']].values)

48687

In [78]:
set((x,y) for (x,y) in product(df.loc[:100,'song_id'], df.loc[:,'sampled_by_song_id']))

{(1, 37922),
 (1, 22872),
 (1, 32515),
 (1, 9145),
 (1, 37519),
 (1, 30184),
 (1, 5203),
 (1, 14854),
 (1, 19573),
 (1, 4664),
 (1, 12515),
 (1, 34212),
 (1, 42095),
 (1, 3912),
 (1, 41716),
 (1, 16575),
 (1, 26466),
 (1, 1493),
 (1, 24324),
 (1, 32207),
 (1, 946),
 (1, 37195),
 (1, 21985),
 (1, 29780),
 (1, 6687),
 (1, 36251),
 (1, 29233),
 (1, 13999),
 (1, 18590),
 (1, 3380),
 (1, 41136),
 (1, 25902),
 (1, 2961),
 (1, 10820),
 (1, 39082),
 (1, 24000),
 (1, 8225),
 (1, 38711),
 (1, 16014),
 (1, 21002),
 (1, 28925),
 (1, 5792),
 (1, 13675),
 (1, 20314),
 (1, 27917),
 (1, 17703),
 (1, 27626),
 (1, 2653),
 (1, 10240),
 (1, 40726),
 (1, 17292),
 (1, 25207),
 (1, 9965),
 (1, 15690),
 (1, 20726),
 (1, 30393),
 (1, 5484),
 (1, 15319),
 (1, 35048),
 (1, 5065),
 (1, 12732),
 (1, 34677),
 (1, 42296),
 (1, 19427),
 (1, 27222),
 (1, 2073),
 (1, 11980),
 (1, 40402),
 (1, 41861),
 (1, 1766),
 (1, 9385),
 (1, 39871),
 (1, 22741),
 (1, 7491),
 (1, 37380),
 (1, 30053),
 (1, 6952),
 (1, 14739),
 (1, 36

In [79]:
all_combos = _

In [80]:
len(all_combos)

29075

In [81]:
existing_combos = set((x,y) for [x,y] in df[['song_id', 'sampled_by_song_id']].values)

In [82]:
len(existing_combos)

48687

In [83]:
diff = all_combos - existing_combos

In [84]:
list(diff)

[(1, 37922),
 (1, 22872),
 (1, 32515),
 (1, 9145),
 (1, 37519),
 (1, 30184),
 (1, 5203),
 (1, 14854),
 (1, 19573),
 (1, 4664),
 (1, 12515),
 (1, 34212),
 (1, 42095),
 (1, 3912),
 (1, 41716),
 (1, 16575),
 (1, 26466),
 (1, 1493),
 (1, 24324),
 (1, 32207),
 (1, 946),
 (1, 37195),
 (1, 21985),
 (1, 29780),
 (1, 6687),
 (1, 36251),
 (1, 29233),
 (1, 13999),
 (1, 18590),
 (1, 3380),
 (1, 41136),
 (1, 25902),
 (1, 2961),
 (1, 10820),
 (1, 39082),
 (1, 24000),
 (1, 8225),
 (1, 38711),
 (1, 16014),
 (1, 21002),
 (1, 28925),
 (1, 5792),
 (1, 13675),
 (1, 20314),
 (1, 27917),
 (1, 17703),
 (1, 27626),
 (1, 2653),
 (1, 10240),
 (1, 40726),
 (1, 17292),
 (1, 25207),
 (1, 9965),
 (1, 15690),
 (1, 20726),
 (1, 30393),
 (1, 5484),
 (1, 15319),
 (1, 35048),
 (1, 5065),
 (1, 12732),
 (1, 34677),
 (1, 42296),
 (1, 19427),
 (1, 27222),
 (1, 2073),
 (1, 11980),
 (1, 40402),
 (1, 41861),
 (1, 1766),
 (1, 9385),
 (1, 39871),
 (1, 22741),
 (1, 7491),
 (1, 37380),
 (1, 30053),
 (1, 6952),
 (1, 14739),
 (1, 36

In [92]:
sr.generate_negative_targets(test, 'song_id', 'sampled_by_song_id', 'is_connected', get_all=True)

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500


DataFrame[is_connected: bigint, sampled_by_song_id: bigint, song_id: bigint]

In [93]:
to_transform = _

In [94]:
to_transform.persist().count()

309675

In [56]:
(1,2) + (0,)

(1, 2, 0)

In [59]:
train.limit(5).show()

+-------+------------------+------------+
|song_id|sampled_by_song_id|is_connected|
+-------+------------------+------------+
|      1|               273|           1|
|      1|               275|           1|
|      1|               277|           1|
|      1|               278|           1|
|      1|               279|           1|
+-------+------------------+------------+



In [13]:
top10_recs = recommender.recommendForUserSubset(train, 10)

In [35]:
top10_recs.limit(10).toPandas()

Unnamed: 0,sampled_by_song_id,recommendations
0,14450,"[(14397, 0.9948229193687439), (13961, 0.994822..."
1,37307,"[(8709, 1.087941288948059), (10950, 1.02290296..."
2,4935,"[(40272, 1.4924603700637817), (62, 1.480255126..."
3,32445,"[(16944, 1.1549606323242188), (2801, 1.1141314..."
4,32460,"[(38031, 1.177066445350647), (32458, 1.1758714..."
5,22223,"[(36688, 1.1134573221206665), (18055, 1.104509..."
6,7880,"[(2801, 1.1547811031341553), (29967, 1.1495363..."
7,8389,"[(13307, 0.9972371459007263), (23224, 0.789302..."
8,9376,"[(29961, 1.149129867553711), (6046, 1.14415681..."
9,26623,"[(6355, 1.182394027709961), (22538, 1.17986476..."


In [36]:
top10_df = _

In [45]:
top10_df.loc[0,'recommendations']

[Row(song_id=14397, rating=0.9948229193687439),
 Row(song_id=13961, rating=0.9948220252990723),
 Row(song_id=36805, rating=0.930473268032074),
 Row(song_id=36781, rating=0.9083508253097534),
 Row(song_id=37987, rating=0.9028245806694031),
 Row(song_id=19079, rating=0.8996960520744324),
 Row(song_id=9762, rating=0.8976651430130005),
 Row(song_id=7785, rating=0.8748148083686829),
 Row(song_id=37990, rating=0.8731769323348999),
 Row(song_id=9771, rating=0.8636344075202942)]

In [60]:
song_id_list = list([top10_df.loc[0,'sampled_by_song_id']] * 10)

In [62]:
recs0 = pd.DataFrame(song_id_list, columns=['sampled_by_song_id'])

In [65]:
recs_df = pd.DataFrame(top10_df.loc[0,'recommendations'],
                       columns=['song_id', 'rating'])

In [66]:
recs0.merge(recs_df, left_index=True, right_index=True)

Unnamed: 0,sampled_by_song_id,song_id,rating
0,14450,14397,0.994823
1,14450,13961,0.994822
2,14450,36805,0.930473
3,14450,36781,0.908351
4,14450,37987,0.902825
5,14450,19079,0.899696
6,14450,9762,0.897665
7,14450,7785,0.874815
8,14450,37990,0.873177
9,14450,9771,0.863634


In [25]:
check_merge_song1 = (df.merge(songs, left_on='song_id', right_on='id')
                     .drop(columns=['artist_id', 'id', 'scraped'], axis=1))

check_merge_songs = (check_merge_song1.merge(songs, left_on='sampled_by_song_id', right_on='id')
    .drop(columns=['artist_id', 'id', 'scraped', 'is_connected',
                   'scraped_features_x', 'scraped_features_y'], axis=1))

In [27]:
check_merge_songs.head()

Unnamed: 0,song_id,sampled_by_song_id,name_x,url_x,name_y,url_y
0,1,273,Get Up Offa That Thing (1976),https://www.whosampled.com/James-Brown/Get-Up-...,Luv,https://www.whosampled.com/Apink/Luv
1,6145,273,Get Up Offa That Thing,https://www.whosampled.com/James-Brown/Get-Up-...,Luv,https://www.whosampled.com/Apink/Luv
2,1,275,Get Up Offa That Thing (1976),https://www.whosampled.com/James-Brown/Get-Up-...,South Bronx,https://www.whosampled.com/Boogie-Down-Product...
3,27,275,"Get Up, Get Into It, Get Involved (1970)","https://www.whosampled.com/James-Brown/Get-Up,...",South Bronx,https://www.whosampled.com/Boogie-Down-Product...
4,139,275,Unity Skit 3 (1984)\nby Afrika Bambaataa and J...,https://www.whosampled.com/Afrika-Bambaataa/Un...,South Bronx,https://www.whosampled.com/Boogie-Down-Product...


In [34]:
test.filter('sampled_by_song_id = 22223').show()

+-------+------------------+------------+
|song_id|sampled_by_song_id|is_connected|
+-------+------------------+------------+
|  26629|             22223|           1|
|  30452|             22223|           1|
|  32445|             22223|           1|
+-------+------------------+------------+



In [67]:
one_rec = recommender.recommendForUserSubset(
    train.filter('sampled_by_song_id = 22223'), 10)

In [69]:
one_rec.show()

+------------------+--------------------+
|sampled_by_song_id|     recommendations|
+------------------+--------------------+
|             22223|[[36688, 1.113457...|
+------------------+--------------------+



In [75]:
recommender.recommendForUserSubset(
    train.filter('sampled_by_song_id = %s' % 22223), 10).show()

+------------------+--------------------+
|sampled_by_song_id|     recommendations|
+------------------+--------------------+
|             22223|[[36688, 1.113457...|
+------------------+--------------------+



In [82]:
sr.get_predictions_for_song(recommender, train, 13321, 10) # prince, erotic city

Unnamed: 0,sampled_by_song_id,song_id,rating
0,13321,22538,1.073343
1,13321,8690,1.011398
2,13321,35045,1.007149
3,13321,9740,0.995725
4,13321,8751,0.993363
5,13321,13728,0.993086
6,13321,6393,0.984088
7,13321,19120,0.978603
8,13321,19154,0.976129
9,13321,6374,0.973619


In [11]:
als_model = ALS(
    itemCol='song_id',
    userCol='sampled_by_song_id',
    ratingCol='is_connected',
    nonnegative=True,
    alpha = 0.1,
    regParam=0.01,
    rank=50)

recommender = als_model.fit(spark_df)

In [16]:
sr.get_predictions_for_song(recommender, spark_df, 13321, 10)

Unnamed: 0,sampled_by_song_id,song_id,rating
0,13321,13728,0.994413
1,13321,38006,0.977733
2,13321,22885,0.974543
3,13321,36667,0.95544
4,13321,21561,0.948016
5,13321,38033,0.947498
6,13321,37492,0.946418
7,13321,33546,0.929624
8,13321,23467,0.921811
9,13321,38059,0.9193


In [17]:
recs_13321 = _

In [22]:
recs_13321_merged = recs_13321.merge(songs, left_on='song_id', right_on='id')

In [19]:
recs_13321.merge(connections, on=['song_id', 'sampled_by_song_id'])

Unnamed: 0,sampled_by_song_id,song_id,rating,is_connected
0,13321,13728,0.994413,1


In [21]:
recs_13321.merge(connections, left_on=['sampled_by_song_id','song_id'],
                              right_on=['song_id', 'sampled_by_song_id'])

Unnamed: 0,sampled_by_song_id_x,song_id_x,rating,song_id_y,sampled_by_song_id_y,is_connected


In [None]:
recommender.recommendForUserSubset()

In [25]:
user_factors = recommender.userFactors.toPandas().set_index('id')

In [26]:
item_factors = recommender.itemFactors.toPandas().set_index('id')

In [27]:
user_factors.head(10)

Unnamed: 0_level_0,features
id,Unnamed: 1_level_1
10,"[0.13662275671958923, 0.044446516782045364, 0...."
280,"[0.20331062376499176, 0.10586054623126984, 0.0..."
290,"[0.12455614656209946, 0.019895723089575768, 0...."
300,"[0.12455614656209946, 0.019895723089575768, 0...."
310,"[0.06681961566209793, 0.0, 0.5341866612434387,..."
320,"[0.14578653872013092, 0.01445478294044733, 0.0..."
330,"[0.1272231936454773, 0.020871499553322792, 0.0..."
340,"[0.12455614656209946, 0.019895723089575768, 0...."
350,"[0.1216895580291748, 0.033341120928525925, 0.0..."
360,"[0.2257940173149109, 0.016278965398669243, 0.3..."


In [30]:
def predict_rating(user_id, item_id):
    user_vectors = np.array(
        user_factors.loc[user_id, 'features'])
    item_vectors = np.array(
        item_factors.loc[item_id, 'features'])
    return user_vectors @ item_vectors

In [31]:
predict_rating(13321, 13728)

0.9944131887996569

In [None]:
recommender.recommendForUserSubset()

In [32]:
"hi" + None

TypeError: must be str, not NoneType

In [34]:
query = " SELECT * FROM {}" + "" + ";"

In [35]:
query

' SELECT * FROM {};'