In [1]:
import pandas as pd
import numpy as np
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    IntegerType, StringType, IntegerType, FloatType, 
    StructField, StructType, DoubleType
)
from pyspark.ml.recommendation import ALS
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from pyspark.sql.functions import udf, col, isnan
import psycopg2
from pandas.io import sql
from sklearn.model_selection import train_test_split
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
conn = psycopg2.connect(dbname='mixmaker', host='localhost')
cur = conn.cursor()

In [90]:
query = 'SELECT * FROM artists;'
artists = sql.read_sql_query(query, conn)

In [91]:
artists.head()

Unnamed: 0,id,name,url,scraped
0,13,Isaac Hayes,https://www.whosampled.com/Isaac-Hayes/,0
1,14,KC & the Sunshine Band,https://www.whosampled.com/KC-%26-the-Sunshine...,0
2,15,Stevie Wonder,https://www.whosampled.com/Stevie-Wonder/,0
3,16,"Earth, Wind & Fire","https://www.whosampled.com/Earth,-Wind-%26-Fire/",0
4,17,Bee Gees,https://www.whosampled.com/Bee-Gees/,0


In [92]:
query = 'SELECT * FROM songs;'
songs = sql.read_sql_query(query, conn)

In [93]:
len(songs)

29231

In [94]:
query = 'SELECT * FROM connections;'
connections = sql.read_sql_query(query, conn)

In [95]:
connections = connections.drop('remixed_by_song_id', axis=1)

In [96]:
connections['is_connected'] = np.ones(len(connections))

In [97]:
connections.head()

Unnamed: 0,song_id,sampled_by_song_id,is_connected
0,1,374,1.0
1,1,375,1.0
2,1,376,1.0
3,1,377,1.0
4,1,378,1.0


In [98]:
spark_df = spark.createDataFrame(connections)

In [99]:
train, test = spark_df.randomSplit([0.8, 0.2], seed=216)

In [13]:
als_model = ALS(
    itemCol='song_id',
    userCol='sampled_by_song_id',
    ratingCol='is_connected',
    nonnegative=True,
    regParam=0.01,
    rank=10,
    implicitPrefs=True)

recommender = als_model.fit(train)

test_predictions = recommender.transform(test)

In [122]:
test_predictions.show()

+-------+------------------+------------+------------+
|song_id|sampled_by_song_id|is_connected|  prediction|
+-------+------------------+------------+------------+
|    463|              9170|         1.0|         NaN|
|    471|              9896|         1.0| 7.016704E-7|
|    471|              9170|         1.0|4.4592525E-6|
|  13285|             14299|         1.0|         NaN|
|  14570|             14987|         1.0|0.0026822868|
|    392|              9170|         1.0|         0.0|
|  11280|             11800|         1.0|         NaN|
|  11280|             11713|         1.0|         NaN|
|  11280|             11835|         1.0|  0.01750579|
|  11280|             11808|         1.0|         NaN|
|  11280|             11694|         1.0|         NaN|
|  11280|             11736|         1.0|         NaN|
|  11280|             11717|         1.0|         NaN|
|  11280|              1125|         1.0|         NaN|
|  11280|             11764|         1.0|         NaN|
|  11280| 

In [68]:
preds_df = test_predictins.toPandas()

In [69]:
preds_df.head()

Unnamed: 0,song_id,sampled_by_song_id,is_connected,prediction
0,463,9170,1.0,
1,471,9896,1.0,0.609067
2,471,9170,1.0,0.505995
3,13285,14299,1.0,
4,14570,14987,1.0,0.610512


In [70]:
check = preds_df.sort_values('prediction', ascending=False).head(10)

In [78]:
lookup_id = 612
lookup_df = preds_df.loc[preds_df['sampled_by_song_id']==lookup_id,:]

In [79]:
lookup_df.sort_values('prediction', ascending=False).head()

Unnamed: 0,song_id,sampled_by_song_id,is_connected,prediction
262,28,612,1.0,1.257137
841,9152,612,1.0,1.076854
3387,83,612,1.0,1.069884
3492,42,612,1.0,1.061826
1655,20,612,1.0,1.035928


In [80]:
songs[songs['name'] == 'All Night']

Unnamed: 0,id,artist_id,name,url,scraped
2188,2214,1324,All Night,https://www.whosampled.com/Milira/All-Night,0
4535,4563,2758,All Night,https://www.whosampled.com/Maurice-Tamraz/All-...,0
13413,13428,4207,All Night,https://www.whosampled.com/Angelo-Ferreri/All-...,0
15351,15368,6788,All Night,https://www.whosampled.com/Luxury-Elite/All-Night,0
15366,15383,6798,All Night,https://www.whosampled.com/Cinque/All-Night,0


In [92]:
artists.loc[artists['name'].apply(lambda x: x.startswith('Beyonc'))]

Unnamed: 0,id,name,url,scraped
6907,6908,Beyoncé,https://www.whosampled.com/Beyonc%C3%A9/,0


In [93]:
songs[songs['artist_id'] == 6908]

Unnamed: 0,id,artist_id,name,url,scraped
15559,15579,6908,Black Culture,https://www.whosampled.com/Beyonc%C3%A9/Black-...,0


In [71]:
preds_df.sort_values('prediction', ascending=False).head(10)

Unnamed: 0,song_id,sampled_by_song_id,is_connected,prediction
3908,134,421,1.0,1.293696
2993,9774,4652,1.0,1.288245
112,65,1471,1.0,1.280834
2851,82,4064,1.0,1.276294
2149,9,627,1.0,1.264793
262,28,612,1.0,1.257137
90,31,2830,1.0,1.25428
1748,120,3331,1.0,1.21286
222,34,2599,1.0,1.210035
110,65,2297,1.0,1.203673


In [146]:
als_model = ALS(
    itemCol='song_id',
    userCol='sampled_by_song_id',
    ratingCol='is_connected',
    nonnegative=True,
    alpha = 0.1,
    regParam=0.01,

    rank=50)

recommender = als_model.fit(train)

In [147]:
train_preds = recommender.transform(train)

In [148]:
evaluator = RegressionEvaluator(metricName='rmse', 
                                labelCol='is_connected',
                                predictionCol='prediction')

In [149]:
rmse_train = evaluator.evaluate(train_preds)

In [150]:
rmse_train

0.0174042459623237

In [125]:
lookup_id = 627
lookup_df = preds_df.loc[preds_df['sampled_by_song_id']==lookup_id,:]
songs[songs['id'] == lookup_id]

Unnamed: 0,id,artist_id,name,url,scraped
599,627,251,Keep on Doin',https://www.whosampled.com/Kev-E-Kev/Keep-on-D...,0


In [126]:
test_predictions = recommender.transform(test)

In [127]:
rmse = evaluator.evaluate(test_predictions)

In [128]:
rmse_test = evaluator.evaluate(
    test_predictions.filter("prediction + 1 > prediction"))

In [129]:
rmse_test

0.131964625788508

In [151]:
test_predictions.filter("prediction + 1 > prediction").count()

9851

In [152]:
test_predictions.count()

12605

In [213]:
test_predictions.persist().count()

3967

In [17]:
cd /Users/brettashley/galvanize/dsi/00-Capstone/mix-maker/src

/Users/brettashley/galvanize/dsi/00-Capstone/mix-maker/src


In [18]:
import model

In [19]:
sr = model.SongRecommender()

In [298]:
df.columns

Index(['song_id', 'sampled_by_song_id', 'is_connected', 'prediction'], dtype='object')

In [301]:
new_df = pd.DataFrame(new, columns=df.columns[:2])

In [302]:
new_df['is_connected'] = 0

In [303]:
new_df.head()

Unnamed: 0,song_id,sampled_by_song_id,is_connected
0,9767,2815,0
1,14456,10660,0
2,14396,2404,0
3,94,4005,0
4,83,9523,0


In [306]:
test_pd = test.toPandas()

In [307]:
test_pd.head()

Unnamed: 0,song_id,sampled_by_song_id,is_connected
0,1,374,1.0
1,1,376,1.0
2,1,378,1.0
3,1,382,1.0
4,1,383,1.0


In [313]:
check = pd.concat([test_pd, new_df]).reset_index()

In [130]:
sr.generate_negative_targets(test,
                             'song_id',
                             'sampled_by_song_id',
                             'is_connected',
                             seed=406)

DataFrame[index: bigint, is_connected: double, sampled_by_song_id: bigint, song_id: bigint]

In [131]:
test_with_neg = _

In [22]:
test_with_neg.limit(10).show()

+-----+------------+------------------+-------+
|index|is_connected|sampled_by_song_id|song_id|
+-----+------------+------------------+-------+
|    0|         1.0|               374|      1|
|    1|         1.0|               376|      1|
|    2|         1.0|               378|      1|
|    3|         1.0|               382|      1|
|    4|         1.0|               383|      1|
|    5|         1.0|               391|      1|
|    6|         1.0|               397|      1|
|    7|         1.0|               403|      1|
|    8|         1.0|               410|      1|
|    9|         1.0|               422|      1|
+-----+------------+------------------+-------+



In [132]:
test_with_neg.count()

25210

In [133]:
preds = recommender.transform(test_with_neg)

In [134]:
sr.RMSE(preds, has_nan_values=True)

0.6702252586102077

In [137]:
preds_df = preds.toPandas()

In [138]:
preds_df.columns

Index(['index', 'is_connected', 'sampled_by_song_id', 'song_id', 'prediction'], dtype='object')

In [139]:
df = preds_df.loc[preds_df['is_connected'] == 0,:]
checks = df.sort_values('prediction', ascending=False).head(100)

In [140]:
checks.head()

Unnamed: 0,index,is_connected,sampled_by_song_id,song_id,prediction
11889,10904,0.0,4992,17487,2.117134
245,5667,0.0,24455,14377,2.006908
23227,12067,0.0,4585,13202,1.951384
11898,6007,0.0,23580,17487,1.9242
7962,8196,0.0,25257,16088,1.889433


In [141]:
check_merge_song1 = (checks.merge(songs, left_on='song_id', right_on='id')
                     .drop(columns=['artist_id', 'id', 'scraped', 'index'], axis=1))

In [142]:
check_merge_songs = (check_merge_song1.merge(songs, left_on='sampled_by_song_id', right_on='id')
    .drop(columns=['artist_id', 'id', 'scraped', 'is_connected'], axis=1))

In [143]:
check_merge_songs

Unnamed: 0,sampled_by_song_id,song_id,prediction,name_x,url_x,name_y,url_y
0,4992,17487,2.117134,I Want Your Love (1978),https://www.whosampled.com/Chic/I-Want-Your-Love/,Murda Style,https://www.whosampled.com/Kid-Lib/Murda-Style
1,23580,17487,1.924200,I Want Your Love (1978),https://www.whosampled.com/Chic/I-Want-Your-Love/,Mechanics,https://www.whosampled.com/Dom-%26-Roland/Mech...
2,23580,9784,1.597071,Get Down on It (1981),https://www.whosampled.com/Kool-%26-the-Gang/G...,Mechanics,https://www.whosampled.com/Dom-%26-Roland/Mech...
3,24455,14377,2.006908,Somebody's Watching Me (1983)\nby Rockwell fea...,https://www.whosampled.com/Rockwell/Somebody%2...,MXXXIII,https://www.whosampled.com/DJ-Groove-(Russia)/...
4,4585,13202,1.951384,Let's Get It On (1973),https://www.whosampled.com/Marvin-Gaye/Let%27s...,Crystal Cuts,https://www.whosampled.com/Sully/Crystal-Cuts
5,25257,16088,1.889433,The Big Bang Theory (1979),https://www.whosampled.com/Parliament/The-Big-...,Conquering,https://www.whosampled.com/Acid-Lab/Conquering
6,24352,16088,1.680222,The Big Bang Theory (1979),https://www.whosampled.com/Parliament/The-Big-...,Tribal Connections Pt. 2,https://www.whosampled.com/DJ-Crazee-M/Tribal-...
7,4913,9790,1.874424,Ladies' Night (1979),https://www.whosampled.com/Kool-%26-the-Gang/L...,Deranged,https://www.whosampled.com/Special-Request/Der...
8,10035,9790,1.583541,Ladies' Night (1979),https://www.whosampled.com/Kool-%26-the-Gang/L...,Lights Out,https://www.whosampled.com/III-Most-Wanted/Lig...
9,3783,12206,1.849226,A Hard Day's Night (1964),https://www.whosampled.com/The-Beatles/A-Hard-...,So Long,https://www.whosampled.com/Seba/So-Long


In [144]:
check_merge_songs.columns = ['sampled_by_song_id', 'song_id',
                             'prediction', 'sample_name',
                             'sample_url', 'song_name',
                             'song_url']

In [145]:
check_merge_songs

Unnamed: 0,sampled_by_song_id,song_id,prediction,sample_name,sample_url,song_name,song_url
0,4992,17487,2.117134,I Want Your Love (1978),https://www.whosampled.com/Chic/I-Want-Your-Love/,Murda Style,https://www.whosampled.com/Kid-Lib/Murda-Style
1,23580,17487,1.924200,I Want Your Love (1978),https://www.whosampled.com/Chic/I-Want-Your-Love/,Mechanics,https://www.whosampled.com/Dom-%26-Roland/Mech...
2,23580,9784,1.597071,Get Down on It (1981),https://www.whosampled.com/Kool-%26-the-Gang/G...,Mechanics,https://www.whosampled.com/Dom-%26-Roland/Mech...
3,24455,14377,2.006908,Somebody's Watching Me (1983)\nby Rockwell fea...,https://www.whosampled.com/Rockwell/Somebody%2...,MXXXIII,https://www.whosampled.com/DJ-Groove-(Russia)/...
4,4585,13202,1.951384,Let's Get It On (1973),https://www.whosampled.com/Marvin-Gaye/Let%27s...,Crystal Cuts,https://www.whosampled.com/Sully/Crystal-Cuts
5,25257,16088,1.889433,The Big Bang Theory (1979),https://www.whosampled.com/Parliament/The-Big-...,Conquering,https://www.whosampled.com/Acid-Lab/Conquering
6,24352,16088,1.680222,The Big Bang Theory (1979),https://www.whosampled.com/Parliament/The-Big-...,Tribal Connections Pt. 2,https://www.whosampled.com/DJ-Crazee-M/Tribal-...
7,4913,9790,1.874424,Ladies' Night (1979),https://www.whosampled.com/Kool-%26-the-Gang/L...,Deranged,https://www.whosampled.com/Special-Request/Der...
8,10035,9790,1.583541,Ladies' Night (1979),https://www.whosampled.com/Kool-%26-the-Gang/L...,Lights Out,https://www.whosampled.com/III-Most-Wanted/Lig...
9,3783,12206,1.849226,A Hard Day's Night (1964),https://www.whosampled.com/The-Beatles/A-Hard-...,So Long,https://www.whosampled.com/Seba/So-Long
