In [1]:
import pandas as pd
import numpy as np
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    IntegerType, StringType, IntegerType, FloatType, 
    StructField, StructType, DoubleType
)
from pyspark.ml.recommendation import ALS
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from pyspark.sql.functions import udf, col, isnan
import psycopg2
from pandas.io import sql
from sklearn.model_selection import train_test_split
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
conn = psycopg2.connect(dbname='mixmaker', host='localhost')
cur = conn.cursor()

In [3]:
query = 'SELECT * FROM artists;'
artists = sql.read_sql_query(query, conn)

In [4]:
artists.head()

Unnamed: 0,id,name,url,scraped
0,13,Isaac Hayes,https://www.whosampled.com/Isaac-Hayes/,0
1,14,KC & the Sunshine Band,https://www.whosampled.com/KC-%26-the-Sunshine...,0
2,15,Stevie Wonder,https://www.whosampled.com/Stevie-Wonder/,0
3,16,"Earth, Wind & Fire","https://www.whosampled.com/Earth,-Wind-%26-Fire/",0
4,17,Bee Gees,https://www.whosampled.com/Bee-Gees/,0


In [5]:
query = 'SELECT * FROM songs;'
songs = sql.read_sql_query(query, conn)

In [6]:
len(songs)

25390

In [7]:
query = 'SELECT * FROM connections;'
connections = sql.read_sql_query(query, conn)

In [8]:
connections = connections.drop('remixed_by_song_id', axis=1)

In [9]:
connections['is_connected'] = np.ones(len(connections))

In [10]:
connections.head()

Unnamed: 0,song_id,sampled_by_song_id,is_connected
0,1,374,1.0
1,1,375,1.0
2,1,376,1.0
3,1,377,1.0
4,1,378,1.0


In [11]:
spark_df = spark.createDataFrame(connections)

In [12]:
train, test = spark_df.randomSplit([0.8, 0.2], seed=216)

In [13]:
als_model = ALS(
    itemCol='song_id',
    userCol='sampled_by_song_id',
    ratingCol='is_connected',
    nonnegative=True,
    regParam=0.01,
    rank=10,
    implicitPrefs=True)

recommender = als_model.fit(train)

test_predictions = recommender.transform(test)

In [122]:
test_predictions.show()

+-------+------------------+------------+------------+
|song_id|sampled_by_song_id|is_connected|  prediction|
+-------+------------------+------------+------------+
|    463|              9170|         1.0|         NaN|
|    471|              9896|         1.0| 7.016704E-7|
|    471|              9170|         1.0|4.4592525E-6|
|  13285|             14299|         1.0|         NaN|
|  14570|             14987|         1.0|0.0026822868|
|    392|              9170|         1.0|         0.0|
|  11280|             11800|         1.0|         NaN|
|  11280|             11713|         1.0|         NaN|
|  11280|             11835|         1.0|  0.01750579|
|  11280|             11808|         1.0|         NaN|
|  11280|             11694|         1.0|         NaN|
|  11280|             11736|         1.0|         NaN|
|  11280|             11717|         1.0|         NaN|
|  11280|              1125|         1.0|         NaN|
|  11280|             11764|         1.0|         NaN|
|  11280| 

In [68]:
preds_df = test_predictins.toPandas()

In [69]:
preds_df.head()

Unnamed: 0,song_id,sampled_by_song_id,is_connected,prediction
0,463,9170,1.0,
1,471,9896,1.0,0.609067
2,471,9170,1.0,0.505995
3,13285,14299,1.0,
4,14570,14987,1.0,0.610512


In [70]:
check = preds_df.sort_values('prediction', ascending=False).head(10)

In [78]:
lookup_id = 612
lookup_df = preds_df.loc[preds_df['sampled_by_song_id']==lookup_id,:]

In [79]:
lookup_df.sort_values('prediction', ascending=False).head()

Unnamed: 0,song_id,sampled_by_song_id,is_connected,prediction
262,28,612,1.0,1.257137
841,9152,612,1.0,1.076854
3387,83,612,1.0,1.069884
3492,42,612,1.0,1.061826
1655,20,612,1.0,1.035928


In [80]:
songs[songs['name'] == 'All Night']

Unnamed: 0,id,artist_id,name,url,scraped
2188,2214,1324,All Night,https://www.whosampled.com/Milira/All-Night,0
4535,4563,2758,All Night,https://www.whosampled.com/Maurice-Tamraz/All-...,0
13413,13428,4207,All Night,https://www.whosampled.com/Angelo-Ferreri/All-...,0
15351,15368,6788,All Night,https://www.whosampled.com/Luxury-Elite/All-Night,0
15366,15383,6798,All Night,https://www.whosampled.com/Cinque/All-Night,0


In [92]:
artists.loc[artists['name'].apply(lambda x: x.startswith('Beyonc'))]

Unnamed: 0,id,name,url,scraped
6907,6908,Beyoncé,https://www.whosampled.com/Beyonc%C3%A9/,0


In [93]:
songs[songs['artist_id'] == 6908]

Unnamed: 0,id,artist_id,name,url,scraped
15559,15579,6908,Black Culture,https://www.whosampled.com/Beyonc%C3%A9/Black-...,0


In [71]:
preds_df.sort_values('prediction', ascending=False).head(10)

Unnamed: 0,song_id,sampled_by_song_id,is_connected,prediction
3908,134,421,1.0,1.293696
2993,9774,4652,1.0,1.288245
112,65,1471,1.0,1.280834
2851,82,4064,1.0,1.276294
2149,9,627,1.0,1.264793
262,28,612,1.0,1.257137
90,31,2830,1.0,1.25428
1748,120,3331,1.0,1.21286
222,34,2599,1.0,1.210035
110,65,2297,1.0,1.203673


In [103]:
lookup_id = 627
lookup_df = preds_df.loc[preds_df['sampled_by_song_id']==lookup_id,:]
songs[songs['id'] == lookup_id]

Unnamed: 0,id,artist_id,name,url,scraped
601,627,251,Keep on Doin',https://www.whosampled.com/Kev-E-Kev/Keep-on-D...,0


In [28]:
als_model = ALS(
    itemCol='song_id',
    userCol='sampled_by_song_id',
    ratingCol='is_connected',
    nonnegative=True,
    regParam=0.01,
    rank=50)

recommender = als_model.fit(train)

In [29]:
train_preds = recommender.transform(train)

In [30]:
evaluator = RegressionEvaluator(metricName='rmse', 
                                labelCol='is_connected',
                                predictionCol='prediction')

In [31]:
rmse_train = evaluator.evaluate(train_preds)

In [32]:
rmse_train

0.01757780244861475

In [163]:
rmse = evaluator.evaluate(test_predictions)

In [164]:
test_predictions = recommender.transform(test)

In [165]:
rmse_test = evaluator.evaluate(
    test_predictions.filter("prediction + 1 > prediction"))

In [166]:
rmse_test

0.2810005494691985

In [176]:
# randomly generate test data with 0's
df = test.toPandas()

In [177]:
df.head()

Unnamed: 0,song_id,sampled_by_song_id,is_connected
0,1,374,1.0
1,1,376,1.0
2,1,378,1.0
3,1,382,1.0
4,1,383,1.0


In [182]:
len(df.loc[(df['song_id'] == 1) & (df['sampled_by_song_id'] == 374),:])

1

In [185]:
df['song_id'].unique()

array([    1,     2,     3,     4,     5,     6,     8,     9,    10,
          11,    12,    13,    14,    15,    16,    17,    18,    19,
          20,    21,    22,    23,    25,    26,    28,    29,    30,
          31,    32,    33,    34,    35,    36,    37,    38,    39,
          40,    41,    42,    43,    44,    45,    46,    47,    48,
          49,    50,    51,    52,    53,    54,    55,    57,    58,
          60,    61,    62,    63,    64,    65,    66,    67,    68,
          69,    70,    71,    73,    74,    75,    76,    77,    78,
          79,    81,    82,    83,    84,    85,    86,    87,    88,
          89,    91,    94,    95,    96,    97,    98,    99,   101,
         102,   104,   109,   111,   113,   115,   117,   118,   119,
         120,   121,   122,   124,   127,   128,   130,   131,   132,
         134,   138,   139,   140,   141,   142,   145,   146,   151,
         156,   157,   158,   159,   163,   164,   166,   168,   173,
         176,   178,

In [188]:
id1 = np.random.choice(df['song_id'].unique())
id2 = np.random.choice(df['sampled_by_song_id'].unique())

In [189]:
id1, id2

(864, 1984)

In [190]:
len(df.loc[(df['song_id'] == id1) & (df['sampled_by_song_id'] == id2),:])

0

In [None]:
if len(df.loc[(df['song_id'] == id1)
            & (df['sampled_by_song_id'] == id2),:]) == 0:
    pd.

In [194]:
test_predictions.persist().count()

3967

In [203]:
song_id_df = test_predictions.select('song_id').toPandas()

In [205]:
song_id_df['song_id'].unique()

array([  463,   471, 13285, 14570,   392, 11280, 11316, 12367,    31,
       13207, 14377, 14420,    85,   451,    65,   458, 12210, 12284,
       12322,    53, 12207, 14365,    78, 12238, 12315, 13214, 14437,
         375,  9968, 12169, 12182, 12384,    34, 12179,   101,   115,
       12362, 14410,    81,   385, 11286, 12287,    28,   183,   436,
        9781, 14367, 12269,   406,   412, 12191, 14455,  9687, 13301,
          76,  9771, 14644,    26, 12329,   384, 12298, 12337, 12404,
       14375, 14391,    44,   159,   192,  9787, 12192, 11326,   460,
        9840, 10010, 11308,    12,  9824, 14575,  9825,   388,   417,
       12232, 13248,    91,   409, 14351,  9783, 11350, 14364, 13219,
          22,   128, 13295, 14415,   122, 12170, 13275, 14436, 14453,
         157,  9769, 12165,   246,  9847, 11311, 12193, 14534,   111,
         224,    47,   140,   416,   132,  9782,   185, 14354,   386,
       11330, 12168,   473, 12316,   146, 12167, 14349, 12190, 14417,
           1, 12186,

In [196]:
test_predictions.select('sampled_by_song_id').distinct().count()

3613

In [206]:
a = [(5,6), (2,3)]

In [207]:
pd.DataFrame(a)

Unnamed: 0,0,1
0,5,6
1,2,3


In [213]:
test_predictions.persist().count()

3967

In [214]:
test_predictions.filter('song_id = 1 and sampled_by_song_id = 496').count()

1

In [17]:
cd /Users/brettashley/galvanize/dsi/00-Capstone/mix-maker/src

/Users/brettashley/galvanize/dsi/00-Capstone/mix-maker/src


In [18]:
import model

In [19]:
sr = model.SongRecommender()

In [295]:
new = _

In [296]:
new

[(9767, 2815), (14456, 10660), (14396, 2404), (94, 4005), (83, 9523)]

In [298]:
df.columns

Index(['song_id', 'sampled_by_song_id', 'is_connected', 'prediction'], dtype='object')

In [301]:
new_df = pd.DataFrame(new, columns=df.columns[:2])

In [302]:
new_df['is_connected'] = 0

In [303]:
new_df.head()

Unnamed: 0,song_id,sampled_by_song_id,is_connected
0,9767,2815,0
1,14456,10660,0
2,14396,2404,0
3,94,4005,0
4,83,9523,0


In [306]:
test_pd = test.toPandas()

In [307]:
test_pd.head()

Unnamed: 0,song_id,sampled_by_song_id,is_connected
0,1,374,1.0
1,1,376,1.0
2,1,378,1.0
3,1,382,1.0
4,1,383,1.0


In [313]:
check = pd.concat([test_pd, new_df]).reset_index()

In [27]:
check.tail()

NameError: name 'check' is not defined

In [36]:
sr.generate_negative_targets(test,
                             'song_id',
                             'sampled_by_song_id',
                             'is_connected')

DataFrame[index: bigint, is_connected: double, sampled_by_song_id: bigint, song_id: bigint]

In [37]:
test_with_neg = _

In [22]:
test_with_neg.limit(10).show()

+-----+------------+------------------+-------+
|index|is_connected|sampled_by_song_id|song_id|
+-----+------------+------------------+-------+
|    0|         1.0|               374|      1|
|    1|         1.0|               376|      1|
|    2|         1.0|               378|      1|
|    3|         1.0|               382|      1|
|    4|         1.0|               383|      1|
|    5|         1.0|               391|      1|
|    6|         1.0|               397|      1|
|    7|         1.0|               403|      1|
|    8|         1.0|               410|      1|
|    9|         1.0|               422|      1|
+-----+------------+------------------+-------+



In [38]:
test_with_neg.count()

21654

In [39]:
preds = recommender.transform(test_with_neg)

In [25]:
preds.limit(20).show()

+-----+------------+------------------+-------+------------+
|index|is_connected|sampled_by_song_id|song_id|  prediction|
+-----+------------+------------------+-------+------------+
| 2510|         1.0|              9166|    471| 5.555993E-6|
| 3927|         1.0|             11980|  11317|         NaN|
| 4350|         1.0|             14300|  13285|         NaN|
| 4351|         1.0|             14304|  13289|         NaN|
| 4814|         1.0|             15801|  14832|         NaN|
| 4847|         1.0|             15725|  15727|2.2121003E-5|
|   69|         0.0|               932|  15727|2.7311917E-5|
| 5205|         1.0|             17232|  17420|9.559116E-10|
| 5204|         1.0|             12537|  17420|9.431096E-10|
| 5207|         1.0|             17234|  17420|9.559116E-10|
| 5206|         1.0|             17233|  17420|9.559116E-10|
| 7342|         1.0|             22725|  18911|         NaN|
| 7338|         1.0|             19065|  18911| 0.008421166|
| 7339|         1.0|    

In [40]:
sr.RMSE(preds, has_nan_values=True)

0.6761823030720727

In [41]:
preds_df = preds.toPandas()

In [55]:
preds_df.columns

Index(['index', 'is_connected', 'sampled_by_song_id', 'song_id', 'prediction'], dtype='object')

In [83]:
df = preds_df.loc[preds_df['is_connected'] == 0,:]
checks = df.sort_values('prediction', ascending=False).head(100)

In [84]:
checks.head()

Unnamed: 0,index,is_connected,sampled_by_song_id,song_id,prediction
10355,9716,0.0,23257,14395,1.965681
20818,166,0.0,23325,12164,1.9097
6832,8581,0.0,1664,11297,1.892884
15307,7694,0.0,2793,4275,1.888544
21107,5361,0.0,24851,12206,1.865279


In [85]:
check_merge_song1 = (checks.merge(songs, left_on='song_id', right_on='id')
                     .drop(columns=['artist_id', 'id', 'scraped', 'index'], axis=1))

In [86]:
check_merge_songs = (check_merge_song1.merge(songs, left_on='sampled_by_song_id', right_on='id')
    .drop(columns=['artist_id', 'id', 'scraped', 'is_connected'], axis=1))

In [87]:
check_merge_songs

Unnamed: 0,sampled_by_song_id,song_id,prediction,name_x,url_x,name_y,url_y
0,23257,14395,1.965681,Earth Song (1995),https://www.whosampled.com/Michael-Jackson/Ear...,Super Hero (My Knight),https://www.whosampled.com/The-House-Crew/Supe...
1,23325,12164,1.909700,All You Need Is Love (1967),https://www.whosampled.com/The-Beatles/All-You...,Fuck Jr Pxxxxx,https://www.whosampled.com/Albertslund-Terror-...
2,1664,11297,1.892884,"Mommy, What's a Funkadelic? (1970)","https://www.whosampled.com/Funkadelic/Mommy,-W...",In Effect,https://www.whosampled.com/DJ-Red-Alert-%26-Mi...
3,2793,4275,1.888544,The Michael Jackson Medley (Ultimix Megamix),https://www.whosampled.com/Michael-Jackson/The...,It Gets Rough Sometimes,https://www.whosampled.com/BTK/It-Gets-Rough-S...
4,2793,13213,1.610627,Ain't Nothing Like the Real Thing (1968)\nby M...,https://www.whosampled.com/Marvin-Gaye/Ain%27t...,It Gets Rough Sometimes,https://www.whosampled.com/BTK/It-Gets-Rough-S...
5,25193,4275,1.708179,The Michael Jackson Medley (Ultimix Megamix),https://www.whosampled.com/Michael-Jackson/The...,Bright Light,https://www.whosampled.com/Necrotype/Bright-Light
6,23271,4275,1.696342,The Michael Jackson Medley (Ultimix Megamix),https://www.whosampled.com/Michael-Jackson/The...,Streets of San Fran Brixton,https://www.whosampled.com/Genaside-II/Streets...
7,24851,12206,1.865279,A Hard Day's Night (1964),https://www.whosampled.com/The-Beatles/A-Hard-...,In Extremis,https://www.whosampled.com/Drum-Cypha/In-Extremis
8,25122,12206,1.865279,A Hard Day's Night (1964),https://www.whosampled.com/The-Beatles/A-Hard-...,Bunker Buster,https://www.whosampled.com/Resonant-Evil/Bunke...
9,3423,21903,1.861575,Popcorn With a Feeling,https://www.whosampled.com/James-Brown/Popcorn...,Helium Crew Anthem 2,https://www.whosampled.com/Syndakit/Helium-Cre...
