In [1]:
import pandas as pd
import numpy as np
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    IntegerType, StringType, IntegerType, FloatType, 
    StructField, StructType, DoubleType
)
from pyspark.ml.recommendation import ALS
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from pyspark.sql.functions import udf, col, isnan
import psycopg2
from pandas.io import sql
from sklearn.model_selection import train_test_split
from pyspark.ml.evaluation import RegressionEvaluator
from itertools import product

In [2]:
conn = psycopg2.connect(dbname='mixmaker2', host='localhost')
cur = conn.cursor()

In [3]:
query = 'SELECT * FROM artists;'
artists = sql.read_sql_query(query, conn)

In [4]:
artists.sort_values('id').head()

Unnamed: 0,id,name,url,scraped,scraped_spotify
4080,1,James Brown,https://www.whosampled.com/James-Brown/,1,0.0
3,2,Traditional Folk,https://www.whosampled.com/Traditional-Folk/,1,0.0
777,3,Kool & the Gang,https://www.whosampled.com/Kool-%26-the-Gang/,1,0.0
778,4,Funkadelic,https://www.whosampled.com/Funkadelic/,1,0.0
779,5,The Beatles,https://www.whosampled.com/The-Beatles/,1,0.0


In [5]:
query = 'SELECT * FROM songs;'
songs = sql.read_sql_query(query, conn)

In [6]:
len(songs)

42879

In [7]:
query = 'SELECT * FROM connections;'
connections = sql.read_sql_query(query, conn)

In [8]:
connections.head()

Unnamed: 0,song_id,sampled_by_song_id,is_connected
0,1,272,1
1,1,273,1
2,1,274,1
3,1,275,1
4,1,276,1


In [9]:
spark_df = spark.createDataFrame(connections)

In [10]:
train, test = spark_df.randomSplit([0.8, 0.2], seed=216)

In [None]:
als_model = ALS(
    itemCol='song_id',
    userCol='sampled_by_song_id',
    ratingCol='is_connected',
    nonnegative=True,
    regParam=0.01,
    rank=10)

recommender = als_model.fit(train)

test_predictions = recommender.transform(test)

In [None]:
test_predictions.show()

In [None]:
preds_df = test_predictins.toPandas()

In [None]:
preds_df.head()

In [None]:
check = preds_df.sort_values('prediction', ascending=False).head(10)

In [None]:
lookup_id = 612
lookup_df = preds_df.loc[preds_df['sampled_by_song_id']==lookup_id,:]

In [None]:
lookup_df.sort_values('prediction', ascending=False).head()

In [None]:
songs[songs['name'] == 'All Night']

In [None]:
artists.loc[artists['name'].apply(lambda x: x.startswith('Beyonc'))]

In [None]:
songs[songs['artist_id'] == 6908]

In [None]:
preds_df.sort_values('prediction', ascending=False).head(10)

In [None]:
als_model = ALS(
    itemCol='song_id',
    userCol='sampled_by_song_id',
    ratingCol='is_connected',
    nonnegative=True,
    alpha = 0.1,
    regParam=0.01,
    rank=50)

recommender = als_model.fit(train)

In [None]:
train_preds = recommender.transform(train)

In [None]:
evaluator = RegressionEvaluator(metricName='rmse', 
                                labelCol='is_connected',
                                predictionCol='prediction')

In [None]:
rmse_train = evaluator.evaluate(train_preds)

In [None]:
rmse_train

In [None]:
lookup_id = 627
lookup_df = preds_df.loc[preds_df['sampled_by_song_id']==lookup_id,:]
songs[songs['id'] == lookup_id]

In [None]:
test_predictions = recommender.transform(test)

In [None]:
rmse = evaluator.evaluate(test_predictions)

In [None]:
rmse_test = evaluator.evaluate(
    test_predictions.filter("prediction + 1 > prediction"))

In [None]:
rmse_test

In [None]:
test_predictions.filter("prediction + 1 > prediction").count()

In [None]:
test_predictions.count()

In [None]:
test_predictions.persist().count()

In [None]:
7138 / 12387

In [44]:
cd /Users/brettashley/galvanize/dsi/00-Capstone/mix-maker/src

/Users/brettashley/galvanize/dsi/00-Capstone/mix-maker/src


In [49]:
import model

In [50]:
sr = model.SongRecommender()

In [None]:
df.columns

In [None]:
sr.generate_negative_targets(test,
                             'song_id',
                             'sampled_by_song_id',
                             'is_connected',
                             seed=406)

In [None]:
test_with_neg = _

In [None]:
test_with_neg.limit(10).show()

In [None]:
test_with_neg.count()

In [None]:
preds = recommender.transform(test_with_neg)

In [None]:
sr.RMSE(preds, has_nan_values=True)

In [None]:
preds_df = preds.toPandas()

In [None]:
preds_df.columns

In [None]:
df = preds_df.loc[preds_df['is_connected'] == 0,:]
checks = df.sort_values('prediction', ascending=False).head(100)

In [None]:
checks.head()

In [None]:
check_merge_song1 = (checks.merge(songs, left_on='song_id', right_on='id')
                     .drop(columns=['artist_id', 'id', 'scraped', 'index'], axis=1))

In [None]:
check_merge_songs = (check_merge_song1.merge(songs, left_on='sampled_by_song_id', right_on='id')
    .drop(columns=['artist_id', 'id', 'scraped', 'is_connected',
                   'scraped_features_x', 'scraped_features_y'], axis=1))

In [None]:
check_merge_songs

In [None]:
check_merge_songs.columns = ['sampled_by_song_id', 'song_id',
                             'prediction', 'sample_name',
                             'sample_url', 'song_name',
                             'song_url']

In [None]:
check_merge_songs

In [11]:
df = train.toPandas()

In [19]:
len(df[['song_id', 'sampled_by_song_id']].values)

48687

In [34]:
set((x,y) for (x,y) in product(df.loc[:100,'song_id'], df.loc[:,'sampled_by_song_id']))

{(1, 37922),
 (1, 22872),
 (1, 32515),
 (1, 9145),
 (1, 37519),
 (1, 30184),
 (1, 5203),
 (1, 14854),
 (1, 19573),
 (1, 4664),
 (1, 12515),
 (1, 34212),
 (1, 42095),
 (1, 3912),
 (1, 41716),
 (1, 16575),
 (1, 26466),
 (1, 1493),
 (1, 24324),
 (1, 32207),
 (1, 946),
 (1, 37195),
 (1, 21985),
 (1, 29780),
 (1, 6687),
 (1, 36251),
 (1, 29233),
 (1, 13999),
 (1, 18590),
 (1, 3380),
 (1, 41136),
 (1, 25902),
 (1, 2961),
 (1, 10820),
 (1, 39082),
 (1, 24000),
 (1, 8225),
 (1, 38711),
 (1, 16014),
 (1, 21002),
 (1, 28925),
 (1, 5792),
 (1, 13675),
 (1, 20314),
 (1, 27917),
 (1, 17703),
 (1, 27626),
 (1, 2653),
 (1, 10240),
 (1, 40726),
 (1, 17292),
 (1, 25207),
 (1, 9965),
 (1, 15690),
 (1, 20726),
 (1, 30393),
 (1, 5484),
 (1, 15319),
 (1, 35048),
 (1, 5065),
 (1, 12732),
 (1, 34677),
 (1, 42296),
 (1, 19427),
 (1, 27222),
 (1, 2073),
 (1, 11980),
 (1, 40402),
 (1, 41861),
 (1, 1766),
 (1, 9385),
 (1, 39871),
 (1, 22741),
 (1, 7491),
 (1, 37380),
 (1, 30053),
 (1, 6952),
 (1, 14739),
 (1, 36

In [35]:
all_combos = _

In [36]:
len(all_combos)

29075

In [37]:
existing_combos = set((x,y) for [x,y] in df[['song_id', 'sampled_by_song_id']].values)

In [38]:
len(existing_combos)

48687

In [40]:
diff = all_combos - existing_combos

In [47]:
list(diff)

[(1, 37922),
 (1, 22872),
 (1, 32515),
 (1, 9145),
 (1, 37519),
 (1, 30184),
 (1, 5203),
 (1, 14854),
 (1, 19573),
 (1, 4664),
 (1, 12515),
 (1, 34212),
 (1, 42095),
 (1, 3912),
 (1, 41716),
 (1, 16575),
 (1, 26466),
 (1, 1493),
 (1, 24324),
 (1, 32207),
 (1, 946),
 (1, 37195),
 (1, 21985),
 (1, 29780),
 (1, 6687),
 (1, 36251),
 (1, 29233),
 (1, 13999),
 (1, 18590),
 (1, 3380),
 (1, 41136),
 (1, 25902),
 (1, 2961),
 (1, 10820),
 (1, 39082),
 (1, 24000),
 (1, 8225),
 (1, 38711),
 (1, 16014),
 (1, 21002),
 (1, 28925),
 (1, 5792),
 (1, 13675),
 (1, 20314),
 (1, 27917),
 (1, 17703),
 (1, 27626),
 (1, 2653),
 (1, 10240),
 (1, 40726),
 (1, 17292),
 (1, 25207),
 (1, 9965),
 (1, 15690),
 (1, 20726),
 (1, 30393),
 (1, 5484),
 (1, 15319),
 (1, 35048),
 (1, 5065),
 (1, 12732),
 (1, 34677),
 (1, 42296),
 (1, 19427),
 (1, 27222),
 (1, 2073),
 (1, 11980),
 (1, 40402),
 (1, 41861),
 (1, 1766),
 (1, 9385),
 (1, 39871),
 (1, 22741),
 (1, 7491),
 (1, 37380),
 (1, 30053),
 (1, 6952),
 (1, 14739),
 (1, 36

In [58]:
sr.generate_negative_targets(test, 'song_id', 'sampled_by_song_id', 'is_connected', get_all=True)

ValueError: cannot insert level_0, already exists

In [56]:
(1,2) + (0,)

(1, 2, 0)

In [59]:
train.limit(5).show()

+-------+------------------+------------+
|song_id|sampled_by_song_id|is_connected|
+-------+------------------+------------+
|      1|               273|           1|
|      1|               275|           1|
|      1|               277|           1|
|      1|               278|           1|
|      1|               279|           1|
+-------+------------------+------------+



In [60]:
pd.concat()

Unnamed: 0,song_id,sampled_by_song_id,is_connected
0,1,273,1
1,1,275,1
2,1,277,1
3,1,278,1
4,1,279,1
5,1,282,1
6,1,283,1
7,1,284,1
8,1,285,1
9,1,286,1
