In [2]:
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
from spotipy import oauth2
import csv
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LinearRegression
from sklearn.ensemble.forest import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
import base64
import pandas as pd
import warnings



In [3]:
def get_credentials():
    """Return the Spotify app credentials as ``(client_id, client_secret, redirect_uri)``.

    Each value is taken from the environment when set (``SPOTIPY_CLIENT_ID``,
    ``SPOTIPY_CLIENT_SECRET``, ``SPOTIPY_REDIRECT_URI``) so secrets can be
    supplied without editing the notebook. An unset or empty variable falls
    back to the value that used to be hard-coded here.

    Returns:
        tuple[str, str, str]: client id, client secret and redirect URI.
    """
    import os  # local import so this cell does not depend on the import cell

    # SECURITY: the fallback literals below are committed in the notebook and
    # must be treated as leaked. They are kept only so existing runs keep
    # working; rotate them and provide the real values via the environment.
    cid = os.environ.get('SPOTIPY_CLIENT_ID') or '8c4cb3ef1c0c412181ea69d07b3df0ca'
    secret = os.environ.get('SPOTIPY_CLIENT_SECRET') or 'a07e05c762a24e12933c4c118c6c18ca'
    redirect_uri = os.environ.get('SPOTIPY_REDIRECT_URI') or 'http://localhost:8910/callback'
    return cid, secret, redirect_uri

In [4]:
def read_data(dataset_path):
    """Load a CSV file into a NumPy array of strings, header row included.

    Args:
        dataset_path: Path of the CSV file to read.

    Returns:
        numpy.ndarray: one row per CSV record (the first row is whatever the
        file's first record is); every cell is kept as text.
    """
    # newline='' hands line-ending handling to the csv module. Without it,
    # universal-newline translation rewrites '\r\n' inside quoted fields.
    with open(dataset_path, 'r', newline='') as csv_file:
        rows = list(csv.reader(csv_file))
    return np.array(rows)

In [5]:
class KNNPredictor:
    """Recommend the songs nearest to a seed song in standardized feature space.

    ``data`` is a 2-D array of strings whose first row holds the column names.
    Construction preprocesses the table and fits the neighbour model, leaving:

    * ``self.ids``   - 1-D array of song ids, in shuffled row order
    * ``self.data``  - standardized float64 feature matrix, same row order
    * ``self.model`` - fitted ``NearestNeighbors`` instance
    """

    # Columns that are removed before the table is cast to float64.
    _NON_FEATURE_COLUMNS = ('key', 'artists', 'release_date', 'name', 'id')

    def __init__(self, data):
        self.data = data
        self.preprocessing()
        self.train()

    def preprocessing(self):
        """Split the raw table into shuffled ids and a numeric feature matrix."""
        dropped = []
        self.id_index = -1
        for col, column_name in enumerate(self.data[0]):
            if column_name in self._NON_FEATURE_COLUMNS:
                dropped.append(col)
                if column_name == 'id':
                    self.id_index = col

        # Shuffle a copy: slicing off the header yields a view, and shuffling
        # that view in place would silently reorder the caller's array too.
        rows = np.array(self.data[1:, :])
        np.random.shuffle(rows)
        self.ids = np.ndarray.flatten(np.array(rows[:, self.id_index:self.id_index + 1]))
        self.data = np.delete(rows, dropped, axis=1).astype('float64')

    def train(self):
        """Standardize the features and fit a 26-neighbour ball-tree model."""
        self.model = NearestNeighbors(n_neighbors = 26, algorithm = 'ball_tree')
        scalar = StandardScaler()
        scalar.fit(self.data)
        self.data = scalar.transform(self.data)
        self.model.fit(self.data)

    def predict(self, song_id):
        """Return the ids of the songs closest to ``song_id``.

        Args:
            song_id: Id of the seed song; must be present in the dataset.

        Returns:
            numpy.ndarray: ids of the neighbours, nearest first, with the seed
            itself left out (one fewer than the model's ``n_neighbors``).

        Raises:
            ValueError: if ``song_id`` does not occur in the dataset.
        """
        matches = np.flatnonzero(self.ids == song_id)
        if matches.size == 0:
            raise ValueError('song_id %r was not found in the dataset' % (song_id,))
        # The previous linear scan kept the last matching row; do the same.
        choose = int(matches[-1])

        distances, indices = self.model.kneighbors([self.data[choose]])
        neighbors = np.ndarray.flatten(indices)
        keep = neighbors.size - 1
        # Remove the seed by index instead of assuming it is the first hit:
        # rows with identical features tie at distance 0 and may precede it.
        neighbors = neighbors[neighbors != choose][:keep]
        return self.ids[neighbors]

In [14]:
class LinearPredictor:
    """Rank every song with a linear model fitted on a seed song's neighbours.

    ``data`` is a 2-D array of strings whose first row holds the column names.
    After construction the instance exposes:

    * ``self.ids``       - 1-D array of song ids, in shuffled row order
    * ``self.song_info`` - the ``artists`` and ``name`` columns, same row order
    * ``self.data``      - float64 feature matrix, same row order
    """

    # Columns that are removed before the table is cast to float64.
    _NON_FEATURE_COLUMNS = ('key', 'artists', 'release_date', 'name', 'id')

    def __init__(self, data):
        self.data = data
        self.preprocessing()

    def preprocessing(self):
        """Split the raw table into shuffled ids, song info and numeric features."""
        dropped = []
        info = []
        self.id_index = -1
        for col, column_name in enumerate(self.data[0]):
            if column_name in self._NON_FEATURE_COLUMNS:
                dropped.append(col)
                if column_name == 'id':
                    self.id_index = col
            if column_name in ('artists', 'name'):
                info.append(col)

        # Shuffle a copy: slicing off the header yields a view, and shuffling
        # that view in place would silently reorder the caller's array too.
        rows = np.array(self.data[1:, :])
        np.random.shuffle(rows)
        self.ids = np.ndarray.flatten(np.array(rows[:, self.id_index:self.id_index + 1]))
        self.song_info = np.array(rows[:, info])
        self.data = np.delete(rows, dropped, axis=1).astype('float64')

    def train_and_predict(self, song_id):
        """Return up to 100 song ids ranked by a model trained around ``song_id``.

        The seed's nearest neighbours (at most 5000) get a relevance label that
        starts at 100 and drops by 0.0006 per rank. A linear regression fitted
        on those labels scores the whole dataset, and the 100 best-scoring
        songs whose score does not exceed 100 are returned.

        Args:
            song_id: Id of the seed song; must be present in the dataset.

        Returns:
            numpy.ndarray: song ids, highest predicted relevance first.

        Raises:
            ValueError: if ``song_id`` does not occur in the dataset.
        """
        # Resolve the seed first so a bad id fails fast instead of reusing a
        # stale ``self.choose`` from an earlier call (or not having one at all).
        matches = np.flatnonzero(self.ids == song_id)
        if matches.size == 0:
            raise ValueError('song_id %r was not found in the dataset' % (song_id,))
        # The previous linear scan kept the last matching row; do the same.
        self.choose = int(matches[-1])

        # Never ask for more neighbours than there are rows.
        n_neighbors = min(5001, self.data.shape[0])
        knnmodel = NearestNeighbors(n_neighbors = n_neighbors, algorithm = 'ball_tree')
        scalar = StandardScaler()
        scalar.fit(self.data)
        self.data = scalar.transform(self.data)
        knnmodel.fit(self.data)

        distances, indices = knnmodel.kneighbors([self.data[self.choose]])
        neighbors = np.ndarray.flatten(indices)
        keep = neighbors.size - 1
        # Remove the seed by index instead of assuming it is the first hit.
        indices = neighbors[neighbors != self.choose][:keep]

        # Relevance labels: 100 for the nearest neighbour, then -0.0006 per rank.
        threshold = 100
        relevances = []
        relevance = threshold
        for _ in indices:
            relevances.append(relevance)
            relevance -= 0.0006

        song_train = self.data[indices]
        self.model = LinearRegression().fit(song_train, relevances)
        predictions = np.asarray(self.model.predict(self.data), dtype='float64')
        sort_indices = np.argsort(predictions)[::-1]
        predictions = predictions[sort_indices]

        # Skip songs scored above the maximum label; start at the first score
        # that is <= threshold (or at 0 when there is none).
        at_or_below = np.flatnonzero(predictions <= threshold)
        first_threshold_index = int(at_or_below[0]) if at_or_below.size else 0

        playlist_added = self.ids[sort_indices][first_threshold_index:first_threshold_index + 100]
        return playlist_added

In [7]:
class RandomForestPredictor:
    """Recommend dataset songs that resemble an existing Spotify playlist.

    ``data`` is a 2-D array of strings whose first row is the header; ``sp`` is
    an authenticated Spotify client used to read the playlist, the tracks and
    their audio features. Construction runs ``preprocess()``, which leaves:

    * ``self.song_info2``     - ``name`` / ``id`` / ``artists`` lists of the dataset
    * ``self.songs``          - the playlist items returned by the API
    * ``self.features_array`` - float64 matrix with one row per dataset song
      plus a final row holding the playlist's seed point
    """

    # (feature name, dataset column index), in ``features_array`` column order.
    # The indices are positional and assume the CSV keeps this column layout.
    _FEATURE_COLUMNS = (
        ('valence', 17),
        ('acousticness', 0),
        ('danceability', 2),
        ('popularity', 13),
        ('energy', 4),
        ('instrumentalness', 7),
        ('liveness', 9),
        ('loudness', 10),
        ('mode', 11),
        ('speechiness', 15),
        ('tempo', 16),
        ('year', 18),
    )
    _NAME_COLUMN = 12
    _ID_COLUMN = 6
    _ARTISTS_COLUMN = 1
    # Upper bound on the neighbourhood used for training, and how many
    # consecutive neighbours share one rating (10 for the nearest group).
    _MAX_NEIGHBORS = 5000
    _RATING_BUCKET = 500

    def __init__(self, data, sp, username, playlist_id):
        self.data = data
        self.sp = sp
        self.username = username
        self.playlist_id = playlist_id
        self.preprocess()

    def _fetch_playlist_features(self):
        """Return one feature dict per playlist track that has audio features."""
        rows = []
        for item in self.songs:
            track = item.get('track')
            # Local files (and removed tracks) have no Spotify id to query.
            if not track or track.get('id') is None:
                continue
            track_id = track['id']
            # One request supplies both popularity and release year.
            track_info = self.sp.track(track_id)
            for audio in self.sp.audio_features(track_id):
                if audio is None:  # the API has no analysis for this track
                    continue
                row = dict(audio)
                row['popularity'] = track_info['popularity']
                row['year'] = int(track_info['album']['release_date'][:4])
                rows.append(row)
        return rows

    def preprocess(self):
        """Build ``song_info2``, ``songs`` and ``features_array``.

        The seed row is a weighted mean of the playlist tracks: the last track
        counts ``n`` times and every other track once, divided by ``2n - 1``.

        Raises:
            ValueError: if no playlist track yields audio features.
        """
        self.data = self.data[1:, :]  # drop the header row
        self.song_info2 = {
            'name': list(self.data[:, self._NAME_COLUMN]),
            'id': list(self.data[:, self._ID_COLUMN]),
            'artists': list(self.data[:, self._ARTISTS_COLUMN]),
        }

        sourcePlaylist = self.sp.user_playlist(self.username, self.playlist_id)
        self.songs = sourcePlaylist["tracks"]["items"]

        playlist_rows = self._fetch_playlist_features()
        if not playlist_rows:
            raise ValueError('playlist has no tracks with audio features')

        n_tracks = len(playlist_rows)
        n_songs = self.data.shape[0]
        features_array = np.zeros((n_songs + 1, len(self._FEATURE_COLUMNS)))
        for col, (feature_name, source_col) in enumerate(self._FEATURE_COLUMNS):
            features_array[:n_songs, col] = self.data[:, source_col].astype('float64')
            values = np.array([row[feature_name] for row in playlist_rows], dtype='float64')
            # The former per-track loop overwrote its result on every pass, so
            # only the last track's weighting survived; compute that one once.
            # Every feature (instrumentalness included) sums the *other* tracks.
            features_array[n_songs, col] = (
                n_tracks * values[-1] + np.sum(values[:-1])
            ) / (2 * n_tracks - 1)

        self.features_array = features_array.astype('float64')

    def train_and_predict(self):
        """Return ids of dataset songs predicted to fit the playlist best.

        The nearest dataset songs to the seed row are rated 10 downwards in
        groups of 500 by distance rank. A grid-searched random forest is
        trained per stratified fold; the estimator of the best-scoring fold
        classifies that fold's held-out songs, and those predicted as rating
        10 are returned (at most three times the playlist length).

        Returns:
            numpy.ndarray: recommended song ids.
        """
        n_songs = self.data.shape[0]
        # Never ask for more neighbours than there are songs.
        n_neighbors = min(self._MAX_NEIGHBORS, n_songs)
        model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree')

        scalar = StandardScaler()
        scalar.fit(self.features_array)
        self.features_array = scalar.transform(self.features_array)
        seed_point = self.features_array[n_songs, :]
        model.fit(self.features_array[:n_songs, :])
        distances, indices = model.kneighbors([seed_point])
        neighbor_rows = indices[0]

        x_train2 = self.features_array[neighbor_rows]
        y_train2 = 10 - np.arange(n_neighbors) // self._RATING_BUCKET

        best_score = -np.inf
        best_estimator = None
        best_test_index = None
        skf = StratifiedKFold(n_splits=5, shuffle=True)
        # Silence warnings only while fitting instead of for the whole session.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            for train_index, test_index in skf.split(x_train2, y_train2):
                gcv1 = GridSearchCV(
                    RandomForestClassifier(n_estimators=100, n_jobs=-1),
                    n_jobs=-1,
                    param_grid={'max_features': range(3, 6),
                                'min_samples_leaf': [4, 5, 6],
                                'max_depth': [11, 12, 13, 14, 15]},
                )
                gcv1.fit(x_train2[train_index], y_train2[train_index])
                print(gcv1.best_estimator_)
                print(gcv1.best_score_)
                if gcv1.best_score_ > best_score:
                    # Track the running best; without this update every fold
                    # "won" and the last fold was always the one kept.
                    best_score = gcv1.best_score_
                    best_estimator = gcv1.best_estimator_
                    best_test_index = test_index

            # GridSearchCV already refits its best estimator on the fold's
            # training data, so it can be used for prediction directly.
            y_pred_class = best_estimator.predict(x_train2[best_test_index])

        # Keep only held-out songs predicted as top rated, mapped back from
        # fold position -> neighbour rank -> dataset row.
        top_rated = np.flatnonzero(y_pred_class == 10)
        recomended_song_indexes = neighbor_rows[best_test_index[top_rated]]

        tracks = []
        for k in recomended_song_indexes[:len(self.songs) * 3]:
            tracks.append(self.song_info2['id'][k])
        return np.array(tracks)

In [8]:
def create_playlist(playlist_name, tracks, username, client_id, client_secret, redirect_uri):
    """Create a playlist for ``username`` and fill it with ``tracks``.

    Args:
        playlist_name: Name of the playlist to create.
        tracks: Track ids to add; an array or list of any shape, which is
            flattened before use.
        username: Spotify user that will own the playlist.
        client_id: Spotify application client id.
        client_secret: Spotify application client secret.
        redirect_uri: OAuth redirect URI registered for the application.
    """
    scope = "playlist-modify-public"
    token = util.prompt_for_user_token(username,scope,client_id=client_id,client_secret=client_secret,redirect_uri=redirect_uri)
    sp = spotipy.Spotify(auth=token)

    # Use the playlist object returned by the create call rather than assuming
    # the new playlist is the first entry of the user's playlist listing.
    new_playlist = sp.user_playlist_create(username, name=playlist_name)

    track_ids = np.asarray(tracks).flatten().tolist()
    # The Web API accepts at most 100 items per add request, so send batches.
    batch_size = 100
    for start in range(0, len(track_ids), batch_size):
        sp.user_playlist_add_tracks(username, new_playlist['id'], track_ids[start:start + batch_size])

In [9]:
def create_playlist_kNN():
    """Interactively build a playlist from the nearest neighbours of one song.

    Loads ``./data/data.csv``, fits a ``KNNPredictor`` on it, then prompts for
    a song id, a username and a playlist name before creating the playlist.
    """
    client_id, client_secret, callback_uri = get_credentials()
    predictor = KNNPredictor(read_data('./data/data.csv'))
    seed_song = input('Enter song ID: ')
    recommended = predictor.predict(seed_song)
    owner = input('Enter username: ')
    title = input('Enter playlist name: ')
    create_playlist(
        title,
        recommended,
        owner,
        client_id=client_id,
        client_secret=client_secret,
        redirect_uri=callback_uri,
    )

def create_playlist_LinearRegression():
    """Interactively build a playlist with the linear-regression predictor.

    Loads ``./data/data.csv`` into a ``LinearPredictor``, then prompts for a
    song id, a username and a playlist name before creating the playlist.
    """
    client_id, client_secret, callback_uri = get_credentials()
    predictor = LinearPredictor(read_data('./data/data.csv'))
    seed_song = input('Enter song ID: ')
    recommended = predictor.train_and_predict(seed_song)
    owner = input('Enter username: ')
    title = input('Enter playlist name: ')
    create_playlist(
        title,
        recommended,
        owner,
        client_id=client_id,
        client_secret=client_secret,
        redirect_uri=callback_uri,
    )

def create_playlist_RandomForestRegression():
    """Interactively build a playlist modelled on an existing playlist.

    Loads ``./data/data.csv``, prompts for a username and a source playlist
    id, authorizes a Spotify client, runs ``RandomForestPredictor`` and then
    prompts for the name of the playlist to create.
    """
    client_id, client_secret, callback_uri = get_credentials()
    dataset = read_data('./data/data.csv')

    owner = input('Enter username: ')
    source_playlist = input('Enter Playlist ID: ')
    access_token = util.prompt_for_user_token(
        owner,
        "user-library-read playlist-modify-public playlist-read-private user-library-modify",
        client_id=client_id,
        client_secret=client_secret,
        redirect_uri=callback_uri,
    )
    client = spotipy.Spotify(auth=access_token)
    predictor = RandomForestPredictor(dataset, client, owner, source_playlist)
    recommended = predictor.train_and_predict()
    title = input('Enter playlist name: ')
    create_playlist(
        title,
        recommended,
        owner,
        client_id=client_id,
        client_secret=client_secret,
        redirect_uri=callback_uri,
    )

In [15]:
# Run the linear-regression workflow; it prompts for a song ID, a username
# and a playlist name, then creates the playlist on Spotify.
create_playlist_LinearRegression()

Enter song ID: 1C2QJNTmsTxCDBuIgai8QV
Enter username: ardabyk07
Enter playlist name: Playlist Generated for Ercüment Çiçek (with Linear Regression)
