## Downloading the million songs dataset

In [5]:
from urllib.request import urlretrieve
from zipfile import ZipFile 
import pandas as pd
import numpy as np
import sqlite3
import pandas as pd 

# track metadata columns
columns = ['track_id',
 'title',
 'song_id',
 'release',
 'artist_id',
 'artist_mbid',
 'artist_name',
 'duration',
 'artist_familiarity',
 'artist_hotttnesss',
 'year',
 'track_7digitalid',
 'shs_perf',
 'shs_work']

# Downloading the 
data_url = 'http://millionsongdataset.com/sites/default/files/challenge/train_triplets.txt.zip'
filename = 'train_triplets.txt.zip'
output_folder = "data"

urlretrieve(data_url, filename)

# loading the temp.zip and creating a zip object 
with ZipFile(filename, 'r') as zObject: 
    zObject.extractall(path=output_folder) 

h5_link = 'http://millionsongdataset.com/sites/default/files/AdditionalFiles/track_metadata.db'
filename = 'track_metadata.db'
urlretrieve(h5_link, filename)

conn_tmdb = sqlite3.connect('track_metadata.db')

res = conn_tmdb.execute("SELECT * FROM songs")
data = res.fetchall()

track_metadata = pd.DataFrame(data, columns=columns)
track_metadata.to_csv('track_metadata.csv', index=False)


KeyboardInterrupt



In [None]:
!pip3 install xgboost seaborn

## Data Loading

In [4]:
# Necessary imports 
import numpy as np 
import pandas as pd 
import xgboost as xg 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error as MSE 

## Reading Taste Profile Data

In [26]:
taste_profile = pd.read_csv('data/train_triplets.txt',sep='\t',header=None,names = ['user_id','song_id','play_count'], nrows = 500000)#
taste_profile.shape

(500000, 3)

## Visualizing first few rows

In [27]:
taste_profile.head()

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


## Song most played

In [37]:
print(taste_profile['song_id'].value_counts())

song_id
SOFRQTD12A81C233C0    989
SOAXGDH12A8C13F8A1    807
SONYKOW12AB01849C9    694
SOAUWYT12A81C206F1    673
SOSXLTC12AF72A7F54    608
                     ... 
SOMBOYD12AB0183BC9      1
SOMIVSA12A8C1419AA      1
SOMMKCC12AB01849B0      1
SOMTLGA12A6D4FA56D      1
SOOATPK12A8C13B1AF      1
Name: count, Length: 105069, dtype: int64


## Visualizing play count

In [28]:
print(taste_profile['play_count'].value_counts())

play_count
1      293506
2       76738
3       34081
5       23721
4       18887
        ...  
153         1
147         1
221         1
302         1
237         1
Name: count, Length: 199, dtype: int64


## Since the the frequency of high ouliter are low, lets remove the higher outliers. since lower bound is 1 and with highest frequency we are removing only upper bound.

In [29]:
def remove_high_outliers(taste_profile):
    percentile_25 = taste_profile['play_count'].quantile(0.25)
    percentile_75 = taste_profile['play_count'].quantile(0.75)
    iqr_play_count = percentile_75 - percentile_25
    # since lower bound is 1 and with highest frequency we are removing only upper bound.
    upper_play_count_bound = percentile_75 + 1.5 * iqr_play_count
    print(percentile_25, percentile_75, upper_play_count_bound)

    # Identify and removing upper bound outliers based on IQR
    taste_profile = taste_profile[(taste_profile['play_count'] <= upper_play_count_bound)]
    return taste_profile

In [30]:
print("Before outlier removal", len(taste_profile))
taste_profile = remove_high_outliers(taste_profile)
taste_profile = taste_profile.reset_index(drop=True)
print("After outlier removal", len(taste_profile))

Before outlier removal 500000
1.0 3.0 6.0
After outlier removal 457845


## we have remove around 42000 users enteries

##  Reading track metadata

In [32]:
# Convert the table to a Pandas DataFrame
track_metadata = pd.read_csv('track_metadata.csv')
track_metadata.head()

Unnamed: 0,track_id,title,song_id,release,artist_id,artist_mbid,artist_name,duration,artist_familiarity,artist_hotttnesss,year,track_7digitalid,shs_perf,shs_work
0,TRMMMYQ128F932D901,Silent Night,SOQMMHC12AB0180CB8,Monster Ballads X-Mas,ARYZTJS1187B98C555,357ff05d-848a-44cf-b608-cb34b5701ae5,Faster Pussy cat,252.05506,0.649822,0.394032,2003,7032331,-1,0
1,TRMMMKD128F425225D,Tanssi vaan,SOVFVAK12A8C1350D9,Karkuteillä,ARMVN3U1187FB3A1EB,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9,Karkkiautomaatti,156.55138,0.439604,0.356992,1995,1514808,-1,0
2,TRMMMRX128F93187D9,No One Could Ever,SOGTUKN12AB017F4F1,Butter,ARGEKB01187FB50750,3d403d44-36ce-465c-ad43-ae877e65adc4,Hudson Mohawke,138.97098,0.643681,0.437504,2006,6945353,-1,0
3,TRMMMCH128F425532C,Si Vos Querés,SOBNYVR12A8C13558C,De Culo,ARNWYLR1187B9B2F9C,12be7648-7094-495f-90e6-df4189d68615,Yerba Brava,145.05751,0.448501,0.372349,2003,2168257,-1,0
4,TRMMMWA128F426B589,Tangle Of Aspens,SOHSBXH12A8C13B0DF,Rene Ablaze Presents Winter Sessions,AREQDTE1269FB37231,,Der Mystic,514.29832,0.0,0.0,0,2264873,-1,0


## List of Artist with maximum songs

In [34]:
print(track_metadata['artist_name'].value_counts())

artist_name
Michael Jackson                                                               194
Johnny Cash                                                                   193
Beastie Boys                                                                  187
Joan Baez                                                                     181
Neil Diamond                                                                  176
                                                                             ... 
Queen vs The Miami Project                                                      1
Ernst Mosch Und Seine Original Egerländer Musikanten                            1
Jimmie Rodgers.                                                                 1
Cherokee;Erika Sulpacio;Tim Owens;Myron McKinley;Andrew Gooche;Taku Hirano      1
Killer Mike feat. Gangsta Pill and Nario of Grind Time Rap Gang                 1
Name: count, Length: 72665, dtype: int64


## Merging tracks and user data

In [36]:
combined_data = pd.merge(pd.DataFrame(track_metadata,columns=['song_id']),taste_profile,on="song_id",how='inner')
combined_data.head()

Unnamed: 0,song_id,user_id,play_count
0,SOSDCFG12AB0184647,9dc8b0f000792df949a0a0ad8eba2724335568f7,1
1,SOIMMJJ12AF72AD643,90b540cbb748698bed8eed0803b8b982b742317a,2
2,SOGNNYL12A6D4F910B,ef60f1da62dd1d05ae22e2990907b167c3171389,1
3,SOGNNYL12A6D4F910B,d9c5b04d124ee437ff58490ad3024fcaf054cf7a,1
4,SODDEQU12AAF3B2FC8,50fbf687c5eb1df4375671893c1394474822c87e,2


## Training XGBoost

In [42]:
taste_profile.head()

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [43]:
from sklearn.preprocessing import LabelEncoder 
data_encoder_song = LabelEncoder()
data_encoder_user = LabelEncoder()

taste_profile['song_id'] = data_encoder_song.fit_transform(taste_profile['song_id'])
taste_profile['user_id'] = data_encoder_user.fit_transform(taste_profile['user_id'])

In [44]:
X, y = taste_profile.iloc[:, :-1], taste_profile.iloc[:, -1] 

In [46]:
# Splitting 
train_X, test_X, train_y, test_y = train_test_split(X, y,  test_size = 0.3, random_state = 123) 

In [47]:
# Instantiation 
xgb_r = xg.XGBRegressor(objective ='reg:linear', n_estimators = 10, seed = 123) 

In [48]:
# Fitting the model 
xgb_r.fit(train_X, train_y) 



## Prediction on Test Data

In [49]:
# Predict the model 
pred = xgb_r.predict(test_X)

## Evaluation on Test Data

In [52]:
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
# Evaluate model
rmse = np.sqrt(mean_squared_error(test_y, pred))
r2 = r2_score(test_y, pred)
mae = np.sqrt(mean_absolute_error(test_y, pred))
print('Test RMSE:', rmse)
print('Test MAE:', mae)
print('Test r2:', r2)

Test RMSE: 1.2892230414358912
Test MAE: 0.9898306176776185
Test r2: 0.002058394623884019


## Running on Train Data

In [54]:
# Predict the model 
train_pred = xgb_r.predict(train_X)

## Evaluation on Train Data

In [55]:
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
# Evaluate model
train_rmse = np.sqrt(mean_squared_error(train_y, train_pred))
train_r2 = r2_score(train_y, train_pred)
train_mae = np.sqrt(mean_absolute_error(train_y, train_pred))
print('Test RMSE:', train_rmse)
print('Test MAE:', train_mae)
print('Test r2:', train_r2)

Test RMSE: 1.2911044698562515
Test MAE: 0.9898207276304773
Test r2: 0.006092014396978707
