## Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
%matplotlib inline

In [106]:
from surprise import SVD, NMF, accuracy
from surprise import Dataset, Reader

from surprise.model_selection import cross_validate, train_test_split
from sklearn.metrics import pairwise as met

## Dataset

In [2]:
df = pd.read_csv('CSV Data/updated-deal-details-infomation.csv', usecols = ['Deal_ID', 'Artists_Pitched'])
df.head()

Unnamed: 0,Deal_ID,Artists_Pitched
0,16934,"6ix-mhz-25276, fitoor-the-band, haze, ruhani, ..."
1,16935,"6ix-mhz-25276, fitoor-the-band, haze, ruhani, ..."
2,17041,"6ix-mhz-25276, fitoor-the-band, roots, randeep..."
3,17045,"6ix-mhz-25276, fitoor-the-band, roots, randeep..."
4,6480,"aakar, ysmusical, bhooteshwara, simetri, the-k..."


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14894 entries, 0 to 14893
Data columns (total 2 columns):
Deal_ID            14894 non-null int64
Artists_Pitched    1359 non-null object
dtypes: int64(1), object(1)
memory usage: 232.8+ KB


In [5]:
# Some User have not been pitched
# Remove them from the dataset
df.dropna(inplace = True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1359 entries, 0 to 1358
Data columns (total 2 columns):
Deal_ID            1359 non-null int64
Artists_Pitched    1359 non-null object
dtypes: int64(1), object(1)
memory usage: 31.9+ KB


### Artists Pitched

Artist that have been pitched to the User are stored via their ID in the artists column. The <i>artists</i> column is of object type.

Convert it into different columns and then melt the dataframe

In [28]:
t = df.set_index('Deal_ID')['Artists_Pitched'].str.split(r",", expand = True).stack()
t = t.reset_index(level = 1, drop = True).to_frame('Artists_Pitched')
t = pd.get_dummies(t, prefix = None, prefix_sep = "").groupby(level = 0).sum()

In [29]:
t.head()

Unnamed: 0_level_0,Artists_Pitched 10th-street,Artists_Pitched 4-degrees-of-freedom,Artists_Pitched 8-bit-culprit,Artists_Pitched Aardee,Artists_Pitched Abhijeet-Sawant,Artists_Pitched Akira's-Tricolour,Artists_Pitched Antara-,Artists_Pitched Antara-Mitra,Artists_Pitched Anuragi,Artists_Pitched Bhavya-Pandit,...,Artists_Pitchedyasser-desai,Artists_Pitchedzaeden,Artists_Pitchedzakir-khan,Artists_Pitchedzaroon,Artists_Pitchedzenith,Artists_Pitchedzenith-dance-troupe,Artists_Pitchedzephyrtone,Artists_Pitchedzheel-band,Artists_Pitchedzikrr,Artists_Pitchedzipout's-dance-addiction
Deal_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
717,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
727,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
753,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
786,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
t = t.rename(columns = lambda x: x.replace("Artists_Pitched", "").strip())
t = t.sort_index(axis = 1, ascending = False)
t.reset_index(inplace = True)

In [32]:
# Merging with the main datafram
df = pd.merge(df.drop('Artists_Pitched', axis = 1), t, on = ["Deal_ID"])
df.head()

Unnamed: 0,Deal_ID,zulfiqar-khan,zublee-baruah,zubeen-garg,ziya,zipout's-dance-addiction,zipout's-dance-addiction.1,zikrr,zikrr.1,zheel-band,...,Anuragi,Antara-Mitra,Antara-,Akira's-Tricolour,Abhijeet-Sawant,Aardee,8-bit-culprit,6ix-mhz-25276,4-degrees-of-freedom,10th-street
0,16934,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,16935,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,17041,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,17045,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,6480,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
df.set_index(['Deal_ID'], inplace = True)

```
0: Artists not pitched.
1: Artists pitched.
```

In [38]:
df.head()

Unnamed: 0_level_0,zulfiqar-khan,zublee-baruah,zubeen-garg,ziya,zipout's-dance-addiction,zipout's-dance-addiction,zikrr,zikrr,zheel-band,zephyrtone,...,Anuragi,Antara-Mitra,Antara-,Akira's-Tricolour,Abhijeet-Sawant,Aardee,8-bit-culprit,6ix-mhz-25276,4-degrees-of-freedom,10th-street
Deal_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16934,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16935,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
17041,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
17045,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### pre-process for SVD

In [41]:
indices = pd.Series(df.reset_index().index, index = df.reset_index().Deal_ID)

In [45]:
P = df.as_matrix()
P_mean = np.mean(P, axis = 1)
pitched_demeaned = P - P_mean.reshape(-1, 1)

In [44]:
df.as_matrix().shape

(1359, 2652)

In [46]:
pitched_demeaned.shape

(1359, 2652)

In [47]:
pitched_demeaned

array([[-0.00226244, -0.00226244, -0.00226244, ...,  0.99773756,
        -0.00226244, -0.00226244],
       [-0.00226244, -0.00226244, -0.00226244, ...,  0.99773756,
        -0.00226244, -0.00226244],
       [-0.00565611, -0.00565611, -0.00565611, ...,  0.99434389,
        -0.00565611, -0.00565611],
       ...,
       [-0.00226244, -0.00226244, -0.00226244, ..., -0.00226244,
        -0.00226244, -0.00226244],
       [-0.00829563, -0.00829563, -0.00829563, ..., -0.00829563,
        -0.00829563, -0.00829563],
       [-0.00377074, -0.00377074, -0.00377074, ..., -0.00377074,
        -0.00377074, -0.00377074]])

## SVD
SVD return 3 vectors.
    
    1. U: Left Singular Vector. It Represents Deal "Features"
    2. Sigma: Diagonal Vector. It Represents the weights.
    3. V: Right Singular Vector. It Represents Artists "Feature"
    
<center>A = U * Sigma * V.T</center>

In [48]:
# k = 20
U, sigma, V_t = svds(pitched_demeaned, k = 20)
sigma = np.diag(sigma)

In [93]:
# k = 50
U_50, sigma_50, V_t_50 = svds(pitched_demeaned, k = 50)
sigma_50 = np.diag(sigma_50)

In [49]:
all_predicted_pitches = np.dot(np.dot(U, sigma), V_t) + P_mean.reshape(-1, 1)

preds = pd.DataFrame(all_predicted_pitches, columns = df.columns)
preds.head()

Unnamed: 0,zulfiqar-khan,zublee-baruah,zubeen-garg,ziya,zipout's-dance-addiction,zipout's-dance-addiction.1,zikrr,zikrr.1,zheel-band,zephyrtone,...,Anuragi,Antara-Mitra,Antara-,Akira's-Tricolour,Abhijeet-Sawant,Aardee,8-bit-culprit,6ix-mhz-25276,4-degrees-of-freedom,10th-street
0,0.002352,0.002521,-0.005354,0.00233,0.003632,0.00308,0.001772,0.006954,0.002358,-0.000472,...,0.001514,0.001514,0.001514,0.001514,0.001514,0.001514,0.002182,0.008433,0.002356,-0.005007
1,0.002352,0.002521,-0.005354,0.00233,0.003632,0.00308,0.001772,0.006954,0.002358,-0.000472,...,0.001514,0.001514,0.001514,0.001514,0.001514,0.001514,0.002182,0.008433,0.002356,-0.005007
2,0.004332,0.004849,-0.032209,0.003447,0.011772,0.008413,0.001416,0.036031,0.004175,-0.000385,...,-0.001249,-0.001249,-0.001249,-0.001249,-0.001249,-0.001249,0.003423,0.038806,0.004117,-0.019762
3,0.004332,0.004849,-0.032209,0.003447,0.011772,0.008413,0.001416,0.036031,0.004175,-0.000385,...,-0.001249,-0.001249,-0.001249,-0.001249,-0.001249,-0.001249,0.003423,0.038806,0.004117,-0.019762
4,0.003121,0.003178,0.001876,0.0033,0.00311,0.003127,0.002967,0.00014,0.003165,-0.000676,...,0.003066,0.003066,0.003066,0.003066,0.003066,0.003066,0.003086,0.002878,0.003177,-0.001294


In [69]:
original = pd.read_csv('../GTQ+PYR Deals/Data/updated-deal-details-infomation.csv', usecols = ['Deal_ID', 'Artists_Pitched'])

In [86]:
original.head(10)

Unnamed: 0,Deal_ID,Artists_Pitched
0,16934,"6ix-mhz-25276, fitoor-the-band, haze, ruhani, ..."
1,16935,"6ix-mhz-25276, fitoor-the-band, haze, ruhani, ..."
2,17041,"6ix-mhz-25276, fitoor-the-band, roots, randeep..."
3,17045,"6ix-mhz-25276, fitoor-the-band, roots, randeep..."
4,6480,"aakar, ysmusical, bhooteshwara, simetri, the-k..."
5,7125,"aakar, ysmusical, mist, the-secret-mission, th..."
6,908,aakash-gupta
7,11486,aakash-gupta
8,3734,aakhira
9,18235,"aanya-singh, bhawna-kataria, kritika-sharma, k..."


In [78]:
def getRecommendation(DealID, num_recommendation = 5):
    # Get the index value for the Deal ID
    deal_row_num = indices[DealID]
    
    # Sort the Predictions in non-increasing order
    ## Select TOP 5 Artists ID
    sorted_user_predictions = preds.iloc[deal_row_num].sort_values(ascending = False)[:num_recommendation]
    
    # Get the Originally Pitched Artists on that DEAL
    original_pitched = original[(original['Deal_ID'] == DealID)].iloc[0]['Artists_Pitched'].split(",")
    
    # Recommended
    recommended = pd.DataFrame({'Artists': sorted_user_predictions.index, 'Pitched_value': sorted_user_predictions.values})
    
    return original_pitched, recommended

In [90]:
actual, recomm = getRecommendation(DealID = 18235)

In [91]:
actual

['aanya-singh',
 ' bhawna-kataria',
 ' kritika-sharma',
 ' kanika-srivastava',
 ' sonal-jain',
 ' vandana-bisht']

In [92]:
recomm

Unnamed: 0,Artists,Pitched_value
0,kanika-srivastava,0.636112
1,vandana-bisht,0.581564
2,kritika-sharma,0.524131
3,sonal-jain,0.326825
4,richa-bhatia,0.277438


In [95]:
all_predicted_pitches = np.dot(np.dot(U_50, sigma_50), V_t_50) + P_mean.reshape(-1, 1)

preds = pd.DataFrame(all_predicted_pitches, columns = df.columns)
preds.head()

Unnamed: 0,zulfiqar-khan,zublee-baruah,zubeen-garg,ziya,zipout's-dance-addiction,zipout's-dance-addiction.1,zikrr,zikrr.1,zheel-band,zephyrtone,...,Anuragi,Antara-Mitra,Antara-,Akira's-Tricolour,Abhijeet-Sawant,Aardee,8-bit-culprit,6ix-mhz-25276,4-degrees-of-freedom,10th-street
0,0.001463,0.005133,0.009209,0.001148,-0.009861,-0.006923,0.000108,0.002931,0.001398,0.014073,...,0.000371,0.000371,0.000371,0.000371,0.000371,0.000371,0.001176,0.315512,0.001338,-0.001200703
1,0.001463,0.005133,0.009209,0.001148,-0.009861,-0.006923,0.000108,0.002931,0.001398,0.014073,...,0.000371,0.000371,0.000371,0.000371,0.000371,0.000371,0.001176,0.315512,0.001338,-0.001200703
2,0.000628,0.007439,0.020212,-0.002474,-0.02129,-0.016086,-0.002749,0.025491,0.000296,0.033795,...,0.000708,0.000708,0.000708,0.000708,0.000708,0.000708,-0.000532,0.87572,2.1e-05,-0.002750444
3,0.000628,0.007439,0.020212,-0.002474,-0.02129,-0.016086,-0.002749,0.025491,0.000296,0.033795,...,0.000708,0.000708,0.000708,0.000708,0.000708,0.000708,-0.000532,0.87572,2.1e-05,-0.002750444
4,0.003317,0.00607,0.005469,0.003668,0.00331,0.003279,0.003104,-0.001505,0.003412,-0.001205,...,-0.000326,-0.000326,-0.000326,-0.000326,-0.000326,-0.000326,0.003179,-0.005775,0.003434,-6.332174e-07


In [96]:
actual, recomm = getRecommendation(DealID = 18235, num_recommendation=10)

In [97]:
actual

['aanya-singh',
 ' bhawna-kataria',
 ' kritika-sharma',
 ' kanika-srivastava',
 ' sonal-jain',
 ' vandana-bisht']

In [98]:
recomm

Unnamed: 0,Artists,Pitched_value
0,kanika-srivastava,0.88167
1,kritika-sharma,0.82328
2,vandana-bisht,0.809657
3,sonal-jain,0.557662
4,richa-bhatia,0.474343
5,shivam,0.402754
6,pratika-sood,0.284015
7,Kabir-Singh-Bhandari,0.262221
8,aanya-singh,0.259207
9,anchor-gaurav,0.257324


In [99]:
actual, recomm = getRecommendation(DealID = 6556, num_recommendation=10)

In [100]:
actual

['sunidhi-chauhan',
 ' benny-dayal',
 ' ammy-virk',
 ' monali-thakur',
 ' neha-kakkar',
 ' armaan-malik',
 ' diljit-dosanjh',
 ' badshah',
 ' ankit-tiwari',
 ' kailash-kher',
 ' hariharan',
 ' the-richa-sharma',
 ' vishal-and-shekhar',
 ' sukhwinder-singh']

In [101]:
recomm

Unnamed: 0,Artists,Pitched_value
0,ammy-virk,0.812171
1,neha-kakkar,0.745361
2,badshah,0.714456
3,diljit-dosanjh,0.626054
4,benny-dayal,0.607041
5,armaan-malik,0.607041
6,kailash-kher,0.607041
7,sukhwinder-singh,0.607041
8,sharry-mann,0.578186
9,bohemia,0.506703


In [103]:
U.shape

(1359, 20)

In [104]:
V_t.shape

(20, 2652)

In [105]:
V_t[0]

array([ 0.00175074,  0.00032693,  0.00965219, ..., -0.02326146,
        0.00115841, -0.02148058])

In [123]:
met.cosine_distances(U[0].reshape(1, -1), U[100].reshape(1, -1))

array([[0.95522384]])

In [126]:
df_2 = df.copy()
df_2.reset_index(inplace = True)

In [129]:
df_2 = pd.melt(df_2, id_vars = 'Deal_ID')

## Surprise

In [133]:
reader = Reader(rating_scale=(0, 1))

data = Dataset.load_from_df(df_2, reader)
trainset, testset = train_test_split(data, test_size=.25)

In [134]:
# Let's train a new Nonnegative SVD
model = SVD(n_factors=100, biased=False)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2798c7d7748>

In [135]:
# In reality, we should perform a train/test split and check RMSE to see if our model is trained
# but today, for simplicity, I'm skipping this step
predictions = model.test(testset)
accuracy.rmse(predictions)

RMSE: 0.0475


0.04747164602385083

## END