In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

In [2]:
# Render every float in DataFrame/Series output with exactly three decimals.
pd.set_option('display.float_format', '{:.3f}'.format)

In [3]:
# Load the per-user package visit table (tab-separated, no header row).
# All four positional columns are named so they line up with the file, but
# only 'users', 'packagename' and 'totalvisits' are actually loaded.
user_data = pd.read_table('C:/Users/Viyatra/Desktop/recommend_itembased/packagedata/productlist.tsv',
                          header = None,
                          nrows = 20000000,  # row cap, written as an int (was the float 2e7)
                          names = ['users', 'productid', 'packagename', 'totalvisits'],
                          usecols = ['users', 'packagename', 'totalvisits'])

In [4]:
# Load the user profile table (tab-separated, no header row). The five
# positional columns are named, but only 'users' and 'country' are loaded.
user_profiles = pd.read_table(
    'C:/Users/Viyatra/Desktop/recommend_itembased/packagedata/userlist.tsv',
    names = ['users', 'gender', 'age', 'country', 'signup'],
    usecols = ['users', 'country'],
    header = None,
)

In [5]:
# A notebook cell only renders its final expression, so the first preview has
# to be shown explicitly; otherwise it is evaluated and thrown away.
display(user_data.head())
user_profiles.head()

Unnamed: 0,users,country
0,00000c289a1829a808ac09c00daf10bc3c4e223b,Germany
1,00001411dc427966b17297bf4d69e7e193135d89,Canada
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,Germany
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,Mexico
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,United States


In [6]:
# Discard rows that have no package name before aggregating on that column.
if user_data['packagename'].isnull().any():
    user_data = user_data.dropna(subset = ['packagename'])

# Sum the visits of each package across all users, giving one row per
# package with the columns 'packagename' and 'total_packagevisits'.
totalpackagevisit = (
    user_data
    .groupby('packagename')['totalvisits']
    .sum()
    .rename('total_packagevisits')
    .reset_index()
)
totalpackagevisit.head()

Unnamed: 0,packagename,total_packagevisits
0,cours de la somme,9
1,oliver shanti & friends,3
2,!!!,19814
3,!5:b>@ 3070,33
4,!action pact!,143


In [7]:
# Attach each package's overall visit total to every user/package row.
user_data_with_packagevisits = user_data.merge(totalpackagevisit, on = 'packagename', how = 'left')
# Not the cell's last expression, so it must be displayed explicitly to be seen.
display(user_data_with_packagevisits.head())

# Distribution of per-package totals, then its upper tail (90th to 99th
# percentile in 1% steps). linspace with an explicit count is used because a
# float-step arange can gain or lose an endpoint through rounding.
print(totalpackagevisit['total_packagevisits'].describe())
print(totalpackagevisit['total_packagevisits'].quantile(np.linspace(0.90, 0.99, 10)))

count     82093.000
mean       2775.286
std       21123.336
min           1.000
25%          48.000
50%         180.000
75%         724.000
max     1896944.000
Name: total_packagevisits, dtype: float64
0.900    3096.000
0.910    3619.160
0.920    4240.640
0.930    5076.560
0.940    6289.440
0.950    7929.000
0.960   10405.920
0.970   14858.000
0.980   23363.920
0.990   50182.200
Name: total_packagevisits, dtype: float64


In [8]:
# Minimum overall visit total a package needs to be kept.
popularity_threshold = 4000

# Keep only the rows whose package reaches the threshold.
user_data_popular_package = user_data_with_packagevisits[
    user_data_with_packagevisits['total_packagevisits'] >= popularity_threshold
]
user_data_popular_package.head()

Unnamed: 0,users,packagename,totalvisits,total_packagevisits
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,4242
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,189681
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,12232
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,22405
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,7043


In [9]:
# Add each user's country to the popular-package rows (left join on 'users'),
# then restrict the data to users whose country is 'United States'.
combined = pd.merge(user_data_popular_package, user_profiles, on = 'users', how = 'left')
usa_data = combined[combined['country'] == 'United States']
usa_data.head()

Unnamed: 0,users,packagename,totalvisits,total_packagevisits,country
148,00007a47085b9aab8af55f52ec8846ac479ac4fe,devendra banhart,456,149168,United States
149,00007a47085b9aab8af55f52ec8846ac479ac4fe,boards of canada,407,360138,United States
150,00007a47085b9aab8af55f52ec8846ac479ac4fe,cocorosie,386,157346,United States
151,00007a47085b9aab8af55f52ec8846ac479ac4fe,aphex twin,213,252034,United States
152,00007a47085b9aab8af55f52ec8846ac479ac4fe,animal collective,203,207603,United States


In [10]:
# A (user, package) pair must be unique before the table can be pivoted.
# When repeats exist, keep the first occurrence of each pair and report how
# many rows were dropped.
if usa_data.duplicated(subset = ['users', 'packagename']).any():
    initial_rows = len(usa_data)
    print('Initial dataframe shape {0}'.format(usa_data.shape))

    usa_data = usa_data.drop_duplicates(subset = ['users', 'packagename'])
    current_rows = len(usa_data)
    print('New dataframe shape {0}'.format(usa_data.shape))
    print('Removed {0} rows'.format(initial_rows - current_rows))

Initial dataframe shape (156348, 5)
New dataframe shape (156347, 5)
Removed 1 rows


# Implementing the nearest-neighbour model

In [11]:
from sklearn.neighbors import NearestNeighbors

# Reshape to one row per package and one column per user, holding that
# user's visit count; user/package pairs with no row become 0.
wide_package_data = (
    usa_data
    .pivot(index = 'packagename', columns = 'users', values = 'totalvisits')
    .fillna(0)
)
# Sparse copy of the same matrix, used to fit the model.
wide_package_data_sparse = csr_matrix(wide_package_data.values)

# Brute-force nearest-neighbour search over package rows using cosine distance.
model_knn = NearestNeighbors(algorithm = 'brute', metric = 'cosine')
model_knn.fit(wide_package_data_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

# Making recommendations

In [36]:
# Number of neighbouring packages to list for the query package.
n_recommendations = 5

# Pick a random package row and look up its nearest neighbours in the fitted
# model. One extra neighbour is requested because the model was fitted on the
# same rows, so the query package is expected to come back as a match for
# itself. The request is capped at the number of fitted rows.
query_index = np.random.choice(wide_package_data.shape[0])
query_vector = wide_package_data.iloc[query_index, :].values.reshape(1, -1)
n_requested = min(n_recommendations + 1, wide_package_data.shape[0])
distances, indices = model_knn.kneighbors(query_vector, n_neighbors = n_requested)

# Remove the query package by index instead of assuming it is always the first
# result: a package at the same distance could be returned ahead of it, and
# the query would then be listed as its own recommendation.
neighbours = [
    (neighbour_index, distance)
    for neighbour_index, distance in zip(indices.flatten(), distances.flatten())
    if neighbour_index != query_index
][:n_recommendations]

print('Recommendations for {0}:\n'.format(wide_package_data.index[query_index]))
for rank, (neighbour_index, distance) in enumerate(neighbours, start = 1):
    print('{0}: {1}, with distance of {2}:'.format(rank, wide_package_data.index[neighbour_index], distance))

Recommendations for the teenagers:

1: those dancing days, with distance of 0.7054465899233704:
2: esg, with distance of 0.7586023005297466:
3: scarlett johansson, with distance of 0.7971057301114011:
4: itzhak perlman, with distance of 0.8229065050437651:
5: the knife, with distance of 0.8288652059235249:
