# Demo Synth Data

In [1]:
# import libraries

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform

In [2]:
# names and tags

names=['id__'+str(i) for i in range(2000)]

tags=['r&b', 'rock', 'jazz', 'techno', 'pop', 'indie',
      'cinema', 'theater', 'beers', 'wine', 'party', 'trips',
      'running', 'gym', 'golf', 'basket', 'football', 'yoga']

In [8]:
# synth data

data=np.random.randint(0, 6, (len(names), len(tags)))
s_data=pd.DataFrame(data, columns=tags, index=names)
s_data['plan']=np.random.randint(0, 2, (len(names), 1))
s_data['id']=[i for i in range(len(names))]
s_data.shape

Unnamed: 0,r&b,rock,jazz,techno,pop,indie,cinema,theater,beers,wine,party,trips,running,gym,golf,basket,football,yoga,plan,id
id__0,5,4,2,2,3,3,5,5,5,2,5,5,3,0,1,1,2,0,1,0
id__1,0,0,4,2,5,2,3,2,2,2,2,4,3,5,0,4,1,3,1,1
id__2,3,4,5,1,3,1,5,2,2,3,0,3,3,5,4,0,0,4,1,2
id__3,5,0,1,3,2,1,3,4,1,5,4,5,5,2,4,2,5,3,1,3
id__4,2,3,4,3,0,3,4,1,3,1,5,3,0,3,3,4,5,5,0,4


In [4]:
# metrics for squareform

'''
'euclidean', 'minkowski', 'cityblock', 'seuclidean', 'sqeuclidean', 'cosine', 'correlation'

'hamming', 'jaccard', 'chebyshev', 'canberra', 'braycurtis', 'mahalanobis', 'yule'

'matching', 'dice', 'kulsinski', 'rogerstanimoto', 'russellrao', 'sokalmichener'

'sokalsneath', 'wminkowski'
'''



metric='euclidean'  

In [9]:
squareform(pdist(s_data.iloc[:, :-2], metric))

array([[ 0.        , 11.26942767, 11.        , ...,  8.94427191,
        10.81665383, 11.95826074],
       [11.26942767,  0.        ,  8.71779789, ..., 10.04987562,
        10.        , 10.29563014],
       [11.        ,  8.71779789,  0.        , ...,  9.11043358,
         8.83176087, 10.95445115],
       ...,
       [ 8.94427191, 10.04987562,  9.11043358, ...,  0.        ,
         7.93725393,  8.18535277],
       [10.81665383, 10.        ,  8.83176087, ...,  7.93725393,
         0.        , 10.86278049],
       [11.95826074, 10.29563014, 10.95445115, ...,  8.18535277,
        10.86278049,  0.        ]])

In [10]:
# similarity matrix

similar=pd.DataFrame(1/(1 + squareform(pdist(s_data.iloc[:, :-1], metric))), 
                         index=s_data.index, columns=s_data.index)

similar.head()

Unnamed: 0,id__0,id__1,id__2,id__3,id__4,id__5,id__6,id__7,id__8,id__9,...,id__1990,id__1991,id__1992,id__1993,id__1994,id__1995,id__1996,id__1997,id__1998,id__1999
id__0,1.0,0.081503,0.083333,0.095358,0.084959,0.08302,0.06764,0.103517,0.096331,0.095841,...,0.093953,0.085297,0.078187,0.106088,0.084626,0.093051,0.099449,0.1,0.084297,0.076923
id__1,0.081503,1.0,0.102904,0.085983,0.092176,0.091325,0.1,0.08302,0.091325,0.089301,...,0.080634,0.08302,0.087047,0.078977,0.078448,0.085638,0.079792,0.090094,0.090499,0.088152
id__2,0.083333,0.102904,1.0,0.084626,0.08741,0.084626,0.088913,0.080634,0.081503,0.096331,...,0.085638,0.076435,0.078448,0.094414,0.09261,0.114338,0.08741,0.098375,0.101131,0.083333
id__3,0.095358,0.085983,0.084626,1.0,0.08853,0.08853,0.08007,0.098908,0.10056,0.098908,...,0.085983,0.093953,0.09683,0.110348,0.079792,0.078711,0.084297,0.094882,0.093498,0.090094
id__4,0.084959,0.092176,0.08741,0.08853,1.0,0.095358,0.086333,0.090499,0.091747,0.081503,...,0.09683,0.098908,0.103517,0.091325,0.085983,0.089695,0.101711,0.120771,0.10056,0.091747


In [11]:
similar.shape

(2000, 2000)

# New User

In [12]:
# new fixed user

n_rating=np.random.randint(0, 6, (1, len(tags)))
n_user={k:v for k, v in list(zip(tags, n_rating[0]))}
n_user['id']='id_2001'
n_user['plan']=0

n_user

{'r&b': 1,
 'rock': 0,
 'jazz': 0,
 'techno': 4,
 'pop': 3,
 'indie': 0,
 'cinema': 2,
 'theater': 5,
 'beers': 4,
 'wine': 4,
 'party': 5,
 'trips': 5,
 'running': 1,
 'gym': 0,
 'golf': 3,
 'basket': 2,
 'football': 4,
 'yoga': 4,
 'id': 'id_2001',
 'plan': 0}

In [13]:
# new user introduced into system

s_data=s_data.append(n_user, ignore_index=True)
names.append(n_user['id'])
s_data.index=names

In [14]:
# similarity for new user

similar = pd.DataFrame(1/(1 + squareform(pdist(s_data.iloc[:, :-2], metric))), 
                         index=s_data.index, columns=s_data.index)


similarities = similar[n_user['id']].sort_values(ascending=False)



closer_users=[]
for e in similarities.index:
    if s_data.ix[e].plan==1:
        closer_users.append(e)
   
    
# closer users with plan
closer_users[:10]

['id__100',
 'id__1005',
 'id__1133',
 'id__1904',
 'id__1370',
 'id__932',
 'id__1978',
 'id__1426',
 'id__1825',
 'id__460']

In [16]:
s_data.ix[closer_users[:10]]

Unnamed: 0,r&b,rock,jazz,techno,pop,indie,cinema,theater,beers,wine,party,trips,running,gym,golf,basket,football,yoga,plan,id
id__100,4,0,1,3,0,0,1,4,5,2,5,4,0,0,5,1,3,3,1,100
id__1005,1,4,0,3,3,1,2,3,5,5,4,3,2,2,2,0,3,2,1,1005
id__1133,2,3,2,3,1,0,1,1,4,5,5,3,1,1,3,2,5,2,1,1133
id__1904,1,2,0,3,3,1,5,4,4,4,5,2,5,1,5,3,3,5,1,1904
id__1370,2,2,2,3,2,2,0,3,4,4,5,5,0,0,5,4,5,0,1,1370
id__932,3,1,2,1,5,1,4,5,4,3,4,4,2,4,3,0,4,5,1,932
id__1978,0,1,1,5,3,3,1,1,1,2,4,5,1,2,2,1,5,3,1,1978
id__1426,1,0,1,3,3,1,3,5,4,1,5,1,4,2,2,0,3,2,1,1426
id__1825,4,0,2,3,3,1,1,3,3,3,2,5,2,2,3,3,0,3,1,1825
id__460,3,0,1,1,2,1,4,5,4,4,2,4,4,0,4,4,1,3,1,460


# New User Input

In [17]:
# function for new users

def new_user(df, rb, rock, jazz, techno, pop, indie, cinema, theater, beers, wine,
             party, trips, running, gym, golf, basket, football, yoga, metric):
    
    
    tags=['r&b', 'rock', 'jazz', 'techno', 'pop', 'indie',
          'cinema', 'theater', 'beers', 'wine', 'party', 'trips',
          'running', 'gym', 'golf', 'basket', 'football', 'yoga']
    
    
    rating=[rb, rock, jazz, techno, pop, indie, cinema, theater, beers, wine,
             party, trips, running, gym, golf, basket, football, yoga, metric]
    
    
    
    n_user={k:v for k, v in list(zip(tags, n_rating[0]))}
    n_user['id']='id__'+str(len(s_data.id)+1)
    n_user['plan']=0
    
    
    names=list(df.index)
    df=df.append(n_user, ignore_index=True)
    names.append(n_user['id'])
    df.index=names
    
    
    
    similar = pd.DataFrame(1/(1 + squareform(pdist(df.iloc[:, :-2], metric))), 
                         index=df.index, columns=df.index)


    similarities = similar[n_user['id']].sort_values(ascending=False)



    closer_users=[]
    for e in similarities.index:
        if df.ix[e].plan==1:
            closer_users.append(e)
    
    
    
    return df.ix[closer_users[:10]]
    
    
    
    

# Input

In [None]:
print('Rate r&b from 0 to 5:')
rb = int(input())

In [None]:
print('Rate rock from 0 to 5:')
rock = int(input())

In [None]:
print('Rate jazz from 0 to 5:')
jazz = int(input())

In [None]:
print('Rate techno from 0 to 5:')
techno = int(input())

In [None]:
print('Rate pop from 0 to 5:')
pop = int(input())

In [None]:
print('Rate indie from 0 to 5:')
indie = int(input())

In [None]:
print('Rate cinema from 0 to 5:')
cinema = int(input())

In [None]:
print('Rate theater from 0 to 5:')
theater = int(input())

In [None]:
print('Rate beers from 0 to 5:')
beers = int(input())

In [None]:
print('Rate wine from 0 to 5:')
wine = int(input())

In [None]:
print('Rate party from 0 to 5:')
party = int(input())

In [None]:
print('Rate trips from 0 to 5:')
trips = int(input())

In [None]:
print('Rate running from 0 to 5:')
running = int(input())

In [None]:
print('Rate gym from 0 to 5:')
gym = int(input())

In [None]:
print('Rate golf from 0 to 5:')
golf = int(input())

In [None]:
print('Rate basket from 0 to 5:')
basket = int(input())

In [None]:
print('Rate football from 0 to 5:')
football = int(input())

In [None]:
print('Rate yoga from 0 to 5:')
yoga = int(input())

In [None]:
# metrics for squareform

'''
'euclidean', 'minkowski', 'cityblock', 'seuclidean', 'sqeuclidean', 'cosine', 'correlation'

'hamming', 'jaccard', 'chebyshev', 'canberra', 'braycurtis', 'mahalanobis', 'yule'

'matching', 'dice', 'kulsinski', 'rogerstanimoto', 'russellrao', 'sokalmichener'

'sokalsneath', 'wminkowski'
'''



metric='cosine'  

In [None]:
plans=new_user(s_data, rb, rock, jazz, techno, pop, indie, cinema, theater, beers, wine,
               party, trips, running, gym, golf, basket, football, yoga, metric)

In [None]:
display(plans)