In [1]:
#importing the libraries
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

In [3]:
#get the user reviews as csv file and keep only the required columns
u_rev = pd.read_csv('zomato_user_review.csv',index_col = 'User_id',low_memory = False)
u_rev = u_rev.drop(columns = ['Review_id','User_name','Rating','Datetime'])

#get list of user ids
u_ids = list(set(u_rev.index))

In [7]:
#get the list of restaurants from which the user has taken delivery for each user
u_res_del = {}
cntr = 0
for u in u_ids:
    n_res = 0
    u_data = pd.DataFrame(u_rev.loc[u])
    res = []
    for index,row in u_data.iterrows():
        try:
            review = str(row['Review']).lower()
            if 'deliver' in review or 'swiggy' in review or 'foodpanda' in review or 'packing' in review or 'sent' in review:
                res.append(str(row['Name']).strip())
                n_res += 1
        except:
            pass
    if n_res > 0:
        cntr += 1
        print('\t{}'.format(cntr),end ='\r')
        u_res_del[u] = list(set(res))

	16613

In [10]:
#print the user list along with their restaurant list
write_fl = ['user_id,res_list']
for u in u_res_del:
    line = ','.join(u_res_del[u])
    write_fl.append('{},"{}"'.format(u,line))

write_fl = '\n'.join(write_fl)

with open('user_del_res.csv','w',encoding = 'utf8') as fl:
    fl.write(write_fl)

In [11]:
#the list of users for which the plotting will be done
u_mod = [u for u in u_res_del.keys()]

In [12]:
#get the list of restaurants which the user has reviewed for each user
u_res_rev = {}
cntr = 0
for u in u_mod:
    u_data = pd.DataFrame(u_rev.loc[u])
    try:
        res=[x.strip() for x in u_data.Name]
    except:
        res=[]
    if len(res) > 0:
        cntr += 1
        print('\t{}'.format(cntr),end = '\r')
        u_res_rev[u] = list(set(res))

	16613

In [14]:
#print the user list along with their restaurant list
write_fl = ['user_id,res_list']
for u in u_res_rev:
    line = ','.join(u_res_rev[u])
    write_fl.append('{},"{}"'.format(u,line))

write_fl = '\n'.join(write_fl)

with open('user_res_list.csv','w',encoding='utf8') as fl:
    fl.write(write_fl)

In [2]:
#some helper functions
def centroid(arr):
    l = len(arr)
    s = np.sum(arr,axis=0)
    s /= l
    return s 

#convert list to printable string
def get_simplified_string(points):
    l = []
    for p in points:
        for i in p:
            l.append(i)
    s = ','.join([str(x) for x in l])
    return s

#get res dataset for each user for clustering
def get_res_df(res_loc,res):
    l = []
    for r in res:
        try:
            l.append(res_loc.loc[r])
        except Exception as e:
            pass
    return pd.DataFrame(l)

#DBSCAN clustering
def df_to_labels(df):
    kms_per_radian = 6371.0088
    eps = 8.0 / kms_per_radian
    db = DBSCAN(eps = eps,min_samples = 2,algorithm = 'ball_tree', metric = 'haversine')
    db.fit_predict(np.radians(df))
    return db.labels_

#calculate final centroid of each cluster
def final_ll(res_df,labels):
    l_cent = []
    cluster_points = {}
    for i in range(res_df.shape[0]):
        if labels[i] == -1:
            l_cent.append(list(res_df.iloc[i]))
        else:
            lb = labels[i]
            if lb not in cluster_points:
                cluster_points[lb] = []
            cluster_points[lb].append(list(res_df.iloc[i]))
    for lb in cluster_points:
        l_cent.append(centroid(cluster_points[lb]))
    return l_cent

#calcuate centroid from restaurant list
def res_to_centroid(res,res_loc):
    res_df = get_res_df(res_loc,res)
    labels = df_to_labels(res_df)
    cent = final_ll(res_df,labels)
    return cent

In [89]:
#load the dataset
res_loc = pd.read_csv('res_location.csv',index_col='res')
del_res = pd.read_csv('user_del_res.csv')

In [104]:
#get centroid based on restaurant delivered for each user
u_cent = []
for i in range(del_res.shape[0]):
    try:
        data = {}
        data['user_id'] = del_res.iloc[i]['user_id']
        res = del_res.iloc[i]['res_list'].split(',')
        data['centroid'] = get_simplified(res_to_centroid(res,res_loc))
        u_cent.append(data)
    except Exception as e:
        pass
cent=pd.DataFrame(u_cent)
cent.to_csv('user_del_cent.csv',index = False)

In [3]:
#load the dataset
res_loc = pd.read_csv('res_location.csv',index_col='res')
rev_res = pd.read_csv('user_res_list.csv')

In [11]:
#get centroid based on restaurant reviewed for each user
u_cent = []
for i in range(rev_res.shape[0]):
    try:
        data = {}
        data['user_id'] = rev_res.iloc[i]['user_id']
        res = rev_res.iloc[i]['res_list'].split(',')
        data['centroid'] = get_simplified_string(res_to_centroid(res,res_loc))
        u_cent.append(data)
    except Exception as e:
        pass
u_res_cent = pd.DataFrame(u_cent)
u_res_cent.to_csv('user_res_cent.csv',index = False)