In [1]:
import pandas as pd
from geopy.distance import geodesic

In [2]:
# Chats: one record per top-level JSON key; the stored 'sentlocation' column is discarded.
chats_df = (
    pd.read_json('./data_generator/fake_chats.json', orient='index')
    .drop(columns='sentlocation')
)
# Profiles: the JSON keys become an explicit 'profile_id' column.
profiles_df = (
    pd.read_json('./data_generator/fake_profiles.json', orient='index')
    .rename_axis('profile_id')
    .reset_index()
)

In [3]:
def chat_calc_distance(row):
  """Return the geodesic distance, in kilometres, between the two users of a row.

  Args:
    row: a mapping (e.g. a DataFrame row) providing 'LATITUDE_user1',
      'LONGITUDE_user1', 'LATITUDE_user2' and 'LONGITUDE_user2'.

  Returns:
    float: distance in kilometres between the two coordinate pairs.
  """
  user1_coord = (row['LATITUDE_user1'], row['LONGITUDE_user1'])
  user2_coord = (row['LATITUDE_user2'], row['LONGITUDE_user2'])
  # Read the numeric value directly instead of parsing the "<value> km"
  # string representation, which silently depends on geopy's text format.
  return geodesic(user1_coord, user2_coord).kilometers

def add_user_loc(user, df):
  """Attach one user's coordinates to every row of a profile table.

  The input frame is left untouched; a new frame is returned in which the
  original 'LATITUDE'/'LONGITUDE' columns are renamed to '*_user2' and the
  coordinates of `user` are broadcast into 'LATITUDE_user1'/'LONGITUDE_user1'.

  Args:
    user: value to look up in the 'profile_id' column.
    df: DataFrame with 'profile_id', 'LATITUDE' and 'LONGITUDE' columns.

  Returns:
    pandas.DataFrame: a copy of `df` with the renamed and added columns.

  Raises:
    ValueError: if `user` does not match exactly one row of `df`.
  """
  user_rows = df.loc[df['profile_id'] == user, ['LATITUDE', 'LONGITUDE']]
  if len(user_rows) != 1:
    raise ValueError(
      f"expected exactly one profile with profile_id={user!r}, found {len(user_rows)}"
    )
  # .iloc[0] extracts the scalar explicitly; float(Series) is deprecated in pandas.
  latitude_user1 = float(user_rows['LATITUDE'].iloc[0])
  longitude_user1 = float(user_rows['LONGITUDE'].iloc[0])
  # rename() and assign() both return new frames, so the caller's df is not modified.
  renamed = df.rename({'LATITUDE':'LATITUDE_user2', 'LONGITUDE':'LONGITUDE_user2'}, axis=1)
  return renamed.assign(LATITUDE_user1=latitude_user1, LONGITUDE_user1=longitude_user1)

def n_nearest_users(user, df, n=10):
  """Return the `n` profiles geographically closest to `user`.

  Args:
    user: profile_id of the reference user.
    df: DataFrame with 'profile_id', 'LATITUDE' and 'LONGITUDE' columns.
    n: maximum number of rows to return (default 10).

  Returns:
    pandas.DataFrame: up to `n` rows for the other profiles, sorted by
    ascending 'distance' (as computed by chat_calc_distance).
  """
  # Hand over a copy so the caller's frame is never modified by this lookup.
  mod_df = add_user_loc(user, df.copy())
  mod_df['distance'] = mod_df.apply(chat_calc_distance, axis=1)
  # Exclude the reference user by id rather than by `distance > 0`, which
  # would also discard other users located at exactly the same coordinates.
  others = mod_df[mod_df['profile_id'] != user]
  return others.sort_values('distance').iloc[:n]

In [4]:
# calculate distances between chatters
def user_chat_pref(chats_df, profiles_df):
  """Score every chat by combining message count, picture flags and distance.

  Each chat is joined to the coordinates of both participants, the distance
  between them is computed and min-max normalised, and an equally weighted
  heuristic score is stored in a 'sentlocation' column.

  Args:
    chats_df: DataFrame with 'user1', 'user2', 'n_messages',
      'user1_sent_picture' and 'user2_sent_picture' columns.
    profiles_df: DataFrame with 'profile_id', 'LONGITUDE' and 'LATITUDE' columns.

  Returns:
    pandas.DataFrame: the merged chats with added 'distance', 'norm_dist'
    and 'sentlocation' columns. Both merges use the pandas default (inner)
    join, so chats whose users have no profile row are dropped.
  """
  coords = profiles_df[['profile_id','LONGITUDE', 'LATITUDE']]
  df_chat_loc = chats_df.merge(coords, left_on='user1', right_on='profile_id')
  # The suffixes disambiguate the coordinate columns of the two participants.
  df_chat_loc = df_chat_loc.merge(coords, left_on='user2', right_on='profile_id', suffixes=('_user1', '_user2'))
  df_chat_loc['distance'] = df_chat_loc.apply(chat_calc_distance, axis=1)

  dist_min = df_chat_loc['distance'].min()
  dist_span = df_chat_loc['distance'].max() - dist_min
  offset = df_chat_loc['distance'] - dist_min
  # When every distance is identical the span is 0; keep the (all-zero) offset
  # instead of dividing by zero and filling the score with NaN.
  df_chat_loc['norm_dist'] = offset / dist_span if dist_span > 0 else offset

  # TODO: create a sent_location prediction as a score via logistic regression
  # Four equally weighted components. The divisor 40 presumably is the largest
  # message count in the generated data — TODO confirm.
  df_chat_loc['sentlocation'] = (df_chat_loc['n_messages']/40)*(1/4)+df_chat_loc['user1_sent_picture']*(1/4) + df_chat_loc['user2_sent_picture']*(1/4) + df_chat_loc['norm_dist']*(1/4)
  return df_chat_loc

In [5]:
# Heuristic 'sentlocation' score for every chat that has both user profiles.
user_chat_scores = user_chat_pref(chats_df=chats_df, profiles_df=profiles_df)

In [6]:
def collab_filter(user_score_df):
  """Pivot chat scores into a user1 x user2 matrix and rescale each column.

  Args:
    user_score_df: DataFrame with 'user1', 'user2' and 'sentlocation' columns.

  Returns:
    pandas.DataFrame: rows indexed by user1, columns by user2. Missing pairs
    are filled with 0, then every column is mean-centred and divided by its
    value range (max - min).
  """
  score_matrix = user_score_df.pivot_table(
    index=['user1'], columns=['user2'], values='sentlocation'
  ).fillna(0)

  def _center_and_scale(col):
    # Mean-centre, then divide by the range of the values.
    return (col - col.mean()) / (col.max() - col.min())

  # NOTE(review): apply() runs with its default axis=0, i.e. once per user2
  # column. If scaling per user1 row was intended, axis=1 would be needed —
  # TODO confirm.
  return score_matrix.apply(_center_and_scale)

# user1 x user2 matrix of rescaled chat scores.
scaled_scores = collab_filter(user_score_df=user_chat_scores)

In [7]:
# Pairwise Pearson correlation between the columns (user2 ids) of the score matrix.
user_similarity_df = scaled_scores.corr(method='pearson')
user_similarity_df.head(n=5)

user2,1027320,1067056,1068746,1075404,1147496,1182424,1226472,1231670,1367835,1395439,...,9701116,9713057,9780740,9782966,9842816,9872457,9877452,9933142,9978327,9995851
user2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1027320,1.0,-0.014679,0.015571,0.072525,-0.000429,0.001886,-0.050683,-0.016128,-0.013182,-0.002495,...,0.164399,-0.069564,0.036899,0.083941,0.135823,0.050696,-0.015893,0.038507,-0.017915,0.065558
1067056,-0.014679,1.0,-0.05253,0.012821,0.04963,-0.012625,0.04197,-0.066749,-0.057961,-0.054432,...,-0.058484,0.020211,-0.059262,0.030693,-0.028957,-0.054755,0.024708,-0.058414,0.141305,-0.034664
1068746,0.015571,-0.05253,1.0,-0.019157,0.026965,0.022379,-0.042629,-0.006776,-0.069248,-0.060612,...,-0.065124,0.032062,-0.065991,-0.007367,0.115012,0.020911,0.135101,0.125456,0.017067,-0.007478
1075404,0.072525,0.012821,-0.019157,1.0,0.068923,-0.052756,0.033065,0.071643,0.07373,0.003618,...,0.059103,0.108753,-8.5e-05,-0.01072,-0.002532,-0.056046,-0.035643,-0.0803,0.018337,0.019219
1147496,-0.000429,0.04963,0.026965,0.068923,1.0,-0.056469,-0.049083,0.03071,-0.02172,-0.033049,...,-0.056233,-0.009534,-0.056982,-0.041111,-0.061149,-0.031482,-0.007681,0.138048,-0.046199,-0.026532


In [12]:
def get_similar_user(user, chat_sim_df=None, n=9):
    """Return the users most similar to `user` according to a similarity matrix.

    Args:
        user: column label of the reference user in `chat_sim_df`.
        chat_sim_df: square similarity DataFrame; when omitted, the
            module-level `user_similarity_df` is looked up at call time, so a
            recomputed matrix is picked up without redefining this function.
        n: number of similar users to return. The default of 9 keeps the
            result size of the earlier `[1:10]` slice; NOTE(review): a top-10
            may have been intended — confirm.

    Returns:
        pandas.DataFrame: columns 'profile_id' and 'similarity_score', sorted
        by descending similarity, without the reference user itself.
    """
    if chat_sim_df is None:
        chat_sim_df = user_similarity_df
    # Remove the user's own entry by label instead of assuming it sorts first;
    # ties at 1.0 or a NaN self-correlation would otherwise drop the wrong row.
    scores = chat_sim_df[user].drop(labels=user, errors='ignore')
    top_scores = scores.sort_values(ascending=False).iloc[:n]
    output_df = top_scores.reset_index().set_axis(['profile_id', 'similarity_score'], axis=1)
    return output_df

def distance_method(user):
    """Return similarity scores of `user` against their geographically nearest profiles.

    Args:
        user: profile_id of the reference user.

    Returns:
        pandas.DataFrame: columns 'profile_id' and 'similarity_score', one row
        per profile returned by n_nearest_users, in nearest-first order.
    """
    nearest_ids = n_nearest_users(user, profiles_df)['profile_id'].tolist()
    # NOTE(review): .loc with a list raises KeyError if any nearby profile id
    # is absent from the similarity matrix index — confirm every profile has a row.
    nearest_scores = user_similarity_df.loc[nearest_ids, user]
    return nearest_scores.to_frame().reset_index().set_axis(['profile_id', 'similarity_score'], axis=1)

# For user with profile_id = 1068746

In [13]:
# Similarity scores of the nearest profiles for profile_id 1068746.
distance_method(user=1068746)

Unnamed: 0,profile_id,similarity_score
0,6929514,0.069451
1,2719296,-0.048913
2,5630167,0.133159
3,1067056,-0.05253
4,5202699,-0.030274
5,6906907,-0.0023
6,9877452,0.135101
7,8915197,0.163082
8,9539167,0.027372
9,8981194,0.037963


In [14]:
# Most similar users for profile_id 1068746, using the current similarity matrix.
get_similar_user(user=1068746, chat_sim_df=user_similarity_df)

Unnamed: 0,profile_id,similarity_score
0,1707988,0.257505
1,7864683,0.179857
2,8915197,0.163082
3,2619888,0.145139
4,6149256,0.139395
5,6928093,0.137038
6,9877452,0.135101
7,2007652,0.134993
8,5630167,0.133159


# Demo Section

In [15]:
# Most similar users for profile_id 9701116 (default similarity matrix).
get_similar_user(user=9701116)

Unnamed: 0,profile_id,similarity_score
0,3765823,0.169535
1,1027320,0.164399
2,6149256,0.156906
3,7281202,0.155732
4,1692226,0.152024
5,1231670,0.151943
6,6140358,0.151905
7,4565372,0.147937
8,6995493,0.143218


In [16]:
# Similarity scores of the nearest profiles for profile_id 9701116.
distance_method(user=9701116)

Unnamed: 0,profile_id,similarity_score
0,4849079,-0.067443
1,3905032,-0.000877
2,2032521,-0.037164
3,9527206,-0.066845
4,3009864,-0.060692
5,5684511,0.077221
6,8139792,-0.061357
7,7184910,-0.053916
8,6784892,0.007446
9,7576597,0.121943
