# Setup 

In [2]:
import pandas as pd
from sklearn import preprocessing
from scipy.spatial import distance


# Read in cash friends dataset

In [3]:
cash_friends = pd.read_parquet("CashFriends.parquet")

# Encode Cash Friends Categorical Features

In [9]:
categorical_cols = ["user_occupation", "most_used_cash_app_feature"]

In [10]:
binary_cols = ["gender", "cash_card_enabled", "direct_deposit_enabled", "cash_boost_used", ]

In [11]:
# Encode the categorical columns

def encode_categorical_columns(cols, cash_friends):
    categorical_encoders = {}
    for col in cols:
        label_encoder = preprocessing.LabelEncoder()
        encoded_col = label_encoder.fit_transform(cash_friends[col].values.tolist())
        cash_friends[col + "_encoded"] = encoded_col
        categorical_encoders[col] = label_encoder
    return cash_friends, categorical_encoders

In [12]:
# Encode the binary columns

def encode_binary_columns(cols, cash_friends):
    binary_encoders = {}
    for col in cols:
        label_encoder = preprocessing.LabelBinarizer()
        encoded_col = label_encoder.fit_transform(cash_friends[col].values.tolist())
        cash_friends[col + "_encoded"] = encoded_col
        binary_encoders[col] = label_encoder
    return cash_friends, binary_encoders

In [13]:
# Encode the columns
cash_friends, categorical_encoders = encode_categorical_columns(categorical_cols, cash_friends)
cash_friends, binary_encoders = encode_binary_columns(binary_cols, cash_friends)

# Drop all original columns categorical & binary columns 

In [19]:
# Drop non numerical columns for distance calculation
vector_df = cash_friends.drop(columns=['user_id', 'most_interacted_user_id', 'account_creation_date', 'gender', 'cash_card_enabled', 'direct_deposit_enabled', 'cash_boost_used', 'user_occupation', 'location',
       'most_used_cash_app_feature'])

# Compute Vector Distances 

In [20]:
# use scipy distance functions
# manhattan : distance.cityblock
# euclidean : distance.euclidean

def manhattan_distance(row1, row2):
    return distance.cityblock(row1, row2)

def euclidean_distance(row1, row2):
    return distance.euclidean(row1, row2)
    

# Lets get the top 3 recommended friends for user 0 

In [21]:
# Using row 0 as our target row
target_row = vector_df.iloc[0]

In [22]:
# Compute vector distances 
manhatten_distances = vector_df.apply(lambda row: manhattan_distance(target_row, row), axis=1)
euclidian_distances = vector_df.apply(lambda row: euclidean_distance(target_row, row), axis=1)
vector_df["manhattan_distances"] = manhatten_distances
vector_df["euclidian_distances"] = euclidian_distances

### Rank the other users and get the top 3 recommended for each distance metric 


In [23]:

euclidian_distances = vector_df["euclidian_distances"]
euc_dict = euclidian_distances.to_dict()
ordered_customers_euc =[(customer, distance) for customer, distance in euc_dict.items()]
ordered_customers_euc.sort(key=lambda elem: elem[1])
ordered_customers_euc[:3]

[(0, 0.0), (3236, 73.66247959443125), (3891, 122.16894040630785)]

In [24]:

manhattan_distances = vector_df["manhattan_distances"]
man_dict = manhattan_distances.to_dict()
ordered_customers_man =[(customer, distance) for customer, distance in man_dict.items()]
ordered_customers_man.sort(key=lambda elem: elem[1])
ordered_customers_man[:3]

[(0, 0.0), (3236, 161.47), (3891, 207.5)]

In [25]:
### Compare target user to recommended users 


In [26]:
target_user = cash_friends.iloc[0]

In [27]:
target_user

user_id                                         iqxsKVYnZY
account_creation_date                  2018-05-31 00:00:00
gender                                                Male
count_num_transactions_last_yr                          13
sum_amount_spent_all_time_usd                       1243.0
current_cash_account_balance_usd                     616.0
current_bitcoin_account_balance_btc                   2.78
current_stock_account_balance_usd                   1752.0
cash_card_enabled                                       No
direct_deposit_enabled                                 Yes
cash_boost_used                                         No
most_interacted_user_index                            4689
user_occupation                                   Engineer
location                                              Utah
most_used_cash_app_feature                           Boost
user_account_age_yr                                      5
most_interacted_user_id                         AG5Z6mJM

### recommended users

In [29]:
recommender_user_id = 3236

In [30]:
recommended_user = cash_friends.iloc[recommender_user_id]

In [31]:
recommended_user

user_id                                         gGO8wyLUrx
account_creation_date                  2014-03-02 00:00:00
gender                                              Female
count_num_transactions_last_yr                          23
sum_amount_spent_all_time_usd                       1220.0
current_cash_account_balance_usd                     564.0
current_bitcoin_account_balance_btc                   1.31
current_stock_account_balance_usd                   1786.0
cash_card_enabled                                      Yes
direct_deposit_enabled                                 Yes
cash_boost_used                                         No
most_interacted_user_index                            4659
user_occupation                                 Accountant
location                                         Tennessee
most_used_cash_app_feature                       Cash Card
user_account_age_yr                                      9
most_interacted_user_id                         fz20bEBr