# Setup

In [None]:
! git clone https://github.com/anushagj/friend-up-your-cash-app-game.git
! pip install prefect==1.0 -U

In [227]:
import pandas as pd
from sklearn import preprocessing
from scipy.spatial import distance


# Read in cash friends dataset

In [296]:
cash_friends = pd.read_parquet("friend-up-your-cash-app-game/Dataset/cash_friends.parquet")

# Encode Cash Friends Categorical Features

In [297]:
categorical_cols = ["user_occupation", "most_used_cash_app_feature", "user_id"]

In [298]:
binary_cols = ["gender", "cash_card_enabled", "direct_deposit_enabled", "cash_boost_used", ]

In [299]:
# Encode the categorical columns

def encode_categorical_columns(cols, cash_friends):
    categorical_encoders = {}
    for col in cols:
        label_encoder = preprocessing.LabelEncoder()
        encoded_col = label_encoder.fit_transform(cash_friends[col].values.tolist())
        cash_friends[col + "_encoded"] = encoded_col
        categorical_encoders[col] = label_encoder
    return cash_friends, categorical_encoders

In [300]:
# Encode the binary columns

def encode_binary_columns(cols, cash_friends):
    binary_encoders = {}
    for col in cols:
        label_encoder = preprocessing.LabelBinarizer()
        encoded_col = label_encoder.fit_transform(cash_friends[col].values.tolist())
        cash_friends[col + "_encoded"] = encoded_col
        binary_encoders[col] = label_encoder
    return cash_friends, binary_encoders

In [301]:
# Encode the columns
cash_friends, categorical_encoders = encode_categorical_columns(categorical_cols, cash_friends)
cash_friends, binary_encoders = encode_binary_columns(binary_cols, cash_friends)

# Drop all original columns categorical & binary columns

In [303]:
# Drop non numerical columns for distance calculation
vector_df = cash_friends.drop(columns=['user_id', 'most_interacted_user_id', 'account_creation_date', 'gender', 'cash_card_enabled', 'direct_deposit_enabled', 'cash_boost_used', 'user_occupation', 'location',
       'most_used_cash_app_feature', 'user_id_encoded'])

# Compute Vector Distances

In [304]:
# use scipy distance functions
# manhattan : distance.cityblock
# euclidean : distance.euclidean

def manhattan_distance(row1, row2):
    return distance.cityblock(row1, row2)

def euclidean_distance(row1, row2):
    return distance.euclidean(row1, row2)


# Lets get the top 3 recommended friends for user 0

In [305]:
# Using row 0 as our target row
target_row = vector_df.iloc[0]

In [306]:
# Compute vector distances
manhatten_distances = vector_df.apply(lambda row: manhattan_distance(target_row, row), axis=1)
euclidian_distances = vector_df.apply(lambda row: euclidean_distance(target_row, row), axis=1)
vector_df["manhattan_distances"] = manhatten_distances
vector_df["euclidian_distances"] = euclidian_distances

### Rank the other users and get the top 3 recommended for each distance metric


In [307]:

euclidian_distances = vector_df["euclidian_distances"]
euc_dict = euclidian_distances.to_dict()
ordered_customers_euc =[(customer, distance) for customer, distance in euc_dict.items()]
ordered_customers_euc.sort(key=lambda elem: elem[1])
ordered_customers_euc[:3]

[(0, 0.0), (1772, 206.0826360953295), (981, 280.55483314318434)]

In [308]:

manhattan_distances = vector_df["manhattan_distances"]
man_dict = manhattan_distances.to_dict()
ordered_customers_man =[(customer, distance) for customer, distance in man_dict.items()]
ordered_customers_man.sort(key=lambda elem: elem[1])
ordered_customers_man[:3]

[(0, 0.0), (1772, 304.23), (1183, 499.1)]

In [309]:
### Compare target user to recommended users


In [310]:
target_user = cash_friends.iloc[0]

In [None]:
target_user

### recommended users

In [312]:
recommender_user_id = ordered_customers_man[1][0]

In [313]:
recommended_user = cash_friends.iloc[recommender_user_id]

In [314]:
recommended_user

user_id                                         FeKVVsuTml
account_creation_date                  2020-06-16 00:00:00
gender                                              Female
count_num_transactions_last_yr                          16
sum_amount_spent_all_time_usd                       1377.0
current_cash_account_balance_usd                     698.0
current_bitcoin_account_balance_btc                   2.04
current_stock_account_balance_usd                   1618.0
cash_card_enabled                                       No
direct_deposit_enabled                                  No
cash_boost_used                                         No
most_interacted_user_index                             529
user_occupation                               Entrepreneur
location                                        Washington
most_used_cash_app_feature                  Direct Deposit
account_age_yr                                           3
most_interacted_user_id                         aL8IUZbB

# Bonus Material

## Merge the cash friends df to get the most interacted user id's features

In [317]:
cash_friends = cash_friends.drop(columns=['account_creation_date', 'gender', 'cash_card_enabled', 'direct_deposit_enabled', 'cash_boost_used', 'user_occupation', 'location',
       'most_used_cash_app_feature'])
cash_friends.head()

Unnamed: 0,user_id,count_num_transactions_last_yr,sum_amount_spent_all_time_usd,current_cash_account_balance_usd,current_bitcoin_account_balance_btc,current_stock_account_balance_usd,most_interacted_user_index,account_age_yr,most_interacted_user_id,user_occupation_encoded,most_used_cash_app_feature_encoded,user_id_encoded,gender_encoded,cash_card_enabled_encoded,direct_deposit_enabled_encoded,cash_boost_used_encoded
0,LyuLjUo0dH,14,1383.0,714.0,2.27,1432.0,442,3,dt8BG7TNjO,6,4,1757,1,1,1,1
1,86lAOsc1Gh,15,528.0,262.0,3.52,2525.0,842,8,3WfkGmY1HF,6,3,657,1,1,0,1
2,YcI21zkiL1,16,720.0,753.0,1.65,2686.0,4698,4,TSFnHGhvcb,1,1,2774,0,0,1,0
3,10zlKlUH4r,30,1062.0,736.0,2.35,2751.0,467,4,vH6YkrHISj,3,1,73,1,0,0,1
4,dflMuC8Yz8,11,199.0,350.0,2.53,1550.0,3724,7,zTbIUEjCJJ,7,0,3200,1,0,1,0


## Create new column called least_interacted_user_id

In [318]:
from random import choice

users = cash_friends["user_id"].tolist()
# we can hypothesis that any users who is not the most interacted user or the user itself can be the least interact users, so we will use random to randomly select this.
cash_friends["least_interacted_user_id"] = cash_friends.apply(lambda x: choice(list(set(users) - set([x['user_id'], x['most_interacted_user_id']]))), axis=1)
cash_friends.head()

Unnamed: 0,user_id,count_num_transactions_last_yr,sum_amount_spent_all_time_usd,current_cash_account_balance_usd,current_bitcoin_account_balance_btc,current_stock_account_balance_usd,most_interacted_user_index,account_age_yr,most_interacted_user_id,user_occupation_encoded,most_used_cash_app_feature_encoded,user_id_encoded,gender_encoded,cash_card_enabled_encoded,direct_deposit_enabled_encoded,cash_boost_used_encoded,least_interacted_user_id
0,LyuLjUo0dH,14,1383.0,714.0,2.27,1432.0,442,3,dt8BG7TNjO,6,4,1757,1,1,1,1,6rYAuAXzTL
1,86lAOsc1Gh,15,528.0,262.0,3.52,2525.0,842,8,3WfkGmY1HF,6,3,657,1,1,0,1,CKZoOZEFGa
2,YcI21zkiL1,16,720.0,753.0,1.65,2686.0,4698,4,TSFnHGhvcb,1,1,2774,0,0,1,0,vggSgz5xYS
3,10zlKlUH4r,30,1062.0,736.0,2.35,2751.0,467,4,vH6YkrHISj,3,1,73,1,0,0,1,dtCtP86Fvv
4,dflMuC8Yz8,11,199.0,350.0,2.53,1550.0,3724,7,zTbIUEjCJJ,7,0,3200,1,0,1,0,2v7ZgBMArd


## Create Positive pairs

In [323]:
cash_friends_pos_pairs = pd.merge(cash_friends[["user_id", "user_id_encoded"]], cash_friends[["user_id", "user_id_encoded", "most_interacted_user_id"]], left_on="user_id", right_on="most_interacted_user_id", how='inner')
cash_friends_pos_pairs["connected"] = 1
cash_friends_pos_pairs= cash_friends_pos_pairs.rename(columns={"user_id_encoded_x": "user_id_2", "user_id_encoded_y": "user_id_1"})
cash_friends_pos_pairs= cash_friends_pos_pairs.drop(columns=["most_interacted_user_id", "user_id_x", "user_id_y"])
cash_friends_pos_pairs

Unnamed: 0,user_id_2,user_id_1,connected
0,1757,3221,1
1,1757,1368,1
2,657,4705,1
3,2774,1925,1
4,677,4967,1
...,...,...,...
4995,2318,3865,1
4996,2418,3019,1
4997,63,1199,1
4998,2964,1948,1


## Create Negative Pairs

In [324]:
cash_friends_neg_pairs = pd.merge(cash_friends[["user_id", "user_id_encoded"]], cash_friends[["user_id",'least_interacted_user_id', "user_id_encoded"]], left_on='user_id', right_on="least_interacted_user_id", how='inner')
cash_friends_neg_pairs["connected"] = 0
cash_friends_neg_pairs= cash_friends_neg_pairs.rename(columns={"user_id_encoded_x": "user_id_2", "user_id_encoded_y": "user_id_1"})
cash_friends_neg_pairs= cash_friends_neg_pairs.drop(columns=["least_interacted_user_id",  "user_id_x", "user_id_y"])

cash_friends_neg_pairs.head()

Unnamed: 0,user_id_2,user_id_1,connected
0,1757,896,0
1,657,1412,0
2,2774,4152,0
3,2774,1750,0
4,2774,4083,0


## Combine negativite pairs with positive pairs

In [325]:
customer_pairs = pd.concat([cash_friends_neg_pairs, cash_friends_pos_pairs])
customer_pairs

Unnamed: 0,user_id_2,user_id_1,connected
0,1757,896,0
1,657,1412,0
2,2774,4152,0
3,2774,1750,0
4,2774,4083,0
...,...,...,...
4995,2318,3865,1
4996,2418,3019,1
4997,63,1199,1
4998,2964,1948,1


## Normalize the numerical columns


In [326]:
cash_friends.columns

Index(['user_id', 'count_num_transactions_last_yr',
       'sum_amount_spent_all_time_usd', 'current_cash_account_balance_usd',
       'current_bitcoin_account_balance_btc',
       'current_stock_account_balance_usd', 'most_interacted_user_index',
       'account_age_yr', 'most_interacted_user_id', 'user_occupation_encoded',
       'most_used_cash_app_feature_encoded', 'user_id_encoded',
       'gender_encoded', 'cash_card_enabled_encoded',
       'direct_deposit_enabled_encoded', 'cash_boost_used_encoded',
       'least_interacted_user_id'],
      dtype='object')

In [327]:
numerical_cols = [
      'count_num_transactions_last_yr',
       'sum_amount_spent_all_time_usd', 'current_cash_account_balance_usd',
       'current_bitcoin_account_balance_btc',
       'current_stock_account_balance_usd',
       'account_age_yr']

In [328]:
from sklearn.preprocessing import MinMaxScaler

def normalize(col, data):
    scaler = MinMaxScaler()
    data[col] = scaler.fit_transform(data[col].values.reshape(-1, 1))
    return data

In [329]:
for num_col in numerical_cols:
    cash_friends = normalize(num_col, cash_friends)

In [330]:
cash_friends.head()

Unnamed: 0,user_id,count_num_transactions_last_yr,sum_amount_spent_all_time_usd,current_cash_account_balance_usd,current_bitcoin_account_balance_btc,current_stock_account_balance_usd,most_interacted_user_index,account_age_yr,most_interacted_user_id,user_occupation_encoded,most_used_cash_app_feature_encoded,user_id_encoded,gender_encoded,cash_card_enabled_encoded,direct_deposit_enabled_encoded,cash_boost_used_encoded,least_interacted_user_id
0,LyuLjUo0dH,0.258065,0.685672,0.647325,0.532864,0.277842,442,0.333333,dt8BG7TNjO,6,4,1757,1,1,1,1,6rYAuAXzTL
1,86lAOsc1Gh,0.290323,0.261775,0.237534,0.826291,0.489911,842,0.888889,3WfkGmY1HF,6,3,657,1,1,0,1,CKZoOZEFGa
2,YcI21zkiL1,0.322581,0.356966,0.682684,0.387324,0.521149,4698,0.444444,TSFnHGhvcb,1,1,2774,0,0,1,0,vggSgz5xYS
3,10zlKlUH4r,0.774194,0.526525,0.667271,0.551643,0.53376,467,0.444444,vH6YkrHISj,3,1,73,1,0,0,1,dtCtP86Fvv
4,dflMuC8Yz8,0.16129,0.098661,0.317316,0.593897,0.300737,3724,0.777778,zTbIUEjCJJ,7,0,3200,1,0,1,0,2v7ZgBMArd


## Lets most_interacted_user and least interact user related columns


In [331]:
cash_friends_clean = cash_friends.drop(columns=["most_interacted_user_index", "most_interacted_user_id", "least_interacted_user_id"])

In [333]:
num_customers = cash_friends["user_id"].nunique()
customer_data = cash_friends_clean


## We need to set our index to user_id_encode to easily access between the customer data and customer pairs dataframes

In [337]:
customer_data.set_index('user_id_encoded', inplace=True)
customer_pairs.set_index(['user_id_1', 'user_id_2'], inplace=True)


## Split data into train and test sets


In [338]:
train_pairs, test_pairs = train_test_split(customer_pairs, test_size=0.2, random_state=42)


In [339]:
train_pairs

Unnamed: 0_level_0,Unnamed: 1_level_0,connected
user_id_1,user_id_2,Unnamed: 2_level_1
500,4485,1
1758,1859,0
2729,3960,0
1163,4382,1
931,2244,1
...,...,...
1660,3223,1
2990,3897,1
4906,1543,1
768,4403,0


In [340]:
num_features = 12

In [None]:
# Model Architecture

In [341]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Flatten, Dense, Concatenate, Input
from tensorflow.keras.optimizers import Adam


# Define input layers for the embeddings
input_customer_1 = Input(shape=(1,))
input_customer_2 = Input(shape=(1,))
input_features_customer_1 = Input(shape=(num_features,))
input_features_customer_2 = Input(shape=(num_features,))

# Embedding layers for customer IDs
embedding_dim = 10
embedding_customer_1 = Embedding(input_dim=num_customers, output_dim=embedding_dim)(input_customer_1)
embedding_customer_2 = Embedding(input_dim=num_customers, output_dim=embedding_dim)(input_customer_2)

# Flatten the embeddings
flatten_customer_1 = Flatten()(embedding_customer_1)
flatten_customer_2 = Flatten()(embedding_customer_2)

# Fully connected layers for customer features
fc_customer_1 = Dense(32, activation='relu')(input_features_customer_1)
fc_customer_1 = Dense(16, activation='relu')(fc_customer_1)

fc_customer_2 = Dense(32, activation='relu')(input_features_customer_2)
fc_customer_2 = Dense(16, activation='relu')(fc_customer_2)

# Concatenate the embeddings and features
concatenated = Concatenate()([flatten_customer_1, flatten_customer_2, fc_customer_1, fc_customer_2])

# Final fully connected layers and output for binary classification
final_fc = Dense(32, activation='relu')(concatenated)
output = Dense(1, activation='sigmoid')(final_fc)

# Create the model
model = Model(inputs=[input_customer_1, input_customer_2, input_features_customer_1, input_features_customer_2], outputs=output)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])


In [344]:
customer_data.columns

Index(['user_id', 'count_num_transactions_last_yr',
       'sum_amount_spent_all_time_usd', 'current_cash_account_balance_usd',
       'current_bitcoin_account_balance_btc',
       'current_stock_account_balance_usd', 'account_age_yr',
       'user_occupation_encoded', 'most_used_cash_app_feature_encoded',
       'gender_encoded', 'cash_card_enabled_encoded',
       'direct_deposit_enabled_encoded', 'cash_boost_used_encoded'],
      dtype='object')

In [346]:
features = list(set(customer_data.columns)  - {'user_id'})
features

['cash_boost_used_encoded',
 'sum_amount_spent_all_time_usd',
 'current_stock_account_balance_usd',
 'most_used_cash_app_feature_encoded',
 'account_age_yr',
 'direct_deposit_enabled_encoded',
 'count_num_transactions_last_yr',
 'user_occupation_encoded',
 'gender_encoded',
 'cash_card_enabled_encoded',
 'current_bitcoin_account_balance_btc',
 'current_cash_account_balance_usd']

# Train the model

In [348]:
model.fit([
        train_pairs.index.get_level_values('user_id_1'),
        train_pairs.index.get_level_values('user_id_2'),
        customer_data.loc[train_pairs.index.get_level_values('user_id_1')][features].values,
        customer_data.loc[train_pairs.index.get_level_values('user_id_2')][features].values
    ],
    train_pairs['connected'].values,
    epochs=10,  # You can adjust the number of epochs
    batch_size=64  # You can adjust the batch size
)

# Evaluate the model on the test set
loss, accuracy = model.evaluate([
        test_pairs.index.get_level_values('user_id_1'),
        test_pairs.index.get_level_values('user_id_2'),
        customer_data.loc[test_pairs.index.get_level_values('user_id_1')][features].values,
        customer_data.loc[test_pairs.index.get_level_values('user_id_2')][features].values
    ],
    test_pairs['connected'].values
)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')





Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




Test Loss: 7.3196, Test Accuracy: 0.2725


In [349]:
# Test loss is not great, try changing the learning rate and/or num epoch, batch size and even the embedding size! This is where hyperparam tunning comes in !