# Part 1: GCP and Prefect Setup

# Setup git cloning


In [None]:
! git clone https://github.com/anushagj/friend-up-your-cash-app-game.git
! pip install prefect==1.0 -U
! pip install parquet

# Create a Free-Tier Prefect Account


1.   In a new tab, go to https://cloud.prefect.io/
2.   Click the **Sign in with Google** option and sign in with your Google account (if you do not have one yet, see "Create a Free Google Account" further down).
3.   Click Next, then click TO THE DASHBOARD

# **Let's quickly explore the data**

In [None]:
import pandas as pd
# Location of the Parquet dataset inside the repository cloned in the setup cell
parquet_file_path = '/content/friend-up-your-cash-app-game/Dataset/cash_friends.parquet'
# Load the file into a DataFrame; `df` and `parquet_file_path` are reused by later cells
df = pd.read_parquet(parquet_file_path)
# Last expression of the cell, so the notebook displays the first rows
df.head()

# Create a Free Google Account
Create a Google account [here](https://accounts.google.com/signup/v2/createaccount?continue=https%3A%2F%2Fmyaccount.google.com%3Futm_source%3Daccount-marketing-page%26utm_medium%3Dcreate-account-button&flowName=GlifWebSignIn&flowEntry=SignUp) (if you don’t already have one).

# Create a Free Google Cloud Platform Account

In a new tab, go to https://console.cloud.google.com/. Then, in the top left, click **Select a project > New Project**.


# Create Prefect Task

In [None]:
import prefect
from prefect import task, Flow
@task
def hello_task():
    """Log a greeting through the logger stored in the Prefect context."""
    run_logger = prefect.context.get("logger")
    run_logger.info("Hello world!")


# Build a flow that contains only hello_task, then run it.
flow = Flow("hello-flow", tasks=[hello_task])
flow.run()


# Setup Prefect Cloud

In [None]:
! prefect agent local start

In [None]:
! prefect auth login --key <Your KEY>

# Connecting Local Prefect to Prefect Cloud


Create an API key at https://cloud.prefect.io/user/keys and **save the key** — it is the value to pass to `prefect auth login --key`.

In [None]:
! prefect create project cash_find_friends

In [None]:
# Register the flow under the "cash_find_friends" project
flow.register(project_name="cash_find_friends")

Next, follow the link that was generated and select **Quick Run**; the flow run will then show up in Prefect Cloud!



---



In [None]:
! prefect agent local start



---



# **Create a Table in BigQuery using Prefect**

In [None]:
import parquet
import os
from google.cloud import bigquery
from prefect import task, Flow, Parameter
import pandas as pd


# TO BE UPDATED BY YOU: identifiers of your own BigQuery project, dataset and table
PROJECT_ID = "ghc23-394604"
DATASET_NAME = "Friends"
TABLE_NAME = "cash_friends"

# TO BE UPDATED BY YOU: replace the placeholder text with the path to your
# Google credentials JSON file. The value must stay a quoted string.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "<Your JSON FILE>"

# Function to create a new table in BigQuery
def create_table(project_id, dataset_name, table_name):
    """Create the cash-friends table in BigQuery unless it already exists.

    Args:
        project_id: ID of the Google Cloud project used for the client and
            for the table reference.
        dataset_name: Name of the dataset that will hold the table.
        table_name: Name of the table to create.

    The created (or already existing) table's fully qualified name is printed.
    """
    client = bigquery.Client(project=project_id)

    # Define the schema for your table (change the fields accordingly)
    schema = [
        bigquery.SchemaField("user_id", "STRING"),
        bigquery.SchemaField("account_creation_date", "STRING"),
        bigquery.SchemaField("gender", "STRING"),
        bigquery.SchemaField("count_num_transactions_last_yr", "INTEGER"),
        bigquery.SchemaField("sum_amount_spent_all_time_usd", "FLOAT"),
        bigquery.SchemaField("current_cash_account_balance_usd", "FLOAT"),
        bigquery.SchemaField("current_bitcoin_account_balance_btc", "FLOAT"),
        bigquery.SchemaField("current_stock_account_balance_usd", "FLOAT"),
        bigquery.SchemaField("cash_card_enabled", "STRING"),
        bigquery.SchemaField("direct_deposit_enabled", "STRING"),
        bigquery.SchemaField("cash_boost_used", "STRING"),
        bigquery.SchemaField("most_interacted_user_index", "INTEGER"),
        bigquery.SchemaField("user_occupation", "STRING"),
        bigquery.SchemaField("location", "STRING"),
        bigquery.SchemaField("most_used_cash_app_feature", "STRING"),
        bigquery.SchemaField("account_age_yr", "INTEGER"),
        bigquery.SchemaField("most_interacted_user_id", "STRING"),
    ]

    # Client.dataset() is deprecated; build the table reference explicitly.
    table_ref = bigquery.DatasetReference(project_id, dataset_name).table(table_name)
    table = bigquery.Table(table_ref, schema=schema)

    # exists_ok=True lets this cell be re-run without failing when the table
    # is already there.
    table = client.create_table(table, exists_ok=True)
    print(f"Table {table.project}.{table.dataset_id}.{table.table_id} is ready.")


# Create the table (only needed if the table doesn't already exist)
create_table(PROJECT_ID, DATASET_NAME, TABLE_NAME)

# **Upload data from the parquet file into BigQuery**

In [None]:
# Function to upload Parquet data to BigQuery table
def upload_parquet_to_bigquery(parquet_file_path, project_id, dataset_name, table_name):
    """Read a Parquet file with pandas and load its rows into a BigQuery table.

    Args:
        parquet_file_path: Path of the Parquet file to read.
        project_id: ID of the Google Cloud project used for the client and
            for the table reference.
        dataset_name: Name of the dataset containing the target table.
        table_name: Name of the target table.

    Blocks until the load job finishes, then prints the number of loaded rows.
    """
    df = pd.read_parquet(parquet_file_path)

    # Render the creation date as text before loading it
    # (requires the column to hold datetime values, since it uses the .dt accessor).
    df['account_creation_date'] = df['account_creation_date'].dt.strftime('%Y-%m-%d %H:%M:%S')

    # Bind the client to the requested project so that project_id is honoured.
    client = bigquery.Client(project=project_id)

    # Define the job configuration
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.PARQUET,
        autodetect=True,  # Automatically detect schema
    )

    # Client.dataset() is deprecated; build the table reference explicitly.
    table_ref = bigquery.DatasetReference(project_id, dataset_name).table(table_name)

    # Upload the DataFrame to BigQuery and wait for the job to complete
    job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
    job.result()

    print(f"Loaded {job.output_rows} rows into {dataset_name}:{table_name}")

# Upload the Parquet data to the table
upload_parquet_to_bigquery(parquet_file_path, PROJECT_ID, DATASET_NAME, TABLE_NAME)

# Part 2: Encoding & Embeddings

## Setup

In [None]:
import pandas as pd
from sklearn import preprocessing
from scipy.spatial import distance

In [None]:
# Alias, not a copy: any column later added to cash_friends also appears on df
cash_friends = df

## Encode Cash Friends Categorical Features

In [None]:
# Columns handed to encode_categorical_columns
categorical_cols = [
    "user_occupation",
    "most_used_cash_app_feature",
    "gender",
]
# Columns handed to encode_binary_columns
binary_cols = [
    "cash_card_enabled",
    "direct_deposit_enabled",
    "cash_boost_used",
]


In [None]:
# Encode the categorical columns

# use built in encoder preprocessing.LabelEncoder()
def encode_categorical_columns(cols, cash_friends):
    """Exercise scaffold for encoding the categorical columns in ``cols``.

    As written, this returns ``cash_friends`` unchanged together with an
    empty dict; the TODO below describes the behaviour to implement.

    Args:
        cols: Names of the columns to encode.
        cash_friends: DataFrame that should receive the ``*_encoded`` columns.

    Returns:
        A ``(cash_friends, categorical_encoders)`` tuple, where
        ``categorical_encoders`` is meant to map each column to its encoder.
    """
    categorical_encoders = {}
    # TODO: encode each categorical column and save each encoder for
    # each column
    # create a new column, using the current column name + suffix '_encoded'
    # return the mapping between the columns to categorical encoders
    return cash_friends, categorical_encoders

In [None]:
# Encode the binary columns

# use built in encoder preprocessing.LabelBinarizer()
def encode_binary_columns(cols, cash_friends):
    """Exercise scaffold for encoding the binary columns in ``cols``.

    Until the TODO below is completed, this returns ``cash_friends``
    unchanged together with an empty mapping.

    Args:
        cols: Names of the columns to encode.
        cash_friends: DataFrame that should receive the ``*_encoded`` columns.

    Returns:
        A ``(cash_friends, binary_encoders)`` tuple, where ``binary_encoders``
        is a dict meant to map each column to its encoder.
    """
    # A dict (not a list): the result is documented as a column -> encoder
    # mapping, matching encode_categorical_columns.
    binary_encoders = {}
    # TODO: encode each binary column and save each encoder for
    # each column
    # create a new column, using the current column name + suffix '_encoded'
    # return the mapping between the columns to binary encoders
    return cash_friends, binary_encoders

In [None]:
# Encode the columns; each helper returns the DataFrame together with its encoders
cash_friends, categorical_encoders = encode_categorical_columns(categorical_cols, cash_friends)
cash_friends, binary_encoders = encode_binary_columns(binary_cols, cash_friends)

## Drop all original categorical & binary columns

In [None]:
# Non-numerical columns that are left out of the distance calculation
non_numeric_cols = [
    'user_id',
    'most_interacted_user_id',
    'account_creation_date',
    'gender',
    'cash_card_enabled',
    'direct_deposit_enabled',
    'cash_boost_used',
    'user_occupation',
    'location',
    'most_used_cash_app_feature',
]
# drop() returns a new DataFrame; cash_friends keeps all of its columns
vector_df = cash_friends.drop(columns=non_numeric_cols)

## Compute Vector Distances

In [None]:
# use scipy distance functions
# manhattan : distance.cityblock
# euclidean : distance.euclidean


def manhattan_distance(vector_1, vector_2):
    """Exercise scaffold: return the Manhattan distance between two vectors.

    Args:
        vector_1: First vector.
        vector_2: Second vector.

    Raises:
        NotImplementedError: Always, until the TODO below is completed.
    """
    # TODO: COMPLETE FUNCTION TO compute distance
    # and return distance (hint: scipy's distance.cityblock)
    # Raising instead of returning the `...` placeholder stops the notebook
    # here with a clear message rather than storing Ellipsis as a distance.
    raise NotImplementedError("manhattan_distance is not implemented yet")

def euclidean_distance(vector_1, vector_2):
    """Exercise scaffold: return the Euclidean distance between two vectors.

    Args:
        vector_1: First vector.
        vector_2: Second vector.

    Raises:
        NotImplementedError: Always, until the TODO below is completed.
    """
    # TODO: COMPLETE FUNCTION TO compute distance
    # and return distance (hint: scipy's distance.euclidean)
    # Raising instead of returning the `...` placeholder stops the notebook
    # here with a clear message rather than storing Ellipsis as a distance.
    raise NotImplementedError("euclidean_distance is not implemented yet")


## Let's get the top 3 recommended friends for user 0

In [None]:
# Using row 0 as our target row (iloc is positional: the first row of vector_df)
target_row = vector_df.iloc[0]

In [None]:
# Compute vector distances from the target row to every row (axis=1 -> one call per row).
# Distance columns left over from an earlier run of this cell are excluded, so
# re-running the cell never treats a previous result as a feature.
feature_df = vector_df.drop(columns=["manhattan_distances", "euclidian_distances"], errors="ignore")
target_features = target_row[feature_df.columns]
manhattan_distances = feature_df.apply(lambda row: manhattan_distance(target_features, row), axis=1)
euclidian_distances = feature_df.apply(lambda row: euclidean_distance(target_features, row), axis=1)
# Both Series are computed before either column is stored on vector_df.
vector_df["manhattan_distances"] = manhattan_distances
vector_df["euclidian_distances"] = euclidian_distances

### Rank the other users and get the top 3 recommended for each distance metric

In [None]:
# Pair every row label with its Euclidean distance, closest first.
euclidian_distances = vector_df["euclidian_distances"]
euc_dict = euclidian_distances.to_dict()
ordered_customers_euc = sorted(euc_dict.items(), key=lambda pair: pair[1])
# Four entries are shown because the target row is part of the list too; its
# distance to itself should be 0, which leaves three recommendations.
ordered_customers_euc[:4]


In [None]:
# Pair every row label with its Manhattan distance, closest first.
manhattan_distances = vector_df["manhattan_distances"]
man_dict = manhattan_distances.to_dict()
ordered_customers_man = sorted(man_dict.items(), key=lambda pair: pair[1])
# Four entries are shown because the target row is part of the list too; its
# distance to itself should be 0, which leaves three recommendations.
ordered_customers_man[:4]

### Compare target user to recommended users

In [None]:
# Row 0 of cash_friends (positional), i.e. the target user with all columns kept
target_user = cash_friends.iloc[0]
target_user

In [None]:
# TODO: replace the `...` placeholder with the row position of a recommended
# user; the next cell passes this value to cash_friends.iloc, which is positional.
# NOTE(review): the ranked lists hold index labels — presumably equal to row
# positions (default index); confirm before copying a value from them.
recommender_user_id = ...

In [None]:
# Look up the recommended user's full record by row position for comparison
recommended_user = cash_friends.iloc[recommender_user_id]
recommended_user

# (BONUS) Part 3: Model Training