## Import Dependencies

In [1]:
import atoti
import pandas as pd
import time

## Load Data from S3 

In [2]:
# Load credit card info data
cc_df = pd.read_csv('s3://data.atoti.io/notebooks/retail-banking/sd254_cards.csv')
cc_df

Unnamed: 0,User,CARD INDEX,Card Brand,Card Type,Card Number,Expires,CVV,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,Card on Dark Web
0,0,0,Visa,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,0,1,Visa,Debit,4956965974959986,12/2020,393,YES,2,$21968,04/2014,2014,No
2,0,2,Visa,Debit,4582313478255491,02/2024,719,YES,2,$46414,07/2003,2004,No
3,0,3,Visa,Credit,4879494103069057,08/2024,693,NO,1,$12400,01/2003,2012,No
4,0,4,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,$28,09/2008,2009,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6141,1997,1,Amex,Credit,300609782832003,01/2024,663,YES,1,$6900,11/2000,2013,No
6142,1997,2,Visa,Credit,4718517475996018,01/2021,492,YES,2,$5700,04/2012,2012,No
6143,1998,0,Mastercard,Credit,5929512204765914,08/2020,237,NO,2,$9200,02/2012,2012,No
6144,1999,0,Mastercard,Debit,5589768928167462,01/2020,630,YES,1,$28074,01/2020,2020,No


## Analyze User Credit Card Data

In [3]:
# Find the distinct credit card brands
unique_values = cc_df['Card Brand'].unique()
print(sorted(unique_values))

['Amex', 'Discover', 'Mastercard', 'Visa']


In [4]:
# Find the distinct credit card combinations, and their frequency counts overall
cc_combinations_df = cc_df.groupby(['Card Brand', 'Card Type'], as_index = False).size()
cc_combinations_df

Unnamed: 0,Card Brand,Card Type,size
0,Amex,Credit,402
1,Discover,Credit,209
2,Mastercard,Credit,635
3,Mastercard,Debit,2191
4,Mastercard,Debit (Prepaid),383
5,Visa,Credit,811
6,Visa,Debit,1320
7,Visa,Debit (Prepaid),195


In [5]:
# Find the distinct credit card combinations for each user, and their frequency counts
cc_combinations_df = cc_df.groupby(['User', 'Card Brand', 'Card Type'], as_index = False).size()
cc_combinations_df

Unnamed: 0,User,Card Brand,Card Type,size
0,0,Mastercard,Debit (Prepaid),1
1,0,Visa,Credit,1
2,0,Visa,Debit,3
3,1,Mastercard,Debit,1
4,1,Mastercard,Debit (Prepaid),2
...,...,...,...,...
4646,1997,Mastercard,Debit,1
4647,1997,Visa,Credit,1
4648,1998,Mastercard,Credit,1
4649,1999,Mastercard,Debit,1


In [6]:
# Find the max frequency count for all distinct credit card combinations at the user level
max_cc_combinations = cc_combinations_df.groupby(['Card Brand', 'Card Type'], as_index = False)['size'].max()
max_cc_combinations

Unnamed: 0,Card Brand,Card Type,size
0,Amex,Credit,4
1,Discover,Credit,2
2,Mastercard,Credit,4
3,Mastercard,Debit,6
4,Mastercard,Debit (Prepaid),3
5,Visa,Credit,4
6,Visa,Debit,5
7,Visa,Debit (Prepaid),2


In [7]:
# Load credit card info data
cc_df = pd.read_csv('s3://data.atoti.io/notebooks/retail-banking/sd254_cards.csv')
cc_df

Unnamed: 0,User,CARD INDEX,Card Brand,Card Type,Card Number,Expires,CVV,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,Card on Dark Web
0,0,0,Visa,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,0,1,Visa,Debit,4956965974959986,12/2020,393,YES,2,$21968,04/2014,2014,No
2,0,2,Visa,Debit,4582313478255491,02/2024,719,YES,2,$46414,07/2003,2004,No
3,0,3,Visa,Credit,4879494103069057,08/2024,693,NO,1,$12400,01/2003,2012,No
4,0,4,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,$28,09/2008,2009,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6141,1997,1,Amex,Credit,300609782832003,01/2024,663,YES,1,$6900,11/2000,2013,No
6142,1997,2,Visa,Credit,4718517475996018,01/2021,492,YES,2,$5700,04/2012,2012,No
6143,1998,0,Mastercard,Credit,5929512204765914,08/2020,237,NO,2,$9200,02/2012,2012,No
6144,1999,0,Mastercard,Debit,5589768928167462,01/2020,630,YES,1,$28074,01/2020,2020,No


In [8]:
# Load credit card info data
cc_info = pd.read_csv('s3://data.atoti.io/notebooks/retail-banking/cc_info.csv')
cc_info

Unnamed: 0,Retailer ID,Retailer Name,Card Brand,Card Type,Industry
0,1,Cathay Pacific Elite,Amex,Credit,Industrials
1,2,Hilton Honors,Amex,Credit,Consumer Discretionary
2,3,Delta SkyMiles Reserve,Amex,Credit,Industrials
3,4,Marriot Bonvoy Brilliant,Amex,Credit,Consumer Discretionary
4,5,Discover it Miles,Discover,Credit,Financials
5,6,Discover it Secured,Discover,Credit,Financials
6,7,Capital One VentureOne Rewards,Mastercard,Credit,Financials
7,8,Citi / AAdvantage Executive World Elite,Mastercard,Credit,Industrials
8,9,Capital One Quicksilver Cash Rewards,Mastercard,Credit,Financials
9,10,IHG One Rewards Traveler,Mastercard,Credit,Consumer Discretionary


In [9]:
# Due to lack of time, this is an extremely crude way
# of creating the entity relationship (hard-coding), sorry!
match_list = []

for index, row in max_cc_combinations.iterrows():
    match_list.append(row['Card Brand'] + " " + row['Card Type'])

match_list

cc_dict = {'Amex Credit': [1, 2, 3, 4],
           'Discover Credit': [5, 6],
           'Mastercard Credit': [7, 8, 9, 10],
           'Mastercard Debit': [11, 12, 13, 14, 15, 16],
           'Mastercard Debit (Prepaid)': [17, 18, 19],
           'Visa Credit': [20, 21, 22, 23],
           'Visa Debit': [24, 25, 26, 27, 28],
           'Visa Debit (Prepaid)': [29, 30]
           }

In [10]:
# Add the new `Retailer ID` column with empty values
cc_df.insert(2, 'Retailer ID', "")
cc_df

Unnamed: 0,User,CARD INDEX,Retailer ID,Card Brand,Card Type,Card Number,Expires,CVV,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,Card on Dark Web
0,0,0,,Visa,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,0,1,,Visa,Debit,4956965974959986,12/2020,393,YES,2,$21968,04/2014,2014,No
2,0,2,,Visa,Debit,4582313478255491,02/2024,719,YES,2,$46414,07/2003,2004,No
3,0,3,,Visa,Credit,4879494103069057,08/2024,693,NO,1,$12400,01/2003,2012,No
4,0,4,,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,$28,09/2008,2009,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6141,1997,1,,Amex,Credit,300609782832003,01/2024,663,YES,1,$6900,11/2000,2013,No
6142,1997,2,,Visa,Credit,4718517475996018,01/2021,492,YES,2,$5700,04/2012,2012,No
6143,1998,0,,Mastercard,Credit,5929512204765914,08/2020,237,NO,2,$9200,02/2012,2012,No
6144,1999,0,,Mastercard,Debit,5589768928167462,01/2020,630,YES,1,$28074,01/2020,2020,No


In [13]:
for user in cc_df['User'].unique():
    df = cc_df.loc[cc_df['User'] == user].sort_values(by=['Card Brand', 'Card Type'])
    prev_row = None
    
    for index, row in df.iterrows():
        cc_input = row['Card Brand'] + " " + row['Card Type']
        distinct_count = cc_combinations_df.loc[
                        (cc_combinations_df['User'] == user) & 
                        (cc_combinations_df['Card Brand'] == row['Card Brand']) 
                        & (cc_combinations_df['Card Type'] == row['Card Type'])]['size'].values[0]
        
        if prev_row is None:
            print("FIRST ROW AND NEW CARD FOR USER")
            num_counter = 0
            print(f"  User {user} has a {cc_input}, and has {distinct_count} distinct cards")
            assignment = cc_dict[cc_input][num_counter]
            print(f"    Assigning to Retailer ID... {assignment}")
            cc_df.loc[index, 'Retailer ID'] = assignment
            prev_row = row
            
        else:
            
            if str(prev_row['Card Brand']) == str(row['Card Brand']) and str(prev_row['Card Type']) == str(row['Card Type']):
                print("SAME CARD AS PREVIOUS ROW")
                num_counter+=1
                print(f"  User {user} has a {cc_input}, which is same as above, and has {distinct_count} distinct cards")
                assignment = cc_dict[cc_input][num_counter]
                print(f"    Assigning to Retailer ID... {assignment}")
                cc_df.loc[index, 'Retailer ID'] = assignment
                prev_row = row
                
            else:
                print("NEW CARD FOR SAME USER")
                num_counter = 0
                print(f"  User {user} has a {cc_input}, and has {distinct_count} distinct cards")
                assignment = cc_dict[cc_input][num_counter]
                print(f"    Assigning to Retailer ID... {assignment}")
                cc_df.loc[index, 'Retailer ID'] = assignment
                prev_row = row

FIRST ROW AND NEW CARD FOR USER
  User 0 has a Mastercard Debit (Prepaid), and has 1 distinct cards
    Assigning to Retailer ID... 17
NEW CARD FOR SAME USER
  User 0 has a Visa Credit, and has 1 distinct cards
    Assigning to Retailer ID... 20
NEW CARD FOR SAME USER
  User 0 has a Visa Debit, and has 3 distinct cards
    Assigning to Retailer ID... 24
SAME CARD AS PREVIOUS ROW
  User 0 has a Visa Debit, which is same as above, and has 3 distinct cards
    Assigning to Retailer ID... 25
SAME CARD AS PREVIOUS ROW
  User 0 has a Visa Debit, which is same as above, and has 3 distinct cards
    Assigning to Retailer ID... 26
FIRST ROW AND NEW CARD FOR USER
  User 1 has a Mastercard Debit, and has 1 distinct cards
    Assigning to Retailer ID... 11
NEW CARD FOR SAME USER
  User 1 has a Mastercard Debit (Prepaid), and has 2 distinct cards
    Assigning to Retailer ID... 17
SAME CARD AS PREVIOUS ROW
  User 1 has a Mastercard Debit (Prepaid), which is same as above, and has 2 distinct cards
 

In [14]:
# Check Retailer ID values have been assigned
cc_df

Unnamed: 0,User,CARD INDEX,Retailer ID,Card Brand,Card Type,Card Number,Expires,CVV,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,Card on Dark Web
0,0,0,24,Visa,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,0,1,25,Visa,Debit,4956965974959986,12/2020,393,YES,2,$21968,04/2014,2014,No
2,0,2,26,Visa,Debit,4582313478255491,02/2024,719,YES,2,$46414,07/2003,2004,No
3,0,3,20,Visa,Credit,4879494103069057,08/2024,693,NO,1,$12400,01/2003,2012,No
4,0,4,17,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,$28,09/2008,2009,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6141,1997,1,1,Amex,Credit,300609782832003,01/2024,663,YES,1,$6900,11/2000,2013,No
6142,1997,2,20,Visa,Credit,4718517475996018,01/2021,492,YES,2,$5700,04/2012,2012,No
6143,1998,0,7,Mastercard,Credit,5929512204765914,08/2020,237,NO,2,$9200,02/2012,2012,No
6144,1999,0,11,Mastercard,Debit,5589768928167462,01/2020,630,YES,1,$28074,01/2020,2020,No


In [16]:
# Remove the `Card Brand` and `Card Type` columns
cc_df.drop(columns=['Card Brand', 'Card Type'])

Unnamed: 0,User,CARD INDEX,Retailer ID,Card Number,Expires,CVV,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,Card on Dark Web
0,0,0,24,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,0,1,25,4956965974959986,12/2020,393,YES,2,$21968,04/2014,2014,No
2,0,2,26,4582313478255491,02/2024,719,YES,2,$46414,07/2003,2004,No
3,0,3,20,4879494103069057,08/2024,693,NO,1,$12400,01/2003,2012,No
4,0,4,17,5722874738736011,03/2009,75,YES,1,$28,09/2008,2009,No
...,...,...,...,...,...,...,...,...,...,...,...,...
6141,1997,1,1,300609782832003,01/2024,663,YES,1,$6900,11/2000,2013,No
6142,1997,2,20,4718517475996018,01/2021,492,YES,2,$5700,04/2012,2012,No
6143,1998,0,7,5929512204765914,08/2020,237,NO,2,$9200,02/2012,2012,No
6144,1999,0,11,5589768928167462,01/2020,630,YES,1,$28074,01/2020,2020,No


In [17]:
cc_df.to_csv('sd254_cards_processed.csv')