## Import Dependencies

In [1]:
import atoti
import pandas as pd
import time

## Load Data from S3 

In [2]:
# Load credit card info data
cc_df = pd.read_csv('s3://data.atoti.io/notebooks/retail-banking/sd254_cards.csv')
cc_df

Unnamed: 0,User,CARD INDEX,Card Brand,Card Type,Card Number,Expires,CVV,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,Card on Dark Web
0,0,0,Visa,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,0,1,Visa,Debit,4956965974959986,12/2020,393,YES,2,$21968,04/2014,2014,No
2,0,2,Visa,Debit,4582313478255491,02/2024,719,YES,2,$46414,07/2003,2004,No
3,0,3,Visa,Credit,4879494103069057,08/2024,693,NO,1,$12400,01/2003,2012,No
4,0,4,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,$28,09/2008,2009,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6141,1997,1,Amex,Credit,300609782832003,01/2024,663,YES,1,$6900,11/2000,2013,No
6142,1997,2,Visa,Credit,4718517475996018,01/2021,492,YES,2,$5700,04/2012,2012,No
6143,1998,0,Mastercard,Credit,5929512204765914,08/2020,237,NO,2,$9200,02/2012,2012,No
6144,1999,0,Mastercard,Debit,5589768928167462,01/2020,630,YES,1,$28074,01/2020,2020,No


## Analyze User Credit Card Data

In [3]:
# Find the distinct credit card brands
unique_values = cc_df['Card Brand'].unique()
print(sorted(unique_values))

['Amex', 'Discover', 'Mastercard', 'Visa']


In [4]:
# Find the distinct credit card combinations, and their frequency counts overall
cc_combinations_df = cc_df.groupby(['Card Brand', 'Card Type'], as_index = False).size()
cc_combinations_df

Unnamed: 0,Card Brand,Card Type,size
0,Amex,Credit,402
1,Discover,Credit,209
2,Mastercard,Credit,635
3,Mastercard,Debit,2191
4,Mastercard,Debit (Prepaid),383
5,Visa,Credit,811
6,Visa,Debit,1320
7,Visa,Debit (Prepaid),195


In [5]:
# Find the distinct credit card combinations for each user, and their frequency counts
cc_combinations_df = cc_df.groupby(['User', 'Card Brand', 'Card Type'], as_index = False).size()
cc_combinations_df

Unnamed: 0,User,Card Brand,Card Type,size
0,0,Mastercard,Debit (Prepaid),1
1,0,Visa,Credit,1
2,0,Visa,Debit,3
3,1,Mastercard,Debit,1
4,1,Mastercard,Debit (Prepaid),2
...,...,...,...,...
4646,1997,Mastercard,Debit,1
4647,1997,Visa,Credit,1
4648,1998,Mastercard,Credit,1
4649,1999,Mastercard,Debit,1


In [6]:
# Find the max frequency count for all distinct credit card combinations at the user level
max_cc_combinations = cc_combinations_df.groupby(['Card Brand', 'Card Type'], as_index = False)['size'].max()
max_cc_combinations

Unnamed: 0,Card Brand,Card Type,size
0,Amex,Credit,4
1,Discover,Credit,2
2,Mastercard,Credit,4
3,Mastercard,Debit,6
4,Mastercard,Debit (Prepaid),3
5,Visa,Credit,4
6,Visa,Debit,5
7,Visa,Debit (Prepaid),2


In [7]:
# Load user credit card data
cc_df = pd.read_csv('s3://data.atoti.io/notebooks/retail-banking/sd254_cards.csv')
cc_df

Unnamed: 0,User,CARD INDEX,Card Brand,Card Type,Card Number,Expires,CVV,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,Card on Dark Web
0,0,0,Visa,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,0,1,Visa,Debit,4956965974959986,12/2020,393,YES,2,$21968,04/2014,2014,No
2,0,2,Visa,Debit,4582313478255491,02/2024,719,YES,2,$46414,07/2003,2004,No
3,0,3,Visa,Credit,4879494103069057,08/2024,693,NO,1,$12400,01/2003,2012,No
4,0,4,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,$28,09/2008,2009,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6141,1997,1,Amex,Credit,300609782832003,01/2024,663,YES,1,$6900,11/2000,2013,No
6142,1997,2,Visa,Credit,4718517475996018,01/2021,492,YES,2,$5700,04/2012,2012,No
6143,1998,0,Mastercard,Credit,5929512204765914,08/2020,237,NO,2,$9200,02/2012,2012,No
6144,1999,0,Mastercard,Debit,5589768928167462,01/2020,630,YES,1,$28074,01/2020,2020,No


In [8]:
# Load credit card info data
cc_info = pd.read_csv('s3://data.atoti.io/notebooks/retail-banking/cc_info.csv')
cc_info

Unnamed: 0,Retailer ID,Retailer Name,Card Brand,Card Type,Industry
0,1,Cathay Pacific Elite,Amex,Credit,Industrials
1,2,Hilton Honors,Amex,Credit,Consumer Discretionary
2,3,Delta SkyMiles Reserve,Amex,Credit,Industrials
3,4,Marriot Bonvoy Brilliant,Amex,Credit,Consumer Discretionary
4,5,Discover it Miles,Discover,Credit,Financials
5,6,Discover it Secured,Discover,Credit,Financials
6,7,Capital One VentureOne Rewards,Mastercard,Credit,Financials
7,8,Citi / AAdvantage Executive World Elite,Mastercard,Credit,Industrials
8,9,Capital One Quicksilver Cash Rewards,Mastercard,Credit,Financials
9,10,IHG One Rewards Traveler,Mastercard,Credit,Consumer Discretionary


In [9]:
# Due to lack of time, this is an extremely crude way
# of creating the entity relationship (hard-coding), sorry!
cc_dict = {'Amex Credit': [1, 2, 3, 4],
           'Discover Credit': [5, 6],
           'Mastercard Credit': [7, 8, 9, 10],
           'Mastercard Debit': [11, 12, 13, 14, 15, 16],
           'Mastercard Debit (Prepaid)': [17, 18, 19],
           'Visa Credit': [20, 21, 22, 23],
           'Visa Debit': [24, 25, 26, 27, 28],
           'Visa Debit (Prepaid)': [29, 30]
           }

In [10]:
# Add the new `Retailer ID` column with empty values
cc_df.insert(2, 'Retailer ID', "")
cc_df

Unnamed: 0,User,CARD INDEX,Retailer ID,Card Brand,Card Type,Card Number,Expires,CVV,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,Card on Dark Web
0,0,0,,Visa,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,0,1,,Visa,Debit,4956965974959986,12/2020,393,YES,2,$21968,04/2014,2014,No
2,0,2,,Visa,Debit,4582313478255491,02/2024,719,YES,2,$46414,07/2003,2004,No
3,0,3,,Visa,Credit,4879494103069057,08/2024,693,NO,1,$12400,01/2003,2012,No
4,0,4,,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,$28,09/2008,2009,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6141,1997,1,,Amex,Credit,300609782832003,01/2024,663,YES,1,$6900,11/2000,2013,No
6142,1997,2,,Visa,Credit,4718517475996018,01/2021,492,YES,2,$5700,04/2012,2012,No
6143,1998,0,,Mastercard,Credit,5929512204765914,08/2020,237,NO,2,$9200,02/2012,2012,No
6144,1999,0,,Mastercard,Debit,5589768928167462,01/2020,630,YES,1,$28074,01/2020,2020,No


In [11]:
# Get cards for each unique user and sort by Card Brand and Card Type
# Set the Retailer ID for each user credit card, while accounting for
# Users with multiple distinct credit cards of the same Card Brand and Card Type
for user in cc_df['User'].unique():
    df = cc_df.loc[cc_df['User'] == user].sort_values(by=['Card Brand', 'Card Type'])
    prev_row = None
    
    for index, row in df.iterrows():
        cc_input = row['Card Brand'] + " " + row['Card Type']
        distinct_count = cc_combinations_df.loc[
                        (cc_combinations_df['User'] == user) & 
                        (cc_combinations_df['Card Brand'] == row['Card Brand']) 
                        & (cc_combinations_df['Card Type'] == row['Card Type'])]['size'].values[0]
        
        if prev_row is None:
            print("FIRST ROW AND NEW CARD FOR USER")
            num_counter = 0
            print(f"  User {user} has a {cc_input}, and has {distinct_count} distinct cards")
            assignment = cc_dict[cc_input][num_counter]
            print(f"    Assigning to Retailer ID... {assignment}")
            cc_df.loc[index, 'Retailer ID'] = assignment
            prev_row = row
            
        else:
            
            if str(prev_row['Card Brand']) == str(row['Card Brand']) and str(prev_row['Card Type']) == str(row['Card Type']):
                print("SAME CARD AS PREVIOUS ROW")
                num_counter+=1
                print(f"  User {user} has a {cc_input}, which is same as above, and has {distinct_count} distinct cards")
                assignment = cc_dict[cc_input][num_counter]
                print(f"    Assigning to Retailer ID... {assignment}")
                cc_df.loc[index, 'Retailer ID'] = assignment
                prev_row = row
                
            else:
                print("NEW CARD FOR SAME USER")
                num_counter = 0
                print(f"  User {user} has a {cc_input}, and has {distinct_count} distinct cards")
                assignment = cc_dict[cc_input][num_counter]
                print(f"    Assigning to Retailer ID... {assignment}")
                cc_df.loc[index, 'Retailer ID'] = assignment
                prev_row = row

FIRST ROW AND NEW CARD FOR USER
  User 0 has a Mastercard Debit (Prepaid), and has 1 distinct cards
    Assigning to Retailer ID... 17
NEW CARD FOR SAME USER
  User 0 has a Visa Credit, and has 1 distinct cards
    Assigning to Retailer ID... 20
NEW CARD FOR SAME USER
  User 0 has a Visa Debit, and has 3 distinct cards
    Assigning to Retailer ID... 24
SAME CARD AS PREVIOUS ROW
  User 0 has a Visa Debit, which is same as above, and has 3 distinct cards
    Assigning to Retailer ID... 25
SAME CARD AS PREVIOUS ROW
  User 0 has a Visa Debit, which is same as above, and has 3 distinct cards
    Assigning to Retailer ID... 26
FIRST ROW AND NEW CARD FOR USER
  User 1 has a Mastercard Debit, and has 1 distinct cards
    Assigning to Retailer ID... 11
NEW CARD FOR SAME USER
  User 1 has a Mastercard Debit (Prepaid), and has 2 distinct cards
    Assigning to Retailer ID... 17
SAME CARD AS PREVIOUS ROW
  User 1 has a Mastercard Debit (Prepaid), which is same as above, and has 2 distinct cards
 

In [12]:
# Check Retailer ID values have been assigned
cc_df

Unnamed: 0,User,CARD INDEX,Retailer ID,Card Brand,Card Type,Card Number,Expires,CVV,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,Card on Dark Web
0,0,0,24,Visa,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,0,1,25,Visa,Debit,4956965974959986,12/2020,393,YES,2,$21968,04/2014,2014,No
2,0,2,26,Visa,Debit,4582313478255491,02/2024,719,YES,2,$46414,07/2003,2004,No
3,0,3,20,Visa,Credit,4879494103069057,08/2024,693,NO,1,$12400,01/2003,2012,No
4,0,4,17,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,$28,09/2008,2009,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6141,1997,1,1,Amex,Credit,300609782832003,01/2024,663,YES,1,$6900,11/2000,2013,No
6142,1997,2,20,Visa,Credit,4718517475996018,01/2021,492,YES,2,$5700,04/2012,2012,No
6143,1998,0,7,Mastercard,Credit,5929512204765914,08/2020,237,NO,2,$9200,02/2012,2012,No
6144,1999,0,11,Mastercard,Debit,5589768928167462,01/2020,630,YES,1,$28074,01/2020,2020,No


In [13]:
# Remove the `Card Brand` and `Card Type` columns
cc_df.drop(columns=['Card Brand', 'Card Type'], inplace=True)
cc_df.head()

Unnamed: 0,User,CARD INDEX,Retailer ID,Card Number,Expires,CVV,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,Card on Dark Web
0,0,0,24,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,0,1,25,4956965974959986,12/2020,393,YES,2,$21968,04/2014,2014,No
2,0,2,26,4582313478255491,02/2024,719,YES,2,$46414,07/2003,2004,No
3,0,3,20,4879494103069057,08/2024,693,NO,1,$12400,01/2003,2012,No
4,0,4,17,5722874738736011,03/2009,75,YES,1,$28,09/2008,2009,No


In [14]:
# Cast intended measures as numerical data types
cc_df['Credit Limit'] = cc_df['Credit Limit'].str.replace('$', '')
cc_df['Credit Limit'] = cc_df['Credit Limit'].astype(int)
cc_df['Credit Limit'] = cc_df['Credit Limit'] + 50000

In [15]:
# Output DataFrame to CSV file
cc_df.to_csv('processed_data/sd254_cards_processed.csv')

## Add Credit Loss Attributes to Users Data

In [16]:
# Load users info data
users_df = pd.read_csv('s3://data.atoti.io/notebooks/retail-banking/sd254_users.csv')
users_df

Unnamed: 0,Person,Current Age,Retirement Age,Birth Year,Birth Month,Gender,Address,Apartment,City,State,Zipcode,Latitude,Longitude,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards
0,Hazel Robinson,53,66,1966,11,Female,462 Rose Lane,,La Verne,CA,91750,34.15,-117.76,$29278,$59696,$127613,787,5
1,Sasha Sadr,53,68,1966,12,Female,3606 Federal Boulevard,,Little Neck,NY,11363,40.76,-73.74,$37891,$77254,$191349,701,5
2,Saanvi Lee,81,67,1938,11,Female,766 Third Drive,,West Covina,CA,91792,34.02,-117.89,$22681,$33483,$196,698,5
3,Everlee Clark,63,63,1957,1,Female,3 Madison Street,,New York,NY,10069,40.71,-73.99,$163145,$249925,$202328,722,4
4,Kyle Peterson,43,70,1976,9,Male,9620 Valley Stream Drive,,San Francisco,CA,94117,37.76,-122.44,$53797,$109687,$183855,675,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,Jose Faraday,32,70,1987,7,Male,6577 Lexington Lane,9.0,Freeport,NY,11520,40.65,-73.58,$23550,$48010,$87837,703,3
1996,Ximena Richardson,62,65,1957,11,Female,2 Elm Drive,955.0,Independence,KY,41051,38.95,-84.54,$24218,$49378,$104480,740,4
1997,Annika Russell,47,67,1973,1,Female,276 Fifth Boulevard,,Elizabeth,NJ,7201,40.66,-74.19,$15175,$30942,$71066,779,3
1998,Juelz Roman,66,60,1954,2,Male,259 Valley Boulevard,,Camp Hill,PA,17011,40.24,-76.92,$25336,$54654,$27241,618,1


In [17]:
contracts_df = pd.read_csv('contracts_2022-12-30 2.csv', low_memory=False)

In [18]:
contracts_df

Unnamed: 0,Reporting Date,EAD,PD12,PDLT,LGD,Maturity Date,Residual Maturity,Bucketed Arrears,Reporting Index,Is New Contract,Client ID,FICO,FICO Segment,LTV Segment,Macro Economic Scenario,Entity
0,12/30/22,69795.693590,0.057123,0.024117,0.607348,11/5/32,3597,0,2,False,9XFALYXL,300,300-410,81%-90%,Base,Paris
1,12/30/22,24045.796250,0.056460,0.021793,0.613929,12/5/32,3598,0,0,True,KHAAK99A,300,300-410,>30%,Base,Paris
2,12/30/22,2019.251356,0.017708,0.017708,0.607348,8/10/23,460,0,113,False,AZHHTHYX,300,300-410,>30%,Base,Paris
3,12/30/22,20563.067570,0.034836,0.154493,0.613929,8/15/31,3152,0,17,False,YZAZKX99,300,300-410,>30%,Base,Paris
4,12/30/22,2666.804003,0.017043,0.017043,0.613929,5/15/24,504,0,20,False,AZLKF9YH,300,300-410,30%-50%,Base,Paris
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336393,12/30/22,460.992072,0.041017,0.041017,0.322573,10/15/23,290,91-180,50,False,BN9JJQ,850,741-850,71%-80%,Base,NewYork
336394,12/30/22,116.777909,0.041017,0.041017,0.322573,6/28/23,181,0,47,False,NSJJQ,850,741-850,>30%,Base,NewYork
336395,12/30/22,2461.296310,0.013934,0.063931,0.497079,4/9/26,1168,0,44,False,BJGBJ9,850,741-850,81%-90%,Base,NewYork
336396,12/30/22,41522.000000,0.019270,0.038064,0.607348,1/15/25,746,0,11,False,122O9E2E,850,741-850,71%-80%,Base,NewYork


In [19]:
groupby_contracts_df = contracts_df.groupby(['FICO']).mean(['EAD'])
groupby_contracts_df.reset_index()

Unnamed: 0,FICO,EAD,PD12,PDLT,LGD,Residual Maturity,Reporting Index,Is New Contract
0,300,8980.553229,0.104477,0.155930,0.604633,1787.578864,26.446372,0.015773
1,301,9349.136091,0.112370,0.163001,0.605998,1786.359551,26.280899,0.028892
2,302,8508.016229,0.100375,0.148074,0.602229,1739.838710,27.416129,0.030645
3,303,7804.911004,0.091073,0.137547,0.598811,1727.121685,26.806552,0.031201
4,304,8688.726128,0.103132,0.154085,0.604463,1741.822476,28.037459,0.027687
...,...,...,...,...,...,...,...,...
546,846,9125.236908,0.094646,0.143602,0.600449,1820.182566,27.258224,0.023026
547,847,8643.170678,0.103138,0.149518,0.605348,1697.993681,27.949447,0.018957
548,848,7772.732491,0.097820,0.148221,0.606113,1737.099130,27.521739,0.020870
549,849,7968.876954,0.104263,0.151679,0.607293,1769.934084,26.741158,0.030547


In [20]:
df3 = pd.merge(users_df, groupby_contracts_df, left_on='FICO Score', right_on='FICO')

In [21]:
df3

Unnamed: 0,Person,Current Age,Retirement Age,Birth Year,Birth Month,Gender,Address,Apartment,City,State,...,Total Debt,FICO Score,Num Credit Cards,EAD,PD12,PDLT,LGD,Residual Maturity,Reporting Index,Is New Contract
0,Hazel Robinson,53,66,1966,11,Female,462 Rose Lane,,La Verne,CA,...,$127613,787,5,7543.452009,0.102502,0.148166,0.608904,1664.486322,28.908815,0.024316
1,Nickolas Lopez,21,67,1999,2,Male,92196 Tenth Drive,,Leesburg,VA,...,$85204,787,2,7543.452009,0.102502,0.148166,0.608904,1664.486322,28.908815,0.024316
2,Kallie Rodriguez,39,71,1980,7,Female,135 Littlewood Avenue,6.0,Oceanside,CA,...,$91549,787,1,7543.452009,0.102502,0.148166,0.608904,1664.486322,28.908815,0.024316
3,Rylan Rodriguez,33,69,1986,10,Female,928 Bayview Street,,Portage,WI,...,$0,787,3,7543.452009,0.102502,0.148166,0.608904,1664.486322,28.908815,0.024316
4,Sasha Sadr,53,68,1966,12,Female,3606 Federal Boulevard,,Little Neck,NY,...,$191349,701,5,8943.997200,0.105292,0.150311,0.602701,1747.076299,26.618506,0.038961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,Alessandro Davis,37,66,1982,12,Male,550 Forest Street,,Helena,MT,...,$71180,580,1,8195.670961,0.103159,0.150773,0.607600,1747.212670,26.556561,0.024133
1996,Darren Turner,31,63,1988,5,Male,6692 Lake Street,,Taylorsville,KY,...,$53853,514,1,8681.589080,0.123131,0.171764,0.611464,1825.388976,27.366929,0.020472
1997,August Braun,42,72,1977,8,Male,331 Oak Lane,,Antioch,CA,...,$99235,563,2,9434.481972,0.097110,0.143415,0.605630,1780.612245,26.211931,0.032967
1998,Kyng El-Mafouk,51,68,1968,10,Male,207 Ocean View Street,,Berkeley Heights,NJ,...,$242379,505,1,8936.491473,0.101380,0.149277,0.602884,1747.658249,26.885522,0.040404


In [22]:
df3.columns

Index(['Person', 'Current Age', 'Retirement Age', 'Birth Year', 'Birth Month',
       'Gender', 'Address', 'Apartment', 'City', 'State', 'Zipcode',
       'Latitude', 'Longitude', 'Per Capita Income - Zipcode',
       'Yearly Income - Person', 'Total Debt', 'FICO Score',
       'Num Credit Cards', 'EAD', 'PD12', 'PDLT', 'LGD', 'Residual Maturity',
       'Reporting Index', 'Is New Contract'],
      dtype='object')

In [23]:
df3.drop(columns=['Residual Maturity', 'Reporting Index', 'Is New Contract'], inplace=True)

In [24]:
df3.to_csv("processed_data/sd254_users_processed.csv", index=False)

## Modify Merchant Names to be MerchantN e.g. (Merchant1, Merchant2, etc.)

In [25]:
cc_sales_gzip_df = pd.read_csv("credit_card_transactions_ibm.csv.gz", compression="gzip",)
cc_sales_gzip_df

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24386895,1999,1,2020,2,27,22:23,$-54.00,Chip Transaction,-5162038175624867091,Merrimack,NH,3054.0,5541,,No
24386896,1999,1,2020,2,27,22:24,$54.00,Chip Transaction,-5162038175624867091,Merrimack,NH,3054.0,5541,,No
24386897,1999,1,2020,2,28,07:43,$59.15,Chip Transaction,2500998799892805156,Merrimack,NH,3054.0,4121,,No
24386898,1999,1,2020,2,28,20:10,$43.12,Chip Transaction,2500998799892805156,Merrimack,NH,3054.0,4121,,No


In [26]:
merchant_name_df = pd.DataFrame(cc_sales_gzip_df.groupby('Merchant Name').count().index.tolist(), columns=['Merchant Name'])

In [27]:
merchant_name_df.insert(1, 'Merchant Name (Revised)', "")

In [28]:
merchant_name_df

Unnamed: 0,Merchant Name,Merchant Name (Revised)
0,-9222899435637403521,
1,-9222692221935167526,
2,-9222439367252190791,
3,-9222264855000293132,
4,-9222232253446715869,
...,...,...
100338,9222821118491815331,
100339,9222874644865944349,
100340,9222877122873253163,
100341,9222957302638210593,


In [29]:
counter = 1

for index, row in merchant_name_df.iterrows():
    name = f"Merchant {counter}"
    merchant_name_df.loc[index, 'Merchant Name (Revised)'] = name
    counter+=1

In [30]:
merchant_name_df

Unnamed: 0,Merchant Name,Merchant Name (Revised)
0,-9222899435637403521,Merchant 1
1,-9222692221935167526,Merchant 2
2,-9222439367252190791,Merchant 3
3,-9222264855000293132,Merchant 4
4,-9222232253446715869,Merchant 5
...,...,...
100338,9222821118491815331,Merchant 100339
100339,9222874644865944349,Merchant 100340
100340,9222877122873253163,Merchant 100341
100341,9222957302638210593,Merchant 100342


In [31]:
merchant_name_merge_df = pd.merge(cc_sales_gzip_df, merchant_name_df, left_on='Merchant Name', right_on='Merchant Name')
merchant_name_merge_df

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?,Merchant Name (Revised)
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No,Merchant 69375
1,0,0,2002,9,10,06:22,$102.18,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No,Merchant 69375
2,0,0,2002,9,16,06:00,$115.34,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No,Merchant 69375
3,0,0,2002,9,18,06:19,$128.85,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No,Merchant 69375
4,0,0,2002,9,23,06:01,$134.89,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No,Merchant 69375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24386895,1999,1,2019,12,21,07:59,$42.80,Chip Transaction,-3533580464561517260,Russellville,AL,35653.0,4121,,No,Merchant 31143
24386896,1999,1,2019,12,22,08:15,$46.72,Chip Transaction,-3533580464561517260,Russellville,AL,35653.0,4121,,No,Merchant 31143
24386897,1999,1,2019,12,22,20:25,$46.30,Chip Transaction,-3533580464561517260,Russellville,AL,35653.0,4121,,No,Merchant 31143
24386898,1999,1,2019,12,23,19:48,$49.00,Chip Transaction,-3533580464561517260,Russellville,AL,35653.0,4121,,No,Merchant 31143


In [32]:
merchant_name_merge_df.pop('Merchant Name')
merchant_name_merge_df.rename(columns={"Merchant Name (Revised)": "Merchant Name"}, inplace=True)
merchant_name_merge_df

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?,Merchant Name
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,La Verne,CA,91750.0,5300,,No,Merchant 69375
1,0,0,2002,9,10,06:22,$102.18,Swipe Transaction,La Verne,CA,91750.0,5300,,No,Merchant 69375
2,0,0,2002,9,16,06:00,$115.34,Swipe Transaction,La Verne,CA,91750.0,5300,,No,Merchant 69375
3,0,0,2002,9,18,06:19,$128.85,Swipe Transaction,La Verne,CA,91750.0,5300,,No,Merchant 69375
4,0,0,2002,9,23,06:01,$134.89,Swipe Transaction,La Verne,CA,91750.0,5300,,No,Merchant 69375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24386895,1999,1,2019,12,21,07:59,$42.80,Chip Transaction,Russellville,AL,35653.0,4121,,No,Merchant 31143
24386896,1999,1,2019,12,22,08:15,$46.72,Chip Transaction,Russellville,AL,35653.0,4121,,No,Merchant 31143
24386897,1999,1,2019,12,22,20:25,$46.30,Chip Transaction,Russellville,AL,35653.0,4121,,No,Merchant 31143
24386898,1999,1,2019,12,23,19:48,$49.00,Chip Transaction,Russellville,AL,35653.0,4121,,No,Merchant 31143


In [33]:
merchant_name_merge_df_revised = merchant_name_merge_df[['User',
 'Card',
 'Year',
 'Month',
 'Day',
 'Time',
 'Amount',
 'Use Chip',
 'Merchant Name',
 'Merchant City',
 'Merchant State',
 'Zip',
 'MCC',
 'Errors?',
 'Is Fraud?']]

In [34]:
merchant_name_merge_df_revised

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,Merchant 69375,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,10,06:22,$102.18,Swipe Transaction,Merchant 69375,La Verne,CA,91750.0,5300,,No
2,0,0,2002,9,16,06:00,$115.34,Swipe Transaction,Merchant 69375,La Verne,CA,91750.0,5300,,No
3,0,0,2002,9,18,06:19,$128.85,Swipe Transaction,Merchant 69375,La Verne,CA,91750.0,5300,,No
4,0,0,2002,9,23,06:01,$134.89,Swipe Transaction,Merchant 69375,La Verne,CA,91750.0,5300,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24386895,1999,1,2019,12,21,07:59,$42.80,Chip Transaction,Merchant 31143,Russellville,AL,35653.0,4121,,No
24386896,1999,1,2019,12,22,08:15,$46.72,Chip Transaction,Merchant 31143,Russellville,AL,35653.0,4121,,No
24386897,1999,1,2019,12,22,20:25,$46.30,Chip Transaction,Merchant 31143,Russellville,AL,35653.0,4121,,No
24386898,1999,1,2019,12,23,19:48,$49.00,Chip Transaction,Merchant 31143,Russellville,AL,35653.0,4121,,No


In [35]:
# Cast intended measures as numerical data types
merchant_name_merge_df_revised['Amount'] = merchant_name_merge_df_revised['Amount'].str.replace('$', '')
merchant_name_merge_df_revised['Amount'] = merchant_name_merge_df_revised['Amount'].astype(float)
merchant_name_merge_df_revised['Amount'] = merchant_name_merge_df_revised['Amount'] * .20

In [36]:
merchant_name_merge_df_5MM = merchant_name_merge_df_revised[19386900:]
merchant_name_merge_df_10MM = merchant_name_merge_df_revised[14386900:]

In [37]:
merchant_name_merge_df_5MM.to_csv('processed_data/credit_card_transactions_ibm_processed_5MM.csv', index=False)
merchant_name_merge_df_10MM.to_csv('processed_data/credit_card_transactions_ibm_processed_10MM.csv', index=False)

In [38]:
!gzip processed_data/credit_card_transactions_ibm_processed_5MM.csv 

In [39]:
!gzip processed_data/credit_card_transactions_ibm_processed_10MM.csv

In [40]:
!rm -rf processed_data/credit_card_transactions_ibm_processed_5MM.csv processed_data/credit_card_transactions_ibm_processed_10MM.csv

## Analyze Amounts

In [41]:
merchant_name_merge_df_5MM

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
19386900,50,0,2019,7,4,23:37,4.432,Chip Transaction,Merchant 47076,Beaverton,OR,97007.0,5921,,No
19386901,50,1,2019,6,28,23:33,3.436,Chip Transaction,Merchant 47076,Beaverton,OR,97007.0,5921,,No
19386902,792,1,2015,11,28,16:08,1.672,Chip Transaction,Merchant 47076,Yonkers,NY,10703.0,5921,,No
19386903,1210,1,2007,3,7,22:52,4.076,Swipe Transaction,Merchant 47076,Beaverton,OR,97007.0,5921,,No
19386904,1575,0,2016,5,23,07:07,2.462,Swipe Transaction,Merchant 47076,Shreveport,LA,71107.0,5921,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24386895,1999,1,2019,12,21,07:59,8.560,Chip Transaction,Merchant 31143,Russellville,AL,35653.0,4121,,No
24386896,1999,1,2019,12,22,08:15,9.344,Chip Transaction,Merchant 31143,Russellville,AL,35653.0,4121,,No
24386897,1999,1,2019,12,22,20:25,9.260,Chip Transaction,Merchant 31143,Russellville,AL,35653.0,4121,,No
24386898,1999,1,2019,12,23,19:48,9.800,Chip Transaction,Merchant 31143,Russellville,AL,35653.0,4121,,No


In [42]:
dfupdate=merchant_name_merge_df_5MM.sample(1000000)
dfupdate.Amount *= -1
merchant_name_merge_df_5MM.update(dfupdate)

In [43]:
merchant_name_merge_df_5MM

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
19386900,50,0,2019,7,4,23:37,4.432,Chip Transaction,Merchant 47076,Beaverton,OR,97007.0,5921,,No
19386901,50,1,2019,6,28,23:33,-3.436,Chip Transaction,Merchant 47076,Beaverton,OR,97007.0,5921,,No
19386902,792,1,2015,11,28,16:08,-1.672,Chip Transaction,Merchant 47076,Yonkers,NY,10703.0,5921,,No
19386903,1210,1,2007,3,7,22:52,4.076,Swipe Transaction,Merchant 47076,Beaverton,OR,97007.0,5921,,No
19386904,1575,0,2016,5,23,07:07,2.462,Swipe Transaction,Merchant 47076,Shreveport,LA,71107.0,5921,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24386895,1999,1,2019,12,21,07:59,8.560,Chip Transaction,Merchant 31143,Russellville,AL,35653.0,4121,,No
24386896,1999,1,2019,12,22,08:15,9.344,Chip Transaction,Merchant 31143,Russellville,AL,35653.0,4121,,No
24386897,1999,1,2019,12,22,20:25,9.260,Chip Transaction,Merchant 31143,Russellville,AL,35653.0,4121,,No
24386898,1999,1,2019,12,23,19:48,-9.800,Chip Transaction,Merchant 31143,Russellville,AL,35653.0,4121,,No


In [44]:
merchant_name_merge_df_5MM.to_csv('processed_data/credit_card_transactions_ibm_processed_5MM.csv', index=False)

In [45]:
!gzip processed_data/credit_card_transactions_ibm_processed_5MM.csv 