# Setup

## Import

In [2]:
import pandas as pd
import numpy as np
import datetime as dt

## Constants

In [3]:
CARDS_FILENAME = 'data/sd254_cards.csv'
USERS_FILENAME = 'data/sd254_users.csv'

In [22]:
def EncodeColumns(df, cols, target_column_name):
    target_encoding_map = {}

    for col in cols:
        encoding = df.groupby(col)[target_column_name].mean().to_dict()

        df[col + "_target_encoded"] = df[col].map(encoding)

        target_encoding_map[col] = encoding

    df.drop(columns=cols, inplace=True)

    return df

In [4]:
def TurnStringIntoDateTime(x):
    return dt.datetime.strptime(x, '%d/%m/%Y').date()

# Cards Dataset

Keep:
- Index: User? Card Index?
- Card Brand
- Card Type
- Card Number
- Expiration date
    - Expiration Month
    - Expiration Year
- CVV
- Had Chip
- Cards Issued (Look into it)
- Credit Limit
- Account Open Date
    - Account Open Month
    - Account Open Year
- Year Pin last changed
- Card on Dark Web

In [25]:
def LoadCards():
    df = pd.read_csv(CARDS_FILENAME)
    df.columns = [x.lower() for x in df.columns]
    df.columns = [x.replace(' ', '_') for x in df.columns]

    #df.acct_open_date = df.apply(lambda x: dt.datetime.strptime(str(x), '%d/%m/%Y').date())
    #print(df.acct_open_date.apply(lambda x: dt.datetime.strptime(str(x), '%m/%Y').date()))
    df['acct_open_month'] = pd.DatetimeIndex(df['acct_open_date']).month
    df['acct_open_year'] = pd.DatetimeIndex(df['acct_open_date']).year
    df = df.drop('acct_open_date', axis=1)

    df['expires_month'] = pd.DatetimeIndex(df['expires']).month
    df['expires_year'] = pd.DatetimeIndex(df['expires']).year
    df = df.drop('expires', axis=1)
    #print(df.acct_open_date.apply(lambda x: print(str(x))))

    df['has_chip'] = df['has_chip'].replace('YES', 1)
    df['has_chip'] = df['has_chip'].replace('NO', 0)

    df['card_on_dark_web'] = df['card_on_dark_web'].replace('Yes', 1)
    df['card_on_dark_web'] = df['card_on_dark_web'].replace('No', 0)

    df['credit_limit'] = df['credit_limit'].apply(lambda value: float(value[1:]))

    df = df[['user','card_index','card_brand','card_type','card_number','expires_month', 'expires_year', 
             'cvv', 'has_chip', 'cards_issued', 'credit_limit', 'acct_open_month', 'acct_open_year', 'year_pin_last_changed', 'card_on_dark_web']]

    return df

### Test

In [26]:
LoadCards().head()

Unnamed: 0,user,card_index,card_brand,card_type,card_number,expires_month,expires_year,cvv,has_chip,cards_issued,credit_limit,acct_open_month,acct_open_year,year_pin_last_changed,card_on_dark_web
0,0,0,Visa,Debit,4344676511950444,12,2022,623,1,2,24295.0,9,2002,2008,0
1,0,1,Visa,Debit,4956965974959986,12,2020,393,1,2,21968.0,4,2014,2014,0
2,0,2,Visa,Debit,4582313478255491,2,2024,719,1,2,46414.0,7,2003,2004,0
3,0,3,Visa,Credit,4879494103069057,8,2024,693,0,1,12400.0,1,2003,2012,0
4,0,4,Mastercard,Debit (Prepaid),5722874738736011,3,2009,75,1,1,28.0,9,2008,2009,0


# User Dataset

Keep:
- Age
- Current Age
- Retirement age
- Birth Year
- Birth Month
- Gender
- City
- State
- Zipcode
- Per Capita Income Zipcode
- Yearly Income Person
- Total Debt
- FICO Score
- Number of Credit Cards

Drop:
- Person
- Address
- Apartment
- Latitude
- Longitude

In [29]:
def LoadUsers():
    df = pd.read_csv(USERS_FILENAME)
    df.columns = [x.lower() for x in df.columns]
    df.columns = [x.replace(' - ', ' ') for x in df.columns]
    df.columns = [x.replace(' ', '_') for x in df.columns]
    #df.columns = [x.replace(' ', '_') for x in df.columns]

    df['per_capita_income_zipcode'] = df['per_capita_income_zipcode'].apply(lambda value: float(value[1:]))
    df['yearly_income_person'] = df['yearly_income_person'].apply(lambda value: float(value[1:]))
    df['total_debt'] = df['total_debt'].apply(lambda value: float(value[1:]))
    

    df = df.drop('person', axis=1)
    df = df.drop('address', axis=1)
    df = df.drop('apartment', axis=1)
    df = df.drop('latitude', axis=1)
    df = df.drop('longitude', axis=1)
    
    return df

In [30]:
LoadUsers().head()

Unnamed: 0,current_age,retirement_age,birth_year,birth_month,gender,city,state,zipcode,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards
0,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,127613.0,787,5
1,53,68,1966,12,Female,Little Neck,NY,11363,37891.0,77254.0,191349.0,701,5
2,81,67,1938,11,Female,West Covina,CA,91792,22681.0,33483.0,196.0,698,5
3,63,63,1957,1,Female,New York,NY,10069,163145.0,249925.0,202328.0,722,4
4,43,70,1976,9,Male,San Francisco,CA,94117,53797.0,109687.0,183855.0,675,1


# Cards + Userts

In [31]:
cards = LoadCards()
users = LoadUsers()

In [32]:
users["user"] = users.index

In [33]:
combined = pd.merge(users, cards, left_on="user", right_on="user", how="outer")
combined.head()
print(combined.to_string())

      current_age  retirement_age  birth_year  birth_month  gender                       city state  zipcode  per_capita_income_zipcode  yearly_income_person  total_debt  fico_score  num_credit_cards  user  card_index  card_brand        card_type       card_number  expires_month  expires_year  cvv  has_chip  cards_issued  credit_limit  acct_open_month  acct_open_year  year_pin_last_changed  card_on_dark_web
0              53              66        1966           11  Female                   La Verne    CA    91750                    29278.0               59696.0    127613.0         787                 5     0           0        Visa            Debit  4344676511950444             12          2022  623         1             2       24295.0                9            2002                   2008                 0
1              53              66        1966           11  Female                   La Verne    CA    91750                    29278.0               59696.0    127613.0         