In [43]:
import pandas as pd
import numpy as np

# Datasets

In [44]:
import dataframe_loading as df

## Users

In [45]:
users = df.LoadUsers()

In [46]:
users.head()

Unnamed: 0,current_age,retirement_age,birth_year,birth_month,gender,city,state,zipcode,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards
0,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,127613.0,787,5
1,53,68,1966,12,Female,Little Neck,NY,11363,37891.0,77254.0,191349.0,701,5
2,81,67,1938,11,Female,West Covina,CA,91792,22681.0,33483.0,196.0,698,5
3,63,63,1957,1,Female,New York,NY,10069,163145.0,249925.0,202328.0,722,4
4,43,70,1976,9,Male,San Francisco,CA,94117,53797.0,109687.0,183855.0,675,1


## Cards

In [47]:
cards = df.LoadCards()

In [48]:
cards.head()

Unnamed: 0,user,card_index,card_brand,card_type,card_number,expires_month,expires_year,cvv,has_chip,cards_issued,credit_limit,acct_open_month,acct_open_year,year_pin_last_changed,card_on_dark_web
0,0,0,Visa,Debit,4344676511950444,12,2022,623,1,2,24295.0,9,2002,2008,0
1,0,1,Visa,Debit,4956965974959986,12,2020,393,1,2,21968.0,4,2014,2014,0
2,0,2,Visa,Debit,4582313478255491,2,2024,719,1,2,46414.0,7,2003,2004,0
3,0,3,Visa,Credit,4879494103069057,8,2024,693,0,1,12400.0,1,2003,2012,0
4,0,4,Mastercard,Debit (Prepaid),5722874738736011,3,2009,75,1,1,28.0,9,2008,2009,0


## Transactions

In [49]:
raw_transactions = df.LoadRawTransactions()

In [50]:
raw_transactions.head()

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No


In [51]:
transactions = df.refactorTransactions(raw_transactions)

Column names made lowercase and spaces removed.
Space removed from card method
Amount parsed into float.
Time split into month, day, hour, and minute
Fraud column changed to 1 and 0
Error column fixed.


In [52]:
# Rename 'zip' to 'purchase_zip' by copying the column and dropping the original;
# the copy is appended, so 'purchase_zip' ends up as the last column.
# NOTE(review): presumably renamed to avoid confusion with the users' 'zipcode' column — confirm.
transactions['purchase_zip'] = transactions['zip']
transactions = transactions.drop('zip', axis=1)

In [53]:
transactions.head()

Unnamed: 0,user,card,year,month,day,amount,use_chip,merchant_city,merchant_state,mcc,hour,minute,is_fraud,errors,purchase_zip
0,0,0,2002,9,1,134.089996,Swipe,La Verne,CA,5300,6,21,0,,91750.0
1,0,0,2002,9,1,38.48,Swipe,Monterey Park,CA,5411,6,42,0,,91754.0
2,0,0,2002,9,2,120.339996,Swipe,Monterey Park,CA,5411,6,22,0,,91754.0
3,0,0,2002,9,2,128.949997,Swipe,Monterey Park,CA,5651,17,45,0,,91754.0
4,0,0,2002,9,3,104.709999,Swipe,La Verne,CA,5912,6,23,0,,91750.0


In [54]:
transactionErrors = transactions['errors'].value_counts()
print(transactionErrors)

errors
None                                                   23998469
Insufficient Balance                                     242783
Bad PIN                                                   58918
Technical Glitch                                          48157
Bad Card Number                                           13321
Bad CVV                                                   10740
Bad Expiration                                            10716
Bad Zipcode                                                2079
Bad PIN,Insufficient Balance                                581
Insufficient Balance,Technical Glitch                       457
Bad PIN,Technical Glitch                                    128
Bad Card Number,Insufficient Balance                        122
Bad CVV,Insufficient Balance                                 89
Bad Expiration,Insufficient Balance                          78
Bad Card Number,Bad CVV                                      60
Bad Card Number,Bad Expiration   

Started to split the errors column but stumbled upon some trouble

In [55]:
print(transactions.columns)

Index(['user', 'card', 'year', 'month', 'day', 'amount', 'use_chip',
       'merchant_city', 'merchant_state', 'mcc', 'hour', 'minute', 'is_fraud',
       'errors', 'purchase_zip'],
      dtype='object')


In [56]:
errorTypes = []

In [57]:
# Split every (possibly comma-separated) error label into its individual error
# names and keep each distinct name once, in order of first appearance.
for combo in transactionErrors.index:
    for label in combo.split(','):
        if label in errorTypes:
            continue
        errorTypes.append(label)

# 'None' marks transactions without an error, so it is not an error type.
errorTypes.remove('None')

print(errorTypes)

['Insufficient Balance', 'Bad PIN', 'Technical Glitch', 'Bad Card Number', 'Bad CVV', 'Bad Expiration', 'Bad Zipcode']


In [60]:
# Add one 0/1 indicator column per error type (e.g. 'Bad PIN' -> 'bad_pin').
# A transaction can carry several comma-separated errors, hence the substring match.
for e in errorTypes:
    errorT = e.lower().replace(' ', '_')
    print(errorT)
    # The former zero-initialisation used a misspelled name (NameError on a fresh
    # kernel) and was redundant: the column is fully assigned right here.
    # regex=False matches the label literally; na=False maps missing values to 0.
    transactions[errorT] = np.where(transactions.errors.str.contains(e, regex=False, na=False), 1, 0)

insufficient_balance
bad_pin
technical_glitch
bad_card_number
bad_cvv
bad_expiration
bad_zipcode


In [61]:
print(transactions.columns)

Index(['user', 'card', 'year', 'month', 'day', 'amount', 'use_chip',
       'merchant_city', 'merchant_state', 'mcc', 'hour', 'minute', 'is_fraud',
       'errors', 'purchase_zip', 'insufficient_balance', 'bad_pin',
       'technical_glitch', 'bad_card_number', 'bad_cvv', 'bad_expiration',
       'bad_zipcode'],
      dtype='object')


In [62]:
transactions.head()

Unnamed: 0,user,card,year,month,day,amount,use_chip,merchant_city,merchant_state,mcc,...,is_fraud,errors,purchase_zip,insufficient_balance,bad_pin,technical_glitch,bad_card_number,bad_cvv,bad_expiration,bad_zipcode
0,0,0,2002,9,1,134.089996,Swipe,La Verne,CA,5300,...,0,,91750.0,0,0,0,0,0,0,0
1,0,0,2002,9,1,38.48,Swipe,Monterey Park,CA,5411,...,0,,91754.0,0,0,0,0,0,0,0
2,0,0,2002,9,2,120.339996,Swipe,Monterey Park,CA,5411,...,0,,91754.0,0,0,0,0,0,0,0
3,0,0,2002,9,2,128.949997,Swipe,Monterey Park,CA,5651,...,0,,91754.0,0,0,0,0,0,0,0
4,0,0,2002,9,3,104.709999,Swipe,La Verne,CA,5912,...,0,,91750.0,0,0,0,0,0,0,0


### Merging Dataframes

In [63]:
users["user"] = users.index

In [64]:
users

Unnamed: 0,current_age,retirement_age,birth_year,birth_month,gender,city,state,zipcode,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards,user
0,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,127613.0,787,5,0
1,53,68,1966,12,Female,Little Neck,NY,11363,37891.0,77254.0,191349.0,701,5,1
2,81,67,1938,11,Female,West Covina,CA,91792,22681.0,33483.0,196.0,698,5,2
3,63,63,1957,1,Female,New York,NY,10069,163145.0,249925.0,202328.0,722,4,3
4,43,70,1976,9,Male,San Francisco,CA,94117,53797.0,109687.0,183855.0,675,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,32,70,1987,7,Male,Freeport,NY,11520,23550.0,48010.0,87837.0,703,3,1995
1996,62,65,1957,11,Female,Independence,KY,41051,24218.0,49378.0,104480.0,740,4,1996
1997,47,67,1973,1,Female,Elizabeth,NJ,7201,15175.0,30942.0,71066.0,779,3,1997
1998,66,60,1954,2,Male,Camp Hill,PA,17011,25336.0,54654.0,27241.0,618,1,1998


In [65]:
cards

Unnamed: 0,user,card_index,card_brand,card_type,card_number,expires_month,expires_year,cvv,has_chip,cards_issued,credit_limit,acct_open_month,acct_open_year,year_pin_last_changed,card_on_dark_web
0,0,0,Visa,Debit,4344676511950444,12,2022,623,1,2,24295.0,9,2002,2008,0
1,0,1,Visa,Debit,4956965974959986,12,2020,393,1,2,21968.0,4,2014,2014,0
2,0,2,Visa,Debit,4582313478255491,2,2024,719,1,2,46414.0,7,2003,2004,0
3,0,3,Visa,Credit,4879494103069057,8,2024,693,0,1,12400.0,1,2003,2012,0
4,0,4,Mastercard,Debit (Prepaid),5722874738736011,3,2009,75,1,1,28.0,9,2008,2009,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6141,1997,1,Amex,Credit,300609782832003,1,2024,663,1,1,6900.0,11,2000,2013,0
6142,1997,2,Visa,Credit,4718517475996018,1,2021,492,1,2,5700.0,4,2012,2012,0
6143,1998,0,Mastercard,Credit,5929512204765914,8,2020,237,0,2,9200.0,2,2012,2012,0
6144,1999,0,Mastercard,Debit,5589768928167462,1,2020,630,1,1,28074.0,1,2020,2020,0


In [66]:
combined = pd.merge(left=users, right=cards, left_on="user", right_on="user", how="inner")
combined.head()

Unnamed: 0,current_age,retirement_age,birth_year,birth_month,gender,city,state,zipcode,per_capita_income_zipcode,yearly_income_person,...,expires_month,expires_year,cvv,has_chip,cards_issued,credit_limit,acct_open_month,acct_open_year,year_pin_last_changed,card_on_dark_web
0,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,...,12,2022,623,1,2,24295.0,9,2002,2008,0
1,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,...,12,2020,393,1,2,21968.0,4,2014,2014,0
2,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,...,2,2024,719,1,2,46414.0,7,2003,2004,0
3,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,...,8,2024,693,0,1,12400.0,1,2003,2012,0
4,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,...,3,2009,75,1,1,28.0,9,2008,2009,0


Some data exploration

In [67]:
# NOTE(review): the refactorTransactions log reports "Fraud column changed to 1 and 0",
# so this replace looks like a no-op safeguard against leftover 'Yes' strings — confirm
# and remove if it is not needed.
transactions['is_fraud'] = transactions['is_fraud'].replace('Yes', 1)

In [68]:
transactions['is_fraud'].value_counts()

is_fraud
0    24357143
1       29757
Name: count, dtype: int64

#### Splitting the chip type transactions into three true/false columns

In [71]:
chipTransactionTypes = transactions['use_chip'].value_counts()
chipTransactionTypes

use_chip
Swipe     15386082
Chip       6287598
Online     2713220
Name: count, dtype: int64

Splitting the "use chip" column into three true/false

In [72]:
# Distinct purchase-method labels, in order of first appearance
# (dict keys preserve insertion order and drop duplicates).
chipTypes = list(dict.fromkeys(
    part
    for label in chipTransactionTypes.index
    for part in label.split(',')
))

print(chipTypes)

['Swipe', 'Chip', 'Online']


In [75]:
# Add one 0/1 indicator column per purchase method (e.g. 'Swipe' -> 'swipe_transaction').
for e in chipTransactionTypes.index:
    print(e)
    chipT = e.lower().replace(' ', '_') + "_transaction"
    print(chipT)
    # No zero-initialisation needed: the column is fully assigned here.
    # regex=False matches the label literally; na=False maps missing values to 0
    # (np.where would otherwise treat NaN as truthy and flag the row as 1).
    transactions[chipT] = np.where(transactions.use_chip.str.contains(e, regex=False, na=False), 1, 0)

Swipe
swipe_transaction
Chip
chip_transaction
Online
online_transaction


In [76]:
transactions = transactions.drop('use_chip', axis=1)

## Train Set
Obtained by sampling 3000 fraudulent transactions from 2015 (two draws of 1500 each) and adding about 3000 random non-fraudulent ones from the same year, sampled in proportion to purchase method.

In [77]:
fraud = transactions.query("year == 2015 and is_fraud == 1")
fraud

Unnamed: 0,user,card,year,month,day,amount,merchant_city,merchant_state,mcc,hour,...,insufficient_balance,bad_pin,technical_glitch,bad_card_number,bad_cvv,bad_expiration,bad_zipcode,swipe_transaction,chip_transaction,online_transaction
4099,0,0,2015,11,15,287.130005,ONLINE,,3001,12,...,0,0,0,0,0,0,0,0,0,1
4100,0,0,2015,11,15,2.410000,ONLINE,,5651,13,...,0,0,0,0,0,0,0,0,0,1
4101,0,0,2015,11,16,50.810001,ONLINE,,4411,09,...,0,0,0,0,0,0,0,0,0,1
4102,0,0,2015,11,16,248.360001,ONLINE,,5732,09,...,0,0,0,0,0,0,0,0,0,1
4103,0,0,2015,11,16,473.000000,ONLINE,,3640,11,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24220497,1983,3,2015,7,8,3.060000,ONLINE,,5311,15,...,0,0,0,0,0,0,0,0,0,1
24220499,1983,3,2015,7,10,58.669998,ONLINE,,3256,13,...,0,0,0,0,0,0,0,0,0,1
24220500,1983,3,2015,7,10,155.639999,Dubberly,LA,5300,14,...,0,0,0,0,0,0,0,0,1,0
24220501,1983,3,2015,7,10,19.290001,ONLINE,,5816,15,...,0,0,0,0,0,0,0,0,0,1


In [79]:
print(fraud.iloc[0])

user                             0
card                             0
year                          2015
month                           11
day                             15
amount                  287.130005
merchant_city               ONLINE
merchant_state                None
mcc                           3001
hour                            12
minute                          55
is_fraud                         1
errors                        None
purchase_zip                   0.0
insufficient_balance             0
bad_pin                          0
technical_glitch                 0
bad_card_number                  0
bad_cvv                          0
bad_expiration                   0
bad_zipcode                      0
swipe_transaction                0
chip_transaction                 0
online_transaction               1
Name: 4099, dtype: object


### Filtering the transactions so only the non-fraudulent ones happening in 2015 are kept in a dataframe

In [80]:
nonfraud = transactions.query("year == 2015 and is_fraud == 0")
nonfraud

Unnamed: 0,user,card,year,month,day,amount,merchant_city,merchant_state,mcc,hour,...,insufficient_balance,bad_pin,technical_glitch,bad_card_number,bad_cvv,bad_expiration,bad_zipcode,swipe_transaction,chip_transaction,online_transaction
3895,0,0,2015,1,2,34.990002,Monterey Park,CA,5411,06,...,0,0,0,0,0,0,0,0,1,0
3896,0,0,2015,1,2,1.670000,La Verne,CA,5499,11,...,0,0,0,0,0,0,0,0,1,0
3897,0,0,2015,1,2,208.960007,Mira Loma,CA,4814,20,...,0,0,0,0,0,0,0,0,1,0
3898,0,0,2015,1,4,136.580002,La Verne,CA,5300,06,...,0,0,0,0,0,0,0,0,1,0
3899,0,0,2015,1,4,128.639999,Monterey Park,CA,5651,16,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24379102,1998,0,2015,12,30,8.010000,Lebanon,PA,4214,14,...,0,0,0,0,0,0,0,1,0,0
24379103,1998,0,2015,12,30,21.950001,Camp Hill,PA,5814,17,...,0,0,0,0,0,0,0,1,0,0
24379104,1998,0,2015,12,30,36.709999,ONLINE,,4121,18,...,0,0,0,0,0,0,0,0,0,1
24379105,1998,0,2015,12,31,-0.110000,Carlisle,PA,5812,06,...,0,0,0,0,0,0,0,1,0,0


### Oversampling Train Set Fraud Data to Balance it

In [238]:
DATA_SIZE = 3000

In [239]:
# Re-derive the 2015 fraud rows instead of sampling from `fraud`, which this cell
# overwrites: sampling from `fraud` made a re-run draw from the already-sampled frame.
fraud_2015 = transactions.query("year == 2015 and is_fraud == 1")

# Two independent draws of DATA_SIZE/2 rows each (rows may repeat across the draws).
# Distinct fixed seeds keep the draws reproducible without making them identical.
half_size = DATA_SIZE // 2
balanced_fraud1 = fraud_2015.sample(n=half_size, random_state=0)
balanced_fraud2 = fraud_2015.sample(n=half_size, random_state=1)

fraud = pd.concat([balanced_fraud1, balanced_fraud2])

#### Merging Fraud and Non-Fraud into one Dataset

##### Finding ratio of fraud transactions

In [240]:
swipeCount = len(fraud.query("swipe_transaction == 1"))
chipCount = len(fraud.query("chip_transaction == 1"))
onlineCount = len(fraud.query("online_transaction == 1"))
totalCount = len(fraud)

print(f"{swipeCount} + {chipCount} + {onlineCount} = {totalCount}")

181 + 252 + 2567 = 3000


In [241]:
ratios = [swipeCount/totalCount, chipCount/totalCount, onlineCount/totalCount]
print(ratios)

[0.060333333333333336, 0.084, 0.8556666666666667]


In [242]:
# To preserve the ratio of purchase method I'm first getting the subset from the subset and then splitting it manually
ratios = [x * DATA_SIZE for x in ratios]
print(ratios)

[181.0, 252.00000000000003, 2567.0]


Applying the transaction ratio to the fraudulent transactions

In [243]:
# Draw the fraud rows per purchase method, sized by the scaled ratios.
# round() instead of int(): the targets are whole counts that carry float noise
# (e.g. 252.00000000000003), and truncation would silently drop a row whenever
# the noise falls just below the integer.
swipe_n, chip_n, online_n = (int(round(r)) for r in ratios)
fraud_swipe_transactions = fraud.query("swipe_transaction == 1").sample(n=swipe_n)
fraud_chip_transactions = fraud.query("chip_transaction == 1").sample(n=chip_n)
fraud_online_transactions = fraud.query("online_transaction == 1").sample(n=online_n)
print(f"{len(fraud_swipe_transactions)}, {len(fraud_chip_transactions)}, {len(fraud_online_transactions)}")

181, 252, 2567


##### Finding ratio of non-fraud transactions

In [244]:
swipeCount = len(nonfraud.query("swipe_transaction == 1"))
chipCount = len(nonfraud.query("chip_transaction == 1"))
onlineCount = len(nonfraud.query("online_transaction == 1"))
totalCount = len(nonfraud)

print(f"{swipeCount} + {chipCount} + {onlineCount} = {totalCount}")

291795 + 1197222 + 209073 = 1698090


In [245]:
ratios = [swipeCount/totalCount, chipCount/totalCount, onlineCount/totalCount]
print(ratios)

[0.17183718177481758, 0.7050403688850414, 0.12312244934014098]


In [246]:
# To preserve the ratio of purchase method I'm first getting the subset from the subset and then splitting it manually
ratios = [x * DATA_SIZE for x in ratios]
print(ratios)

[515.5115453244528, 2115.121106655124, 369.3673480204229]


Applying the transaction ratio to the nonfraudulent transactions

In [247]:
# Draw the non-fraud rows per purchase method, sized by the scaled ratios.
# NOTE(review): int() truncates each fractional target, so the three samples can add up
# to slightly fewer than DATA_SIZE rows (the recorded output is 515 + 2115 + 369 = 2999).
# NOTE(review): sample() is called without random_state, so every run draws different rows.
nonfraud_swipe_transactions = nonfraud.query("swipe_transaction == 1").sample(n=int(ratios[0]))
nonfraud_chip_transactions = nonfraud.query("chip_transaction == 1").sample(n=int(ratios[1]))
nonfraud_online_transactions = nonfraud.query("online_transaction == 1").sample(n=int(ratios[2]))
print(f"{len(nonfraud_swipe_transactions)}, {len(nonfraud_chip_transactions)}, {len(nonfraud_online_transactions)}")

515, 2115, 369


In [324]:
train_set = pd.concat([fraud_swipe_transactions, fraud_chip_transactions, fraud_online_transactions, nonfraud_swipe_transactions, nonfraud_chip_transactions, nonfraud_online_transactions])
train_set.head()

Unnamed: 0,user,card,year,month,day,amount,merchant_city,merchant_state,mcc,hour,...,insufficient_balance,bad_pin,technical_glitch,bad_card_number,bad_cvv,bad_expiration,bad_zipcode,swipe_transaction,chip_transaction,online_transaction
11643255,969,3,2015,7,12,56.209999,Universal City,TX,5921,9,...,0,0,0,0,0,0,0,1,0,0
4344705,362,1,2015,12,13,47.970001,Williamsburg,IA,5813,10,...,0,0,0,0,0,0,0,1,0,0
16459147,1334,4,2015,12,28,82.029999,Farmington,MN,5813,11,...,0,1,0,0,0,0,0,1,0,0
19830624,1607,0,2015,4,3,111.379997,Clint,TX,5912,12,...,0,0,0,0,0,0,0,1,0,0
11169380,928,2,2015,12,26,53.040001,Chester,VA,5813,16,...,0,0,0,0,0,0,0,1,0,0


In [301]:
print(len(train_set))

5999


#### Merging Transactions with User data and Card data

In [325]:
# Attach the user and card attributes to every sampled training transaction;
# the transaction's 'card' is matched against the card's 'card_index'.
train_set = combined.merge(
    train_set,
    how="inner",
    left_on=['user', 'card_index'],
    right_on=['user', 'card'],
)

In [326]:
train_set

Unnamed: 0,current_age,retirement_age,birth_year,birth_month,gender,city,state,zipcode,per_capita_income_zipcode,yearly_income_person,...,insufficient_balance,bad_pin,technical_glitch,bad_card_number,bad_cvv,bad_expiration,bad_zipcode,swipe_transaction,chip_transaction,online_transaction
0,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,...,0,0,0,0,0,0,0,0,0,1
1,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,...,0,0,0,0,0,0,0,0,0,1
2,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,...,0,0,0,0,0,0,0,0,0,1
3,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,...,0,0,0,0,0,0,0,0,0,1
4,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5994,47,67,1973,1,Female,Elizabeth,NJ,7201,15175.0,30942.0,...,0,0,0,0,0,0,0,0,1,0
5995,66,60,1954,2,Male,Camp Hill,PA,17011,25336.0,54654.0,...,0,0,0,0,0,0,0,1,0,0
5996,66,60,1954,2,Male,Camp Hill,PA,17011,25336.0,54654.0,...,0,0,0,0,0,0,0,1,0,0
5997,66,60,1954,2,Male,Camp Hill,PA,17011,25336.0,54654.0,...,0,0,0,0,0,0,0,1,0,0


## Validation Set
Obtained by sampling about 5000 fraudulent transactions from after 2015 and adding about 5000 random non-fraudulent ones from the same period, each sampled in proportion to purchase method.

In [205]:
v_fraud = transactions.query("year > 2015 and is_fraud == 1")
v_fraud

Unnamed: 0,user,card,year,month,day,amount,merchant_city,merchant_state,mcc,hour,...,insufficient_balance,bad_pin,technical_glitch,bad_card_number,bad_cvv,bad_expiration,bad_zipcode,swipe_transaction,chip_transaction,online_transaction
9761,0,2,2016,3,6,81.930000,Claremont,CA,4121,11,...,0,0,0,0,0,0,0,0,1,0
9762,0,2,2016,3,6,297.859985,ONLINE,,5311,12,...,0,0,0,0,0,0,0,0,0,1
17737,0,3,2016,2,23,244.229996,ONLINE,,5310,10,...,0,0,0,0,0,0,0,0,0,1
17738,0,3,2016,2,23,22.400000,Claremont,CA,5300,14,...,0,0,0,0,0,0,0,1,0,0
21031,1,1,2016,6,9,181.740005,ONLINE,,5310,10,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24375668,1997,2,2016,9,4,1.250000,ONLINE,,5815,10,...,0,0,0,0,0,0,0,0,0,1
24375670,1997,2,2016,9,4,116.660004,ONLINE,,5311,12,...,0,0,0,0,0,0,0,0,0,1
24375671,1997,2,2016,9,4,359.989990,ONLINE,,5310,12,...,0,0,0,0,0,0,0,0,0,1
24375672,1997,2,2016,9,4,351.510010,ONLINE,,4829,13,...,0,0,0,0,0,0,0,0,0,1


In [206]:
print(v_fraud.iloc[0])

user                            0
card                            2
year                         2016
month                           3
day                             6
amount                      81.93
merchant_city           Claremont
merchant_state                 CA
mcc                          4121
hour                           11
minute                         18
is_fraud                        1
errors                       None
purchase_zip              91711.0
insufficient_balance            0
bad_pin                         0
technical_glitch                0
bad_card_number                 0
bad_cvv                         0
bad_expiration                  0
bad_zipcode                     0
swipe_transaction               0
chip_transaction                1
online_transaction              0
Name: 9761, dtype: object


### Filtering the transactions so only the non-fraudulent ones happening after 2015 are kept in a dataframe

In [207]:
v_nonfraud = transactions.query("year > 2015 and is_fraud == 0")
v_nonfraud

Unnamed: 0,user,card,year,month,day,amount,merchant_city,merchant_state,mcc,hour,...,insufficient_balance,bad_pin,technical_glitch,bad_card_number,bad_cvv,bad_expiration,bad_zipcode,swipe_transaction,chip_transaction,online_transaction
4136,0,0,2016,1,3,66.480003,La Verne,CA,7538,10,...,0,0,0,0,0,0,0,0,1,0
4137,0,0,2016,1,4,40.020000,La Verne,CA,5912,06,...,0,0,0,0,0,0,0,0,1,0
4138,0,0,2016,1,7,54.110001,La Verne,CA,7538,09,...,0,0,0,0,0,0,0,0,1,0
4139,0,0,2016,1,7,89.480003,Monterey Park,CA,5651,16,...,0,0,0,0,0,0,0,0,1,0
4140,0,0,2016,1,10,29.150000,Monterey Park,CA,5942,06,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24386895,1999,1,2020,2,27,-54.000000,Merrimack,NH,5541,22,...,0,0,0,0,0,0,0,0,1,0
24386896,1999,1,2020,2,27,54.000000,Merrimack,NH,5541,22,...,0,0,0,0,0,0,0,0,1,0
24386897,1999,1,2020,2,28,59.150002,Merrimack,NH,4121,07,...,0,0,0,0,0,0,0,0,1,0
24386898,1999,1,2020,2,28,43.119999,Merrimack,NH,4121,20,...,0,0,0,0,0,0,0,0,1,0


### No Oversampling For the Validation Data

#### Merging Fraud and Non-Fraud into one Dataset

In [252]:
DATA_SIZE = 5000

##### Finding ratio of fraud transactions

In [253]:
swipeCount = len(v_fraud.query("swipe_transaction == 1"))
chipCount = len(v_fraud.query("chip_transaction == 1"))
onlineCount = len(v_fraud.query("online_transaction == 1"))
totalCount = len(v_fraud)

print(f"{swipeCount} + {chipCount} + {onlineCount} = {totalCount}")

676 + 4535 + 3201 = 8412


In [254]:
ratios = [swipeCount/totalCount, chipCount/totalCount, onlineCount/totalCount]
print(ratios)

[0.08036138849262958, 0.5391107941036615, 0.380527817403709]


In [255]:
# To preserve the ratio of purchase method I'm first getting the subset from the subset and then splitting it manually
ratios = [x * DATA_SIZE for x in ratios]
print(ratios)

[401.8069424631479, 2695.5539705183073, 1902.639087018545]


Applying the transaction ratio to the fraudulent transactions

In [256]:
v_fraud_swipe_transactions = v_fraud.query("swipe_transaction == 1").sample(n=int(ratios[0]))
v_fraud_chip_transactions = v_fraud.query("chip_transaction == 1").sample(n=int(ratios[1]))
v_fraud_online_transactions = v_fraud.query("online_transaction == 1").sample(n=int(ratios[2]))
print(f"{len(v_fraud_swipe_transactions)}, {len(v_fraud_chip_transactions)}, {len(v_fraud_online_transactions)}")

401, 2695, 1902


##### Finding ratio of non-fraud transactions

In [257]:
swipeCount = len(v_nonfraud.query("swipe_transaction == 1"))
chipCount = len(v_nonfraud.query("chip_transaction == 1"))
onlineCount = len(v_nonfraud.query("online_transaction == 1"))
totalCount = len(v_nonfraud)

print(f"{swipeCount} + {chipCount} + {onlineCount} = {totalCount}")

1226045 + 5085540 + 894340 = 7205925


In [258]:
ratios = [swipeCount/totalCount, chipCount/totalCount, onlineCount/totalCount]
print(ratios)

[0.1701440134333899, 0.7057442313096515, 0.12411175525695868]


In [259]:
# To preserve the ratio of purchase method I'm first getting the subset from the subset and then splitting it manually
ratios = [x * DATA_SIZE for x in ratios]
print(ratios)

[850.7200671669494, 3528.7211565482576, 620.5587762847935]


Applying the transaction ratio to the nonfraudulent transactions

In [260]:
v_nonfraud_swipe_transactions = v_nonfraud.query("swipe_transaction == 1").sample(n=int(ratios[0]))
v_nonfraud_chip_transactions = v_nonfraud.query("chip_transaction == 1").sample(n=int(ratios[1]))
v_nonfraud_online_transactions = v_nonfraud.query("online_transaction == 1").sample(n=int(ratios[2]))
print(f"{len(v_nonfraud_swipe_transactions)}, {len(v_nonfraud_chip_transactions)}, {len(v_nonfraud_online_transactions)}")

850, 3528, 620


In [327]:
validation_set = pd.concat([v_fraud_swipe_transactions, v_fraud_chip_transactions, v_fraud_online_transactions, v_nonfraud_swipe_transactions, v_nonfraud_chip_transactions, v_nonfraud_online_transactions])
validation_set.head()

Unnamed: 0,user,card,year,month,day,amount,merchant_city,merchant_state,mcc,hour,...,insufficient_balance,bad_pin,technical_glitch,bad_card_number,bad_cvv,bad_expiration,bad_zipcode,swipe_transaction,chip_transaction,online_transaction
23702273,1933,2,2019,8,9,109.980003,Rome,Italy,5651,16,...,0,0,0,0,0,0,0,1,0,0
2544733,214,0,2018,5,4,14.98,Rome,Italy,5311,10,...,0,0,0,0,0,0,0,1,0,0
57548,2,1,2018,1,10,85.139999,Rome,Italy,5621,16,...,0,0,0,0,0,0,0,1,0,0
5436974,465,1,2016,2,7,51.369999,Thomson,GA,7996,19,...,0,0,0,0,0,0,0,1,0,0
18459028,1491,0,2018,11,16,32.75,Marion,OH,5812,17,...,0,0,0,0,0,0,0,1,0,0


In [329]:
print(len(validation_set))

9996


#### Merging Transactions with User data and Card data

In [328]:
# Attach the user and card attributes to every sampled validation transaction;
# the transaction's 'card' is matched against the card's 'card_index'.
validation_set = combined.merge(
    validation_set,
    how="inner",
    left_on=['user', 'card_index'],
    right_on=['user', 'card'],
)

In [330]:
validation_set

Unnamed: 0,current_age,retirement_age,birth_year,birth_month,gender,city,state,zipcode,per_capita_income_zipcode,yearly_income_person,...,insufficient_balance,bad_pin,technical_glitch,bad_card_number,bad_cvv,bad_expiration,bad_zipcode,swipe_transaction,chip_transaction,online_transaction
0,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,...,0,0,0,0,0,0,0,0,1,0
1,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,...,0,0,0,0,0,0,0,0,1,0
2,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,...,0,0,0,0,0,0,0,0,0,1
3,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,...,0,0,0,0,0,0,0,0,1,0
4,53,66,1966,11,Female,La Verne,CA,91750,29278.0,59696.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9991,47,67,1973,1,Female,Elizabeth,NJ,7201,15175.0,30942.0,...,0,0,0,0,0,0,0,0,1,0
9992,47,67,1973,1,Female,Elizabeth,NJ,7201,15175.0,30942.0,...,0,0,0,0,0,0,0,0,0,1
9993,47,67,1973,1,Female,Elizabeth,NJ,7201,15175.0,30942.0,...,0,0,0,0,0,0,0,0,0,1
9994,66,60,1954,2,Male,Camp Hill,PA,17011,25336.0,54654.0,...,0,0,0,0,0,0,0,1,0,0


## Dataframe cleanup

Dropping the Errors and Card Number Column in both sets since they're useless now

In [331]:
# The raw error text and the card number are no longer needed in the training set.
train_set = train_set.drop(columns=['errors', 'card_number'])

In [332]:
# The raw error text and the card number are no longer needed in the validation set.
validation_set = validation_set.drop(columns=['errors', 'card_number'])

Dropping the User and Card_index columns

In [333]:
# Remove the merge keys from the training set.
train_set = train_set.drop(columns=['user', 'card_index'])

In [334]:
# Remove the merge keys from the validation set.
validation_set = validation_set.drop(columns=['user', 'card_index'])

Consider coming back here (or before in the file) and labeling user columns with "user_"

## Target Encoding

In order to convert the text variables into numerical values, we will be using a method called target encoding.

In [335]:
def EncodeColumns(df, cols, target_column_name, encoding_map=None, return_map=False):
    """Target-encode categorical columns of ``df`` in place.

    Each column in ``cols`` is replaced by ``<col>_target_encoded``, holding for
    every row the mean of ``target_column_name`` over all rows that share the
    same category value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place: encoded columns are appended
        and the original ``cols`` are dropped.
    cols : iterable of str
        Names of the columns to encode.
    target_column_name : str
        Name of the numeric target column used to compute the category means.
    encoding_map : dict, optional
        Precomputed ``{column: {category: mean}}`` mapping. For every column
        present in it, these means are used instead of being recomputed from
        ``df`` (e.g. to apply encodings learned on a training set to another
        set). Categories missing from the mapping are encoded as NaN.
    return_map : bool, default False
        When True, return ``(df, mapping)`` where ``mapping`` is the
        ``{column: {category: mean}}`` dictionary that was applied.

    Returns
    -------
    pandas.DataFrame or tuple
        The same ``df`` object, or ``(df, mapping)`` when ``return_map`` is True.
    """
    cols = list(cols)
    target_encoding_map = {}

    for col in cols:
        if encoding_map is not None and col in encoding_map:
            encoding = encoding_map[col]
        else:
            # Mean of the target per category value of this column.
            encoding = df.groupby(col)[target_column_name].mean().to_dict()

        df[col + "_target_encoded"] = df[col].map(encoding)

        target_encoding_map[col] = encoding

    # Kept in place on purpose: callers hold aliases to the same frame.
    df.drop(columns=cols, inplace=True)

    if return_map:
        return df, target_encoding_map
    return df

Looking at the data in the columns to decide which ones to encode

In [336]:
print(validation_set.iloc[0])

current_age                        53
retirement_age                     66
birth_year                       1966
birth_month                        11
gender                         Female
city                         La Verne
state                              CA
zipcode                         91750
per_capita_income_zipcode     29278.0
yearly_income_person          59696.0
total_debt                   127613.0
fico_score                        787
num_credit_cards                    5
card_brand                       Visa
card_type                       Debit
expires_month                      12
expires_year                     2022
cvv                               623
has_chip                            1
cards_issued                        2
credit_limit                  24295.0
acct_open_month                     9
acct_open_year                   2002
year_pin_last_changed            2008
card_on_dark_web                    0
card                                0
year        

In [337]:
print(train_set.iloc[0])

current_age                         53
retirement_age                      66
birth_year                        1966
birth_month                         11
gender                          Female
city                          La Verne
state                               CA
zipcode                          91750
per_capita_income_zipcode      29278.0
yearly_income_person           59696.0
total_debt                    127613.0
fico_score                         787
num_credit_cards                     5
card_brand                        Visa
card_type                        Debit
expires_month                       12
expires_year                      2022
cvv                                623
has_chip                             1
cards_issued                         2
credit_limit                   24295.0
acct_open_month                      9
acct_open_year                    2002
year_pin_last_changed             2008
card_on_dark_web                     0
card                     

These are the categorical columns (strings and code-like identifiers such as zip codes, CVV and MCC) that need to be encoded.

In [322]:
targetEncodeColumns = ['gender', 'city', 'state', 'zipcode', 'card_brand', 'card_type', 'cvv', 'merchant_city', 'merchant_state', 'mcc', 'purchase_zip']

In [338]:
# NOTE: this is an alias, not a copy. EncodeColumns adds and drops columns in place,
# so train_set itself is modified by the call below as well.
encoded_train_set = train_set
encoded_train_set = EncodeColumns(encoded_train_set, targetEncodeColumns, 'is_fraud')
encoded_train_set

Unnamed: 0,current_age,retirement_age,birth_year,birth_month,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards,expires_month,...,city_target_encoded,state_target_encoded,zipcode_target_encoded,card_brand_target_encoded,card_type_target_encoded,cvv_target_encoded,merchant_city_target_encoded,merchant_state_target_encoded,mcc_target_encoded,purchase_zip_target_encoded
0,53,66,1966,11,29278.0,59696.0,127613.0,787,5,12,...,0.736842,0.513959,0.736842,0.483105,0.473182,0.736842,0.874319,0.874319,0.808943,0.868697
1,53,66,1966,11,29278.0,59696.0,127613.0,787,5,12,...,0.736842,0.513959,0.736842,0.483105,0.473182,0.736842,0.874319,0.874319,0.933333,0.868697
2,53,66,1966,11,29278.0,59696.0,127613.0,787,5,12,...,0.736842,0.513959,0.736842,0.483105,0.473182,0.736842,0.874319,0.874319,1.000000,0.868697
3,53,66,1966,11,29278.0,59696.0,127613.0,787,5,12,...,0.736842,0.513959,0.736842,0.483105,0.473182,0.736842,0.874319,0.874319,0.808943,0.868697
4,53,66,1966,11,29278.0,59696.0,127613.0,787,5,12,...,0.736842,0.513959,0.736842,0.483105,0.473182,0.736842,0.874319,0.874319,0.808943,0.868697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5994,47,67,1973,1,15175.0,30942.0,71066.0,779,3,1,...,0.000000,0.440860,0.000000,0.483105,0.517978,0.000000,0.000000,0.148936,0.133995,0.000000
5995,66,60,1954,2,25336.0,54654.0,27241.0,618,1,8,...,0.000000,0.296296,0.000000,0.505129,0.517978,0.000000,0.000000,0.070796,0.279070,0.000000
5996,66,60,1954,2,25336.0,54654.0,27241.0,618,1,8,...,0.000000,0.296296,0.000000,0.505129,0.517978,0.000000,0.000000,0.070796,0.000000,0.000000
5997,66,60,1954,2,25336.0,54654.0,27241.0,618,1,8,...,0.000000,0.296296,0.000000,0.505129,0.517978,0.000000,0.000000,0.141667,0.957447,0.000000


In [339]:
# NOTE: this is an alias, not a copy, so validation_set itself is modified by the call below.
# NOTE(review): the encodings are computed from the validation set's own 'is_fraud' labels,
# which leaks the target into the validation features; consider applying the category
# means learned on the training set instead.
encoded_validation_set = validation_set
encoded_validation_set = EncodeColumns(encoded_validation_set, targetEncodeColumns, 'is_fraud')
encoded_validation_set

Unnamed: 0,current_age,retirement_age,birth_year,birth_month,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards,expires_month,...,city_target_encoded,state_target_encoded,zipcode_target_encoded,card_brand_target_encoded,card_type_target_encoded,cvv_target_encoded,merchant_city_target_encoded,merchant_state_target_encoded,mcc_target_encoded,purchase_zip_target_encoded
0,53,66,1966,11,29278.0,59696.0,127613.0,787,5,12,...,0.428571,0.437393,0.428571,0.521078,0.467380,0.153846,0.000000,0.061977,0.165123,0.000000
1,53,66,1966,11,29278.0,59696.0,127613.0,787,5,12,...,0.428571,0.437393,0.428571,0.521078,0.467380,0.153846,0.000000,0.061977,0.321185,0.000000
2,53,66,1966,11,29278.0,59696.0,127613.0,787,5,12,...,0.428571,0.437393,0.428571,0.521078,0.467380,0.214286,0.752969,0.752969,0.000000,0.878095
3,53,66,1966,11,29278.0,59696.0,127613.0,787,5,2,...,0.428571,0.437393,0.428571,0.521078,0.467380,0.200000,1.000000,0.061977,0.415550,1.000000
4,53,66,1966,11,29278.0,59696.0,127613.0,787,5,8,...,0.428571,0.437393,0.428571,0.521078,0.538345,0.500000,1.000000,0.061977,0.520089,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9991,47,67,1973,1,15175.0,30942.0,71066.0,779,3,1,...,0.727273,0.510638,0.727273,0.519591,0.538345,0.428571,0.000000,0.065421,0.166271,0.000000
9992,47,67,1973,1,15175.0,30942.0,71066.0,779,3,1,...,0.727273,0.510638,0.727273,0.521078,0.538345,0.285714,0.752969,0.752969,0.848430,0.878095
9993,47,67,1973,1,15175.0,30942.0,71066.0,779,3,1,...,0.727273,0.510638,0.727273,0.521078,0.538345,0.285714,0.752969,0.752969,0.848430,0.878095
9994,66,60,1954,2,25336.0,54654.0,27241.0,618,1,8,...,0.375000,0.449468,0.375000,0.479888,0.538345,0.266667,0.000000,0.081886,0.292308,0.000000


In [340]:
encoded_train_set['is_fraud'].value_counts()

is_fraud
1    3000
0    2999
Name: count, dtype: int64

In [341]:
encoded_validation_set['is_fraud'].value_counts()

is_fraud
0    4998
1    4998
Name: count, dtype: int64

## Saving Non-normalized Data

In [342]:
encoded_train_set.to_pickle("./train_data.pkl")

In [343]:
encoded_validation_set.to_pickle("./validation_data.pkl")

## Normalizing Data

In [344]:
def NormalizeColumns(df, cols):
    """Min-max scale the given columns into the [0, 1] range.

    Each column in ``cols`` is converted with ``pd.to_numeric`` and rescaled as
    ``(x - min) / (max - min)`` using that column's own minimum and maximum.

    Args:
        df: DataFrame holding the columns to scale. It is NOT modified; the
            scaling is applied to a copy.
        cols: Iterable of column labels to scale.

    Returns:
        A new DataFrame equal to ``df`` except that every column in ``cols``
        has been rescaled. A column whose values are all identical
        (``max == min``) is mapped to 0.0 instead of NaN.

    Raises:
        KeyError: If a label in ``cols`` is not a column of ``df``.
        Errors raised by ``pd.to_numeric`` for unparseable values propagate.
    """
    # Copy first so the caller's frame (and any alias of it) keeps its raw values.
    normalized = df.copy()

    for col in cols:
        values = pd.to_numeric(normalized[col])
        low = values.min()
        span = values.max() - low

        if span == 0:
            # Constant column: dividing by a zero range would yield NaN (0 / 0).
            normalized[col] = (values - low).astype(float)
        else:
            normalized[col] = (values - low) / span

    return normalized

In [346]:
# List every column label of train_set.
# NOTE(review): this inspects train_set, while the normalization cells below operate on
# encoded_train_set — confirm both frames carry the same columns.
print(train_set.columns)

Index(['current_age', 'retirement_age', 'birth_year', 'birth_month',
       'per_capita_income_zipcode', 'yearly_income_person', 'total_debt',
       'fico_score', 'num_credit_cards', 'expires_month', 'expires_year',
       'has_chip', 'cards_issued', 'credit_limit', 'acct_open_month',
       'acct_open_year', 'year_pin_last_changed', 'card_on_dark_web', 'card',
       'year', 'month', 'day', 'amount', 'hour', 'minute', 'is_fraud',
       'insufficient_balance', 'bad_pin', 'technical_glitch',
       'bad_card_number', 'bad_cvv', 'bad_expiration', 'bad_zipcode',
       'swipe_transaction', 'chip_transaction', 'online_transaction',
       'gender_target_encoded', 'city_target_encoded', 'state_target_encoded',
       'zipcode_target_encoded', 'card_brand_target_encoded',
       'card_type_target_encoded', 'cvv_target_encoded',
       'merchant_city_target_encoded', 'merchant_state_target_encoded',
       'mcc_target_encoded', 'purchase_zip_target_encoded'],
      dtype='object')


In [345]:
# Spot check: print all column values of the row at integer position 4001 of train_set.
print(train_set.iloc[4001])

current_age                             44
retirement_age                          67
birth_year                            1975
birth_month                              7
per_capita_income_zipcode          18069.0
yearly_income_person               36843.0
total_debt                         52248.0
fico_score                             772
num_credit_cards                         5
expires_month                            7
expires_year                          2023
has_chip                                 1
cards_issued                             2
credit_limit                       11528.0
acct_open_month                         11
acct_open_year                        2007
year_pin_last_changed                 2013
card_on_dark_web                         0
card                                     4
year                                  2015
month                                   10
day                                     15
amount                           48.119999
hour       

In [347]:
# Columns to be min-max scaled by NormalizeColumns.
# The remaining train_set columns (0/1-looking flags, is_fraud and the *_target_encoded
# columns) are deliberately not listed here.
unnormalizedColumns = [
    # user attributes
    "current_age",
    "retirement_age",
    "birth_year",
    "birth_month",
    "per_capita_income_zipcode",
    "yearly_income_person",
    "total_debt",
    "fico_score",
    "num_credit_cards",
    # card attributes
    "expires_month",
    "expires_year",
    "cards_issued",
    "credit_limit",
    "acct_open_month",
    "acct_open_year",
    "year_pin_last_changed",
    # transaction fields
    "card",
    "year",
    "month",
    "day",
    "amount",
    "hour",
    "minute",
]

In [348]:
# Normalize an explicit copy so encoded_train_set keeps its original (un-normalized)
# values; a plain assignment would only create a second name for the same DataFrame.
normalized_train_set = NormalizeColumns(encoded_train_set.copy(), unnormalizedColumns)
normalized_train_set

Unnamed: 0,current_age,retirement_age,birth_year,birth_month,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards,expires_month,...,city_target_encoded,state_target_encoded,zipcode_target_encoded,card_brand_target_encoded,card_type_target_encoded,cvv_target_encoded,merchant_city_target_encoded,merchant_state_target_encoded,mcc_target_encoded,purchase_zip_target_encoded
0,0.384615,0.551724,0.615385,0.909091,0.179460,0.213046,0.276306,0.825967,0.50,1.000000,...,0.736842,0.513959,0.736842,0.483105,0.473182,0.736842,0.874319,0.874319,0.808943,0.868697
1,0.384615,0.551724,0.615385,0.909091,0.179460,0.213046,0.276306,0.825967,0.50,1.000000,...,0.736842,0.513959,0.736842,0.483105,0.473182,0.736842,0.874319,0.874319,0.933333,0.868697
2,0.384615,0.551724,0.615385,0.909091,0.179460,0.213046,0.276306,0.825967,0.50,1.000000,...,0.736842,0.513959,0.736842,0.483105,0.473182,0.736842,0.874319,0.874319,1.000000,0.868697
3,0.384615,0.551724,0.615385,0.909091,0.179460,0.213046,0.276306,0.825967,0.50,1.000000,...,0.736842,0.513959,0.736842,0.483105,0.473182,0.736842,0.874319,0.874319,0.808943,0.868697
4,0.384615,0.551724,0.615385,0.909091,0.179460,0.213046,0.276306,0.825967,0.50,1.000000,...,0.736842,0.513959,0.736842,0.483105,0.473182,0.736842,0.874319,0.874319,0.808943,0.868697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5994,0.307692,0.586207,0.705128,0.000000,0.093015,0.110425,0.153871,0.803867,0.25,0.000000,...,0.000000,0.440860,0.000000,0.483105,0.517978,0.000000,0.000000,0.148936,0.133995,0.000000
5995,0.551282,0.344828,0.461538,0.090909,0.155297,0.195051,0.058982,0.359116,0.00,0.636364,...,0.000000,0.296296,0.000000,0.505129,0.517978,0.000000,0.000000,0.070796,0.279070,0.000000
5996,0.551282,0.344828,0.461538,0.090909,0.155297,0.195051,0.058982,0.359116,0.00,0.636364,...,0.000000,0.296296,0.000000,0.505129,0.517978,0.000000,0.000000,0.070796,0.000000,0.000000
5997,0.551282,0.344828,0.461538,0.090909,0.155297,0.195051,0.058982,0.359116,0.00,0.636364,...,0.000000,0.296296,0.000000,0.505129,0.517978,0.000000,0.000000,0.141667,0.957447,0.000000


In [350]:
print(normalized_train_set['hour'])

0       0.695652
1       0.304348
2       0.478261
3       0.695652
4       0.695652
          ...   
5994    0.608696
5995    0.652174
5996    0.695652
5997    0.608696
5998    0.739130
Name: hour, Length: 5999, dtype: float64


In [351]:
# Normalize an explicit copy so encoded_validation_set keeps its original
# (un-normalized) values; a plain assignment would only alias the same DataFrame.
# NOTE(review): the validation columns are scaled with the validation set's own
# min/max, so the same raw value can map to a different scaled value than in the
# training set — confirm whether the training min/max should be reused here.
normalized_validation_set = NormalizeColumns(encoded_validation_set.copy(), unnormalizedColumns)
normalized_validation_set

Unnamed: 0,current_age,retirement_age,birth_year,birth_month,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards,expires_month,...,city_target_encoded,state_target_encoded,zipcode_target_encoded,card_brand_target_encoded,card_type_target_encoded,cvv_target_encoded,merchant_city_target_encoded,merchant_state_target_encoded,mcc_target_encoded,purchase_zip_target_encoded
0,0.421687,0.551724,0.571429,0.909091,0.179460,0.213046,0.276306,0.825967,0.500,1.000000,...,0.428571,0.437393,0.428571,0.521078,0.467380,0.153846,0.000000,0.061977,0.165123,0.000000
1,0.421687,0.551724,0.571429,0.909091,0.179460,0.213046,0.276306,0.825967,0.500,1.000000,...,0.428571,0.437393,0.428571,0.521078,0.467380,0.153846,0.000000,0.061977,0.321185,0.000000
2,0.421687,0.551724,0.571429,0.909091,0.179460,0.213046,0.276306,0.825967,0.500,1.000000,...,0.428571,0.437393,0.428571,0.521078,0.467380,0.214286,0.752969,0.752969,0.000000,0.878095
3,0.421687,0.551724,0.571429,0.909091,0.179460,0.213046,0.276306,0.825967,0.500,0.090909,...,0.428571,0.437393,0.428571,0.521078,0.467380,0.200000,1.000000,0.061977,0.415550,1.000000
4,0.421687,0.551724,0.571429,0.909091,0.179460,0.213046,0.276306,0.825967,0.500,0.636364,...,0.428571,0.437393,0.428571,0.521078,0.538345,0.500000,1.000000,0.061977,0.520089,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9991,0.349398,0.586207,0.654762,0.000000,0.093015,0.110425,0.153871,0.803867,0.250,0.000000,...,0.727273,0.510638,0.727273,0.519591,0.538345,0.428571,0.000000,0.065421,0.166271,0.000000
9992,0.349398,0.586207,0.654762,0.000000,0.093015,0.110425,0.153871,0.803867,0.250,0.000000,...,0.727273,0.510638,0.727273,0.521078,0.538345,0.285714,0.752969,0.752969,0.848430,0.878095
9993,0.349398,0.586207,0.654762,0.000000,0.093015,0.110425,0.153871,0.803867,0.250,0.000000,...,0.727273,0.510638,0.727273,0.521078,0.538345,0.285714,0.752969,0.752969,0.848430,0.878095
9994,0.578313,0.344828,0.428571,0.090909,0.155297,0.195051,0.058982,0.359116,0.000,0.636364,...,0.375000,0.449468,0.375000,0.479888,0.538345,0.266667,0.000000,0.081886,0.292308,0.000000


In [352]:
print(normalized_validation_set['hour'])

0       0.565217
1       0.260870
2       0.565217
3       0.478261
4       0.608696
          ...   
9991    0.478261
9992    0.565217
9993    0.521739
9994    0.739130
9995    0.478261
Name: hour, Length: 9996, dtype: float64


## Saving Normalized Dataframes

In [353]:
# Persist the min-max normalized training frame next to the notebook.
normalized_train_set.to_pickle(path="./normalized_train_data.pkl")

In [354]:
# Persist the min-max normalized validation frame next to the notebook.
normalized_validation_set.to_pickle(path="./normalized_validation_data.pkl")