In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw transaction log from a CSV in the working directory into `df`,
# the frame every later cell in this notebook operates on.
df = pd.read_csv('credit_card_transactions.csv')
# Print the schema (column names, non-null counts, dtypes) to stdout.
df.info()
# Last expression of the cell: summary statistics, rendered as the cell output.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 24 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1100702.0
mean,648337.0,4.17192e+17,70.35104,48800.67,38.53762,-90.22634,88824.44,1349244000.0,38.53734,-90.22646,0.005788652,46825.75
std,374318.0,1.308806e+18,160.316,26893.22,5.075808,13.75908,301956.4,12841280.0,5.109788,13.77109,0.07586269,25834.0
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0,1001.0
25%,324168.5,180042900000000.0,9.65,26237.0,34.6205,-96.798,743.0,1338751000.0,34.73357,-96.89728,0.0,25114.0
50%,648337.0,3521417000000000.0,47.52,48174.0,39.3543,-87.4769,2456.0,1349250000.0,39.36568,-87.43839,0.0,45860.0
75%,972505.5,4642255000000000.0,83.14,72042.0,41.9404,-80.158,20328.0,1359385000.0,41.95716,-80.2368,0.0,68319.0
max,1296674.0,4.992346e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1371817000.0,67.51027,-66.9509,1.0,99403.0


# Can we remove the credit card numbers?

### This dataset contains a credit card number associated with each transaction, but because card numbers are assigned at random by the card-issuing authority, the number itself is unlikely to be useful in determining whether or not a transaction is fraudulent. We can quickly confirm that all of the card numbers are valid using Luhn's algorithm. Note that since this dataset is notionally captured after the point of sale, we expect them all to be valid card numbers. Once we have confirmed they are valid, we can remove them from our dataset, since they are not going to be useful for other purposes.

### To quickly check if a credit card number is valid, you can use Luhn's algorithm: https://en.wikipedia.org/wiki/Luhn_algorithm
### A python module has been built to quickly check: https://pypi.org/project/luhncheck/

In [3]:
# you may need to install luhncheck using pip
# pip install luhncheck

from luhncheck import is_luhn

# Validate every value in df['cc_num'] with the Luhn checksum and report
# how many values were checked and how many failed.

# initialize two empty counters
check_counter = 0
error_counter = 0

# iterate through the credit card numbers and increment the check counter
for card_number in df['cc_num']:
    check_counter += 1
    # is_luhn returns a boolean, so test its truthiness directly. Comparing the
    # result to the string 'False' is never true and would hide every invalid
    # card number (the error counter could never move off zero).
    if not is_luhn(str(card_number)):
        error_counter += 1

# print the results
print("Total count of credit card numbers: " + str(df['cc_num'].count()))
print("Values checked: " + str(check_counter))
print("Errors found: " + str(error_counter))

Total count of credit card numbers: 1296675
Values checked: 1296675
Errors found: 0


## No errors found indicates that all of these credit card numbers are valid. Again, this is not surprising, as presumably the merchant/payment processor are performing this check at the point of sale, effectively cutting out credit card number fraud before the transaction takes place. 

## Since all of the credit card numbers are valid, we will remove the field from the dataframe to reduce computational overhead.

In [4]:
# Remove the card-number column from the working frame.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the column is already gone is a
# no-op rather than a KeyError.
df = df.drop(columns='cc_num', errors='ignore')

In [18]:
# Re-print the full per-column schema so the effect of dropping `cc_num`
# (one fewer column) can be checked against the earlier listing.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   merchant               1296675 non-null  object 
 3   category               1296675 non-null  object 
 4   amt                    1296675 non-null  float64
 5   first                  1296675 non-null  object 
 6   last                   1296675 non-null  object 
 7   gender                 1296675 non-null  object 
 8   street                 1296675 non-null  object 
 9   city                   1296675 non-null  object 
 10  state                  1296675 non-null  object 
 11  zip                    1296675 non-null  int64  
 12  lat                    1296675 non-null  float64
 13  long                   1296675 non-null  float64
 14  city_pop          

In [23]:
# Total footprint of the frame, converted from bytes to MiB.
# deep=True asks pandas to measure the contents of object (string) columns
# rather than only the pointer array.
total_bytes = df.memory_usage(deep=True).sum()
print(total_bytes / (1024 * 1024))

1153.7020664215088


In [22]:
# Footprint of the transaction-ID column alone, converted from bytes to MiB.
trans_num_bytes = df['trans_num'].memory_usage(deep=True)
print(trans_num_bytes / (1024 * 1024))

110.0580244064331


## We can see the total column count has decreased from 24 to 23, and we have reduced memory usage by ~10 MB (`cc_num` was an int64 column: 1,296,675 rows × 8 bytes ≈ 9.9 MB)

## What additional columns can we remove to reduce the load? 
#### Column 0 = an incrementing integer for each transaction that is not needed

Review whether the integer in column 0 or the unique transaction ID is more resource intensive. We suspect the transaction ID requires more memory than the integer, so that's the one we should remove.

In [10]:
# Compare the footprint (in MiB) of the two per-row identifier columns:
# the integer row counter and the transaction-ID string.
bytes_per_mib = 1024 * 1024
for id_column in ('Unnamed: 0', 'trans_num'):
    print(df[id_column].memory_usage(deep=True) / bytes_per_mib)

9.892967224121094
110.0580244064331


## OPTIONAL DATA CLEANING/FILLING ACTION: geocode missing merchant ZIPS