In [502]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import hashlib
from datetime import datetime

In [503]:
# Read the purchases CSV, keep only rows without any missing value, and renumber the index from 0.
# NOTE(review): dropna() runs before the unused columns are discarded in a later cell, so a
# missing value in a column that is dropped anyway still removes the row — confirm this is intended.
raw_purchases_path = '../../datas/credit_card_purchases.csv'
df = pd.read_csv(raw_purchases_path).dropna().reset_index(drop=True)

In [504]:
def to_hash(cc_num):
    """Map a string to an integer in the range [0, 10**8).

    The UTF-8 encoded string is hashed with SHA-256; the digest is read as
    one big-endian integer and reduced modulo 10**8. Because of the modulo
    reduction, distinct inputs may map to the same value.
    """
    digest = hashlib.sha256(cc_num.encode()).digest()
    return int.from_bytes(digest, 'big') % 100_000_000

# Derive customer_id by hashing the string form of each cc_num value.
card_numbers_as_text = df['cc_num'].astype('str')
df['customer_id'] = card_numbers_as_text.apply(to_hash)

In [505]:
# Columns removed from the working frame.
columns_to_drop = [
    'Unnamed: 0',
    'cc_num',
    'first',
    'last',
    'street',
    'lat',
    'long',
    'merch_lat',
    'merch_long',
    'merch_zipcode',
    'trans_num',
    'zip',
]
df = df.drop(columns=columns_to_drop)

In [506]:
# Keep only the first 10 characters of the timestamp string (presumably the 'YYYY-MM-DD'
# date part — confirm against the raw file) and parse them as datetimes.
date_text = df['trans_date_trans_time'].str.slice(stop=10)
df['trans_date_trans_time'] = pd.to_datetime(date_text)

# Parse the date-of-birth column as well.
df['dob'] = pd.to_datetime(df['dob'])

# The column now holds dates only, so give it a matching name.
df = df.rename(columns={'trans_date_trans_time': 'trans_date'})

In [507]:
def find_age(dob, today=None):
    """Return the age in completed years of someone born on ``dob``.

    Parameters
    ----------
    dob : datetime-like
        Date of birth; only its ``year``, ``month`` and ``day`` attributes
        are read.
    today : datetime-like, optional
        Reference date at which the age is measured. When omitted, the
        current date (``datetime.today()``) is used, which matches the
        previous behaviour. Pass a fixed date to get results that do not
        change with the day the notebook is run.

    Returns
    -------
    int
        Number of full years between ``dob`` and ``today``.
    """
    if today is None:
        today = datetime.today()

    # One year is subtracted while this year's birthday is still ahead.
    birthday_pending = (today.month, today.day) < (dob.month, dob.day)
    return today.year - dob.year - int(birthday_pending)

# Age for every row, computed from its dob value by find_age.
birth_dates = df['dob']
df['age'] = birth_dates.apply(find_age)

In [508]:
# Remove every literal 'fraud_' substring from the merchant names.
df['merchant'] = df['merchant'].str.replace('fraud_', '', regex=False)

# Keep only the rows of merchant 'Dare-Gibson' that are not flagged as fraud
# (is_fraud == 0); the flag column is dropped afterwards.
is_target_merchant = df['merchant'] == 'Dare-Gibson'
is_not_fraud = df['is_fraud'] == 0
df = df[is_target_merchant & is_not_fraud].drop(columns=['is_fraud'])

In [509]:
# Show one randomly chosen row (no seed is set, so the row differs between runs).
df.sample(n=1)

Unnamed: 0,trans_date,merchant,category,amt,gender,city,state,city_pop,job,dob,unix_time,customer_id,age
700120,2019-12-09,Dare-Gibson,health_fitness,9.88,F,Saint Louis,MO,927396,Occupational hygienist,1960-03-12,1355062712,29184163,64


In [535]:
# The seven most frequent values of the state column, with their row counts.
df['state'].value_counts().reset_index().head(7)

Unnamed: 0,state,count
0,NY,106
1,PA,105
2,TX,97
3,OH,53
4,CA,52
5,IL,46
6,MO,45


In [536]:
# Number of distinct cities per state, largest first; first ten rows.
(
    df.groupby('state')['city']
    .nunique()
    .sort_values(ascending=False)
    .reset_index()
    .head(10)
)

Unnamed: 0,state,city
0,NY,45
1,PA,43
2,TX,37
3,OH,27
4,CA,25
5,IL,22
6,FL,21


In [513]:
# Put the columns in presentation order, selecting them by name.
# A positional permutation (iloc with hard-coded indices) shuffles the columns again every
# time the cell is re-run and silently mis-orders them if an upstream cell adds or removes
# a column; selecting by name is idempotent and raises a KeyError on a missing column.
column_order = [
    'trans_date', 'merchant', 'category', 'customer_id',
    'amt', 'gender', 'job', 'dob', 'age',
    'city', 'state', 'city_pop', 'unix_time',
]
df = df[column_order]

In [514]:
# Show one randomly chosen row to check the column layout (unseeded, so it varies per run).
df.sample(n=1)

Unnamed: 0,trans_date,merchant,category,customer_id,amt,gender,job,dob,age,city,state,city_pop,unix_time
180201,2019-04-18,Dare-Gibson,health_fitness,66722734,46.05,M,Child psychotherapist,1946-04-03,78,Holloway,OH,128,1334763172


In [543]:
# Rows whose state is 'CA', then the sum of amt for each trans_date.
df_ca = df.loc[df['state'] == 'CA']
ca_summary = df_ca.groupby(['trans_date']).agg(
    Total_Sales=pd.NamedAgg(column='amt', aggfunc='sum')
)
ca_summary.head()

Unnamed: 0_level_0,Total_Sales
trans_date,Unnamed: 1_level_1
2019-01-30,75.48
2019-03-13,69.91
2019-03-15,55.85
2019-03-18,48.88
2019-03-19,13.35


In [545]:
# Restrict to the rows whose state is 'NY' and total the amt column per trans_date.
in_ny = df['state'] == 'NY'
df_ny = df[in_ny]
ny_by_date = df_ny.groupby(['trans_date'])
ny_summary = ny_by_date.agg(Total_Sales=pd.NamedAgg(column='amt', aggfunc='sum'))
ny_summary.head()

Unnamed: 0_level_0,Total_Sales
trans_date,Unnamed: 1_level_1
2019-01-26,57.88
2019-02-11,89.89
2019-02-14,75.21
2019-02-17,74.43
2019-02-25,24.41


In [523]:
# sns.kdeplot(x=df['age'])

In [524]:
# sns.kdeplot(x=df['amt'])

In [517]:
# sns.boxplot(x='age', y='state', data=df)

In [518]:
# sns.boxplot(x='amt', y='state', data=df)

In [519]:
# def remove_outliers(group):
#     Q1_age = group['age'].quantile(0.25)
#     Q3_age = group['age'].quantile(0.75)
#     IQR_age = Q3_age - Q1_age

#     lower_bound_age = Q1_age - 1.5 * IQR_age
#     upper_bound_age = Q3_age + 1.5 * IQR_age

#     Q1_amt = group['amt'].quantile(0.25)
#     Q3_amt = group['amt'].quantile(0.75)
#     IQR_amt = Q3_amt - Q1_amt

#     lower_bound_amt = Q1_amt - 1.5 * IQR_amt
#     upper_bound_amt = Q3_amt + 1.5 * IQR_amt

#     return group[(group['age'] >= lower_bound_age) & (group['age'] <= upper_bound_age) &
#                  (group['amt'] >= lower_bound_amt) & (group['amt'] <= upper_bound_amt)]

In [520]:
# df = df.groupby('state').apply(remove_outliers).reset_index(drop=True)

In [521]:
# sns.boxplot(x='age', y='state', data=df)

In [522]:
# sns.boxplot(data=df, x='amt', y='state')