In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_error, classification_report, confusion_matrix

In [3]:
# Attach Google Drive to the Colab runtime so its files are reachable
# through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
# Switch the working directory to the project folder so that relative paths
# used by later cells (e.g. 'Data/...') resolve against it.
# Replace the path below with the location of your own folder.
import os

WORKSPACE_DIR = '/content/drive/MyDrive/my_workspace'
os.chdir(WORKSPACE_DIR)

In [7]:
# Load the dataset; the path is relative to the current working directory.
DATA_PATH = 'Data/cc_clean.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,cc_num,gender,city,city_pop,job,dob,acct_num,acct_num2,trans_num,unix_time,category,amt,trans_datetime
0,6.760000e+11,M,Dasmarinas,659019,Chartered loss adjuster,12/12/1958,7.980000e+11,798000000000,a72eaa86b043eed95b25bbb25b3153a1,1581314011,shopping_net,68.88,2020-02-10 13:53:31
1,3.520000e+15,M,Digos,169393,"Administrator, charities/voluntary organisations",31/08/1970,9.680000e+11,968000000000,060d12f91c13871a13963041736a4702,1590902968,entertainment,50.06,2020-05-31 13:29:28
2,4.140000e+18,M,Calapan,133893,Financial controller,23/07/1953,6.280000e+11,628000000000,18aafb6098ab0923886c0ac83592ef8d,1585461157,food_dining,105.44,2020-03-29 13:52:37
3,4.720000e+15,M,Laoag,111125,Dance movement psychotherapist,11/01/1954,2.570000e+11,257000000000,c20ee88b451f637bc6893b7460e9fee0,1601282159,gas_transport,82.69,2020-09-28 16:35:59
4,3.530000e+15,M,City of Paranaque,665822,"Engineer, water",31/07/1961,5.400000e+11,540000000000,b389cc449c9c298e8c004024449f7a27,1594960430,shopping_net,363.49,2020-07-17 12:33:50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92427,3.530000e+15,M,Dasmarinas,659019,"Physicist, medical",26/03/1965,2.010000e+11,201000000000,4f77498d91283c4910a636b2e8149dda,1587273415,misc_pos,6.54,2020-04-19 13:16:55
92428,2.470000e+15,M,San Fernando,306659,"Surveyor, quantity",01/11/1935,5.811000e+11,581000000000,d44f411eabd406a76a60546e723a98fd,1628185569,kids_pets,98.23,2021-08-06 01:46:09
92429,3.520000e+15,M,Masbate,95389,Wellsite geologist,20/11/1967,5.310000e+11,531000000000,7e767a74cae901c13f1a9d1d37aa63d4,1621481285,grocery_pos,78.79,2021-05-20 11:28:05
92430,4.620000e+15,M,San Fernando,121812,Personnel officer,20/11/1934,5.550000e+11,555000000000,6ced184c93e66028e8d235ad3060de90,1625341374,personal_care,31.37,2021-07-04 03:42:54


In [8]:
# Count of distinct account numbers within each gender.
gender_demog = (
    df
    .groupby(by=['gender'])['acct_num']
    .nunique()
)
gender_demog

gender
F     6
M    88
Name: acct_num, dtype: int64

In [9]:
# Parse the day/month/year strings in `dob` into datetime values.
df["birth_date"] = pd.to_datetime(
    df["dob"],
    format="%d/%m/%Y",
)

In [10]:
def _completed_years(birth_dates, as_of):
    """Return the number of whole years between each birth date and `as_of`.

    Parameters
    ----------
    birth_dates : pandas.Series
        Datetime-typed series of birth dates.
    as_of : pandas.Timestamp
        Reference date at which the age is measured.

    Returns
    -------
    pandas.Series
        Year difference, reduced by one where the birthday in the reference
        year has not been reached yet.
    """
    month = birth_dates.dt.month
    day = birth_dates.dt.day
    # True where the birthday falls later in the calendar year than `as_of`.
    birthday_pending = (month > as_of.month) | (
        (month == as_of.month) & (day > as_of.day)
    )
    return as_of.year - birth_dates.dt.year - birthday_pending.astype(int)


# Fixed reference date used to compute each customer's age.
current_date = pd.to_datetime("2022-01-01")

# Dividing elapsed days by 365.25 is only an approximation and can come out
# one year short on an exact birthday (born 1957-01-01, as of 2022-01-01:
# 23741 days // 365.25 == 64, not 65), so the age is derived from the
# calendar fields instead.
df["age"] = _completed_years(df["birth_date"], current_date)

In [11]:
# Birth-year edges for the generation buckets. Intervals are right-closed, so
# each label covers (previous edge, edge].
# np.inf is used because the capitalised np.Inf alias was removed in NumPy 2.0.
bins = [-np.inf, 1927, 1945, 1964, 1980, 1996, 2012, np.inf]
# NOTE: label strings are kept exactly as they were, since other cells may
# match on them.
labels = ["GREATEST", "SILENT_GEN", "BABY_BOOMER", "GEN_X", "MILLENIAL", "GEN_Z", "GEN_ALPHA"]

df["generation"] = pd.cut(
    df.birth_date.dt.year,
    bins=bins,
    labels=labels,
    right=True,  # Greatest: (-inf,1927], Silent: (1927,1945], BabyB: (1945,1964]
)

In [12]:
# Preview the first five rows, including the newly derived columns.
df.head(n=5)

Unnamed: 0,cc_num,gender,city,city_pop,job,dob,acct_num,acct_num2,trans_num,unix_time,category,amt,trans_datetime,birth_date,age,generation
0,676000000000.0,M,Dasmarinas,659019,Chartered loss adjuster,12/12/1958,798000000000.0,798000000000,a72eaa86b043eed95b25bbb25b3153a1,1581314011,shopping_net,68.88,2020-02-10 13:53:31,1958-12-12,63,BABY_BOOMER
1,3520000000000000.0,M,Digos,169393,"Administrator, charities/voluntary organisations",31/08/1970,968000000000.0,968000000000,060d12f91c13871a13963041736a4702,1590902968,entertainment,50.06,2020-05-31 13:29:28,1970-08-31,51,GEN_X
2,4.14e+18,M,Calapan,133893,Financial controller,23/07/1953,628000000000.0,628000000000,18aafb6098ab0923886c0ac83592ef8d,1585461157,food_dining,105.44,2020-03-29 13:52:37,1953-07-23,68,BABY_BOOMER
3,4720000000000000.0,M,Laoag,111125,Dance movement psychotherapist,11/01/1954,257000000000.0,257000000000,c20ee88b451f637bc6893b7460e9fee0,1601282159,gas_transport,82.69,2020-09-28 16:35:59,1954-01-11,67,BABY_BOOMER
4,3530000000000000.0,M,City of Paranaque,665822,"Engineer, water",31/07/1961,540000000000.0,540000000000,b389cc449c9c298e8c004024449f7a27,1594960430,shopping_net,363.49,2020-07-17 12:33:50,1961-07-31,60,BABY_BOOMER


In [13]:
# Distinct accounts per generation.
# `generation` is categorical; observed=False is passed explicitly so that
# generations with no accounts still appear (with a count of 0) and the result
# does not depend on the pandas default for `observed`.
gen_demog = df.groupby(['generation'], observed=False)['acct_num'].nunique()
gen_demog

generation
GREATEST        3
SILENT_GEN     17
BABY_BOOMER    55
GEN_X          19
MILLENIAL       0
GEN_Z           0
GEN_ALPHA       0
Name: acct_num, dtype: int64

In [14]:
# Distinct accounts per city, then the number of cities sharing each
# account count.
city_demog = (
    df
    .groupby(by=['city'])['acct_num']
    .nunique()
)
city_demog.value_counts()

acct_num
1    36
2    15
3     5
4     2
5     1
Name: count, dtype: int64

In [15]:
# Convert the transaction timestamp column to datetime, then keep its
# calendar year in a separate column.
trans_ts = pd.to_datetime(df['trans_datetime'])
df['trans_datetime'] = trans_ts
df['year'] = trans_ts.dt.year

In [16]:
# Month of each transaction, as a number and as an abbreviated name.
txn_time = df["trans_datetime"]
df["month"] = txn_time.dt.month  # e.g. 1
df["month_abbr"] = txn_time.dt.strftime('%b')  # e.g. Jan

In [17]:
# Preview the first five rows with the year/month columns added.
df.head(n=5)

Unnamed: 0,cc_num,gender,city,city_pop,job,dob,acct_num,acct_num2,trans_num,unix_time,category,amt,trans_datetime,birth_date,age,generation,year,month,month_abbr
0,676000000000.0,M,Dasmarinas,659019,Chartered loss adjuster,12/12/1958,798000000000.0,798000000000,a72eaa86b043eed95b25bbb25b3153a1,1581314011,shopping_net,68.88,2020-02-10 13:53:31,1958-12-12,63,BABY_BOOMER,2020,2,Feb
1,3520000000000000.0,M,Digos,169393,"Administrator, charities/voluntary organisations",31/08/1970,968000000000.0,968000000000,060d12f91c13871a13963041736a4702,1590902968,entertainment,50.06,2020-05-31 13:29:28,1970-08-31,51,GEN_X,2020,5,May
2,4.14e+18,M,Calapan,133893,Financial controller,23/07/1953,628000000000.0,628000000000,18aafb6098ab0923886c0ac83592ef8d,1585461157,food_dining,105.44,2020-03-29 13:52:37,1953-07-23,68,BABY_BOOMER,2020,3,Mar
3,4720000000000000.0,M,Laoag,111125,Dance movement psychotherapist,11/01/1954,257000000000.0,257000000000,c20ee88b451f637bc6893b7460e9fee0,1601282159,gas_transport,82.69,2020-09-28 16:35:59,1954-01-11,67,BABY_BOOMER,2020,9,Sep
4,3530000000000000.0,M,City of Paranaque,665822,"Engineer, water",31/07/1961,540000000000.0,540000000000,b389cc449c9c298e8c004024449f7a27,1594960430,shopping_net,363.49,2020-07-17 12:33:50,1961-07-31,60,BABY_BOOMER,2020,7,Jul


In [18]:
# Distinct accounts that transacted in each year, split by gender.
active_demog = (
    df
    .groupby(by=['year', 'gender'])['acct_num']
    .nunique()
)
active_demog

year  gender
2020  F          6
      M         72
2021  F          6
      M         82
Name: acct_num, dtype: int64

In [19]:
# Summary statistics of the age column across all rows.
df.loc[:, 'age'].describe()

count    92432.000000
mean        66.443645
std         10.478605
min         51.000000
25%         58.000000
50%         65.000000
75%         72.000000
max         95.000000
Name: age, dtype: float64

In [20]:
# Per (year, month) totals: summed amount and number of non-null amounts.
# month_abbr is carried in the key so it survives into the result.
monthly_groups = df.groupby(["year", "month", "month_abbr"])
agg_mth = monthly_groups.agg(
    total_amt=("amt", "sum"),
    total_cnt=("amt", "count"),
).reset_index()

In [21]:
import plotly.express as px


def plot_monthly_trend(monthly, y_col, title=None):
    """Build a line chart of `y_col` by calendar month, one line per year.

    Parameters
    ----------
    monthly : pandas.DataFrame
        Must contain the columns "year", "month", "month_abbr" and `y_col`.
    y_col : str
        Name of the column plotted on the y-axis.
    title : str, optional
        Title shown above the figure.

    Returns
    -------
    plotly.graph_objects.Figure
        The configured figure; call ``.show()`` to render it.
    """
    fig = px.line(monthly, x="month", y=y_col, color="year", title=title)

    fig.update_layout(
        plot_bgcolor='white',  # White background for the plot area
        paper_bgcolor='white',  # White background for the entire figure
        width=500,  # Width in pixels
        height=300,  # Height in pixels
    )

    # One tick per distinct month. The frame holds each month once per year,
    # so the (month, abbreviation) pairs are de-duplicated first.
    month_ticks = (
        monthly[["month", "month_abbr"]]
        .drop_duplicates()
        .sort_values("month")
    )
    fig.update_xaxes(
        tickmode='array',  # Use the explicit tick list below
        tickvals=month_ticks["month"].tolist(),
        ticktext=month_ticks["month_abbr"].tolist(),
        tickangle=-90,  # Rotate the labels to vertical
    )
    return fig


# Total transaction amount per month.
fig = plot_monthly_trend(agg_mth, "total_amt", title="Total transaction amount by month")
fig.show()

# Number of transactions per month.
fig = plot_monthly_trend(agg_mth, "total_cnt", title="Transaction count by month")
fig.show()