# Term Subscription Predictions and Customer Analysis
### Clustering Analysis Notebook
##### Brittany Allen, February 2019

<i> <b> * Please note: </b> this notebook and analysis are both still works in progress. This was something I decided to do as an add-on to my capstone project. My goal is to clean up this notebook and add to my presentation by mid March 2019. </i>

In [32]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#importing the usual suspects (packages/libraies)

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import silhouette_score
from scipy.cluster import hierarchy
#scikit-learn packages

from sklearn.cluster import KMeans, AgglomerativeClustering
#clustering packages

import warnings
warnings.simplefilter(action='ignore')
#preventing those ugly pink warnings from printing out

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
#displayng my plots inline and changing the inline backeng default figure format so it's cleaner

In [34]:
bmd_df = pd.read_csv('./data/bank.csv')

#reading in my DataFrame and assigning it the variable 'bmd_df' – which stands for bank marketing DataFrame

In [35]:
bmd_df = bmd_df.drop(columns=['duration'])

#dropping this feature as it is known to highly influence the target

In [36]:
bmd_df.head()

#displaying the first five rows of my DataFrame

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,2,-1,0,unknown,yes


In [37]:
bmd_df.shape

#checking out the shape of my DataFrame

(11162, 16)

In [38]:
bmd_df.isnull().sum().sum()

#checking for nulls, there are none

0

In [39]:
b = [18, 35, 50, 72, 90, np.inf]
names = ['Millennials (18-34)', 
         'Gen X (35-49)',
         'Baby Boomers (50-71)',
         'Silent Gen (72-89)',
         'GI Gen (90+)']

bmd_df['age_group'] = pd.cut(x = bmd_df.age, bins = b, labels = names, right=False)

I engineered a new feature above, <b>age_group</b>, to bucket customers by their generational age groups as I think it will make data visualization and analysis easier and more meaningful.

In [40]:
bmd_df.head()

#displaying my head again to get a glimpse of the new feature I engineered

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,deposit,age_group
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1,-1,0,unknown,yes,Baby Boomers (50-71)
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1,-1,0,unknown,yes,Baby Boomers (50-71)
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1,-1,0,unknown,yes,Gen X (35-49)
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,1,-1,0,unknown,yes,Baby Boomers (50-71)
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,2,-1,0,unknown,yes,Baby Boomers (50-71)


# Feature Selection / Engineering / Transformation

In [41]:
ss = StandardScaler()

In [42]:
bmd_df.head(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,deposit,age_group
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1,-1,0,unknown,yes,Baby Boomers (50-71)
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1,-1,0,unknown,yes,Baby Boomers (50-71)
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1,-1,0,unknown,yes,Gen X (35-49)


In [43]:
to_dummy = (bmd_df.columns[bmd_df.dtypes != float]) & (bmd_df.columns[bmd_df.dtypes != int])

In [44]:
to_dummy

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome', 'deposit', 'age_group'],
      dtype='object')

In [46]:
bmd_df_dummied = pd.get_dummies(bmd_df[to_dummy], columns=to_dummy, drop_first=True)
bmd_df_dummied.head()

Unnamed: 0,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,...,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown,deposit_yes,age_group_Gen X (35-49),age_group_Baby Boomers (50-71),age_group_Silent Gen (72-89),age_group_GI Gen (90+)
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,1,1,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,1,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0


In [47]:
bmd_df = bmd_df.join(bmd_df_dummied, how='left')

In [48]:
bmd_df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'campaign', 'pdays', 'previous',
       'poutcome', 'deposit', 'age_group', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'marital_married', 'marital_single',
       'education_secondary', 'education_tertiary', 'education_unknown',
       'default_yes', 'housing_yes', 'loan_yes', 'contact_telephone',
       'contact_unknown', 'month_aug', 'month_dec', 'month_feb', 'month_jan',
       'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov',
       'month_oct', 'month_sep', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown', 'deposit_yes', 'age_group_Gen X (35-49)',
       'age_group_Baby Boomers (50-71)', 'age_group_Silent Gen (72-89)',
       'age_group_GI Gen (90+)'],
      dtype='o

In [51]:
raw_og_features = bmd_df.drop(columns=['job', 'marital', 'education', 'default', 'housing',
       'loan', 'contact', 'month', 'poutcome', 'deposit', 'age_group'])
print('shape:', raw_og_features.shape)
raw_og_features.head(1)

shape: (11162, 46)


Unnamed: 0,age,balance,day,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,job_management,...,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown,deposit_yes,age_group_Gen X (35-49),age_group_Baby Boomers (50-71),age_group_Silent Gen (72-89),age_group_GI Gen (90+)
0,59,2343,5,1,-1,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0


In [53]:
raw_og_features.columns

Index(['age', 'balance', 'day', 'campaign', 'pdays', 'previous',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_married', 'marital_single', 'education_secondary',
       'education_tertiary', 'education_unknown', 'default_yes', 'housing_yes',
       'loan_yes', 'contact_telephone', 'contact_unknown', 'month_aug',
       'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'poutcome_other', 'poutcome_success', 'poutcome_unknown', 'deposit_yes',
       'age_group_Gen X (35-49)', 'age_group_Baby Boomers (50-71)',
       'age_group_Silent Gen (72-89)', 'age_group_GI Gen (90+)'],
      dtype='object')

In [54]:
sc_og_features = pd.DataFrame(data=ss.fit_transform(raw_og_features), columns=['age_sc', 'balance_sc', 'day_sc', 'campaign_sc', 'pdays_sc', 'previous_sc',
       'job_blue-collar_sc', 'job_entrepreneur_sc', 'job_housemaid_sc',
       'job_management_sc', 'job_retired_sc', 'job_self-employed_sc', 'job_services_sc',
       'job_student_sc', 'job_technician_sc', 'job_unemployed_sc', 'job_unknown_sc',
       'marital_married_sc', 'marital_single_sc', 'education_secondary_sc',
       'education_tertiary_sc', 'education_unknown_sc', 'default_yes_sc', 'housing_yes_sc',
       'loan_yes_sc', 'contact_telephone_sc', 'contact_unknown_sc', 'month_aug_sc',
       'month_dec_sc', 'month_feb_sc', 'month_jan_sc', 'month_jul_sc', 'month_jun_sc',
       'month_mar_sc', 'month_may_sc', 'month_nov_sc', 'month_oct_sc', 'month_sep_sc',
       'poutcome_other_sc', 'poutcome_success_sc', 'poutcome_unknown_sc', 'deposit_yes_sc',
       'age_group_Gen X (35-49)_sc', 'age_group_Baby Boomers (50-71)_sc',
       'age_group_Silent Gen (72-89)_sc', 'age_group_GI Gen (90+)_sc'])
bmd_df = bmd_df.join(sc_og_features)
print('shape:', bmd_df.shape)
bmd_df.head(1)

shape: (11162, 103)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,month_oct_sc,month_sep_sc,poutcome_other_sc,poutcome_success_sc,poutcome_unknown_sc,deposit_yes_sc,age_group_Gen X (35-49)_sc,age_group_Baby Boomers (50-71)_sc,age_group_Silent Gen (72-89)_sc,age_group_GI Gen (90+)_sc
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,...,-0.190781,-0.171522,-0.224814,-0.325782,0.583626,1.053764,-0.83525,1.901132,-0.14792,-0.02505


In [55]:
def grid_clusters(df_dic, par_dic):
    result_dic = {"Data_frame" :[],
                  "model": [],
                  "inertia": [],
                  "silhouette": [],
                  "Numb_clusters": [],
                  "Cluster_counts": [],
                  "init": [],
                  "model_params": []
                  }
    # loop dataFrames
    for df_name, df_data in df_dic.items():
        X = df_data

        # loop models
        for n_model in par_dic.keys():
            if n_model == "kmeans":
                # loop centroid seeds
                for n_init_seed in par_dic['kmeans']['init_seed']:
                    # loop Method for initialization
                    for n_init_method in par_dic['kmeans']['inits']:
                        # loop algorithms
                        for n_algorithm in par_dic['kmeans']['algorithms']:
                            # loop number of clusters
                            for n_cluster in par_dic['kmeans']['clusters_list']:
                                kmeans = KMeans(n_clusters=n_cluster,
                                                init=n_init_method,
                                                n_init=n_init_seed,
                                                algorithm=n_algorithm,
                                                tol=par_dic['kmeans']['tol'],
                                                random_state=42)
                                kmeans.fit(X)
                                #Saving results in dic
                                result_dic['Data_frame'].append(df_name)
                                result_dic['model'].append(n_model)
                                result_dic['inertia'].append(kmeans.inertia_)
                                result_dic['silhouette'].append(silhouette_score(X, kmeans.labels_))
                                result_dic['Numb_clusters'].append(n_cluster)
                                result_dic['Cluster_counts'].append(list(pd.Series(kmeans.labels_).value_counts()))
                                result_dic['init'].append(n_init_seed)
                                result_dic['model_params'].append(kmeans)
                                
            if n_model == "Agglomerative":
                # loop linkage criterion
                for n_link_met in par_dic['Agglomerative']['linkage_method']:
                    #loop number of clusters
                    for n_clusters in par_dic['Agglomerative']['clusters_list']:
                        # loop method to compute linkage
                        for n_affinity in par_dic['Agglomerative']['affinity']:
                            ac = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')
                            ac.fit(X)
                            # Saving results in dic
                            result_dic['Data_frame'].append(df_name)
                            result_dic['model'].append(n_model)
                            result_dic['inertia'].append(0)
                            result_dic['silhouette'].append(silhouette_score(X, ac.labels_))
                            result_dic['Numb_clusters'].append(n_clusters)
                            result_dic['Cluster_counts'].append(list(pd.Series(ac.labels_).value_counts()))
                            result_dic['init'].append(0)
                            result_dic['model_params'].append(ac)
    return pd.DataFrame(result_dic)

In [56]:

model_dic = {"kmeans": {"init_seed": [10,15,20],
                  "inits"  :  ["k-means++","random"],
                  "algorithms": ["auto"],
                  "clusters_list": range(4,10),
                  "tol": 0.0001},
             
           "Agglomerative": {"linkage_method":['complete', 'single', #ward,
                                           'centroid','median','weighted'],
                         "affinity": ['euclidean', 'l1', 'l2', 'manhattan',
                                      'cosine', 'precomputed'],
                         "clusters_list": range(4,10)}}

In [57]:
raw_og_features = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_married', 'marital_single', 'education_secondary',
       'education_tertiary', 'education_unknown', 'default_yes', 'housing_yes',
       'loan_yes', 'contact_telephone', 'contact_unknown', 'month_aug',
       'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'poutcome_other', 'poutcome_success', 'poutcome_unknown', 'deposit_yes',
       'age_group_Gen X (35-49)', 'age_group_Baby Boomers (50-71)',
       'age_group_Silent Gen (72-89)', 'age_group_GI Gen (90+)']

sc_og_features = ['age_sc', 'balance_sc', 'day_sc', 'campaign_sc', 'pdays_sc', 'previous_sc',
       'job_blue-collar_sc', 'job_entrepreneur_sc', 'job_housemaid_sc',
       'job_management_sc', 'job_retired_sc', 'job_self-employed_sc', 'job_services_sc',
       'job_student_sc', 'job_technician_sc', 'job_unemployed_sc', 'job_unknown_sc',
       'marital_married_sc', 'marital_single_sc', 'education_secondary_sc',
       'education_tertiary_sc', 'education_unknown_sc', 'default_yes_sc', 'housing_yes_sc',
       'loan_yes_sc', 'contact_telephone_sc', 'contact_unknown_sc', 'month_aug_sc',
       'month_dec_sc', 'month_feb_sc', 'month_jan_sc', 'month_jul_sc', 'month_jun_sc',
       'month_mar_sc', 'month_may_sc', 'month_nov_sc', 'month_oct_sc', 'month_sep_sc',
       'poutcome_other_sc', 'poutcome_success_sc', 'poutcome_unknown_sc', 'deposit_yes_sc',
       'age_group_Gen X (35-49)_sc', 'age_group_Baby Boomers (50-71)_sc',
       'age_group_Silent Gen (72-89)_sc', 'age_group_GI Gen (90+)_sc']

df_dic = {'raw_og_features': bmd_df[raw_og_features], 'sc_og_features': bmd_df[sc_og_features]}

In [58]:
# Running grid search and saving results in pd Data Frame
result = grid_clusters(df_dic, model_dic)
result.shape

(432, 8)

In [60]:
# Top 5 silhouette
result.sort_values(by=['silhouette'],ascending = False).head(19)

Unnamed: 0,Data_frame,model,inertia,silhouette,Numb_clusters,Cluster_counts,init,model_params
0,raw_og_features,kmeans,19585280000.0,0.727739,4,"[9221, 1707, 221, 13]",10,"KMeans(algorithm='auto', copy_x=True, init='k-..."
24,raw_og_features,kmeans,19585280000.0,0.727739,4,"[9221, 1707, 221, 13]",20,"KMeans(algorithm='auto', copy_x=True, init='k-..."
12,raw_og_features,kmeans,19585280000.0,0.727739,4,"[9221, 1707, 221, 13]",15,"KMeans(algorithm='auto', copy_x=True, init='k-..."
115,raw_og_features,Agglomerative,0.0,0.714626,5,"[9002, 1767, 314, 71, 8]",0,"AgglomerativeClustering(affinity='euclidean', ..."
44,raw_og_features,Agglomerative,0.0,0.714626,5,"[9002, 1767, 314, 71, 8]",0,"AgglomerativeClustering(affinity='euclidean', ..."
45,raw_og_features,Agglomerative,0.0,0.714626,5,"[9002, 1767, 314, 71, 8]",0,"AgglomerativeClustering(affinity='euclidean', ..."
46,raw_og_features,Agglomerative,0.0,0.714626,5,"[9002, 1767, 314, 71, 8]",0,"AgglomerativeClustering(affinity='euclidean', ..."
47,raw_og_features,Agglomerative,0.0,0.714626,5,"[9002, 1767, 314, 71, 8]",0,"AgglomerativeClustering(affinity='euclidean', ..."
78,raw_og_features,Agglomerative,0.0,0.714626,5,"[9002, 1767, 314, 71, 8]",0,"AgglomerativeClustering(affinity='euclidean', ..."
79,raw_og_features,Agglomerative,0.0,0.714626,5,"[9002, 1767, 314, 71, 8]",0,"AgglomerativeClustering(affinity='euclidean', ..."


In [61]:
# Top agglomerative silhouette score
display(result[(result.model == 'Agglomerative')]\
.sort_values(by=['silhouette'],ascending = False).head(1))

# Top kmeans silhouette score
display(result[(result.model == 'kmeans')]\
.sort_values(by=['inertia'],ascending = True).sort_values(by=['silhouette'],ascending = False).head(1))

Unnamed: 0,Data_frame,model,inertia,silhouette,Numb_clusters,Cluster_counts,init,model_params
150,raw_og_features,Agglomerative,0.0,0.714626,5,"[9002, 1767, 314, 71, 8]",0,"AgglomerativeClustering(affinity='euclidean', ..."


Unnamed: 0,Data_frame,model,inertia,silhouette,Numb_clusters,Cluster_counts,init,model_params
24,raw_og_features,kmeans,19585280000.0,0.727739,4,"[9221, 1707, 221, 13]",20,"KMeans(algorithm='auto', copy_x=True, init='k-..."


In [None]:
result.head()

In [63]:
result.model_params.loc[0]

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)

In [None]:
bmd_df2 = pd.read_csv('./data/bank.csv')
bmd_df2 = bmd_df2.drop(columns=['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome'])
bmd_df2.head(1)

In [None]:
bmd_df = bmd_df.join(bmd_df2, how='left')

In [None]:
bmd_df['target'] = bmd_df.deposit.map(lambda x: 1 if x == 'yes' else 0)

In [64]:
# Hyper parameters chosen:
kmeans5raw_og_features = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)

model5raw_og_features = kmeans5raw_og_features.fit(bmd_df[raw_og_features])
bmd_df['kmeans5raw_og_features'] = model5raw_og_features.labels_
print("Final model silhuete", silhouette_score(bmd_df[raw_og_features], model5raw_og_features.labels_))
print("Final model Inertia", model5raw_og_features.inertia_)

print("\nFinal model Cluster Count\n")

print(bmd_df['kmeans5raw_og_features'].value_counts(sort=False))

print("\n\nFinal model Term Deposit per cluster:")
display(bmd_df.groupby(['kmeans5raw_og_features']).agg([max,min,np.mean,np.median]))

Final model silhuete 0.7017434347803613
Final model Inertia 11147263538.34508

Final model Cluster Count

0    2006
1    8630
2      71
3     447
4       8
Name: kmeans5raw_og_features, dtype: int64


Final model Term Deposit per cluster:


Unnamed: 0_level_0,age,age,age,age,balance,balance,balance,balance,day,day,...,age_group_Baby Boomers (50-71)_sc,age_group_Baby Boomers (50-71)_sc,age_group_Silent Gen (72-89)_sc,age_group_Silent Gen (72-89)_sc,age_group_Silent Gen (72-89)_sc,age_group_Silent Gen (72-89)_sc,age_group_GI Gen (90+)_sc,age_group_GI Gen (90+)_sc,age_group_GI Gen (90+)_sc,age_group_GI Gen (90+)_sc
Unnamed: 0_level_1,max,min,mean,median,max,min,mean,median,max,min,...,mean,median,max,min,mean,median,max,min,mean,median
kmeans5raw_og_features,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,95,20,44.119641,42.0,6298,1928,3386.611167,3109,31,1,...,0.168502,-0.526002,6.760394,-0.14792,0.127586,-0.14792,39.919562,-0.02505,-0.005138,-0.02505
1,93,18,40.37022,38.0,1925,-6847,464.171379,315,31,1,...,-0.051263,-0.526002,6.760394,-0.14792,-0.039052,-0.14792,39.919562,-0.02505,0.002721,-0.02505
2,77,24,46.0,45.0,37127,16397,23325.56338,22520,31,1,...,0.294437,-0.526002,6.760394,-0.14792,0.14398,-0.14792,-0.02505,-0.02505,-0.02505,-0.02505
3,87,20,43.816555,42.0,15841,6307,9213.284116,8556,31,1,...,0.174446,-0.526002,6.760394,-0.14792,0.130267,-0.14792,-0.02505,-0.02505,-0.02505,-0.02505
4,84,39,60.0,58.5,81204,45248,60969.125,54709,28,1,...,0.687565,0.687565,6.760394,-0.14792,1.579158,-0.14792,-0.02505,-0.02505,-0.02505,-0.02505


In [None]:
bmd_df[raw_og_features].columns

In [None]:
sorted(predicted_data_col.unique())

In [None]:
## PLOTS NUMBER OF REVIEWS VS NUMBER OF $ PRICE COUNTS
fig, ax = plt.subplots(2,1,figsize=(14,16),sharey=False)
ax = ax.ravel()
predicted_data_col = bmd_df['kmeans4raw_og_features']
for i in sorted(predicted_data_col.unique()):
    ax[0].scatter(bmd_df[predicted_data_col==i].balance, bmd_df[predicted_data_col==i].age, label=str(i))
    ax[0].legend()
    ax[1].scatter(bmd_df[predicted_data_col==i].balance, bmd_df[predicted_data_col==i].age, label=str(i))
    ax[1].legend()
#     ax[2].scatter(bmd_df[predicted_data_col==i].balance, bmd_df[predicted_data_col==i].age, label=str(i))
#     ax[2].legend()
#     ax[3].scatter(bmd_df[predicted_data_col==i].balance, bmd_df[predicted_data_col==i].age, label=str(i))
#     ax[3].legend()
#     ax[4].scatter(bmd_df[predicted_data_col==i].balance, bmd_df[predicted_data_col==i].age, label=str(i))
#     ax[4].legend()

#ax[0].scatter(data.pr_1, data.rv_1)
# ax[0].set_title('\$ by ZIP', fontsize=22)
# ax[0].set_xlabel('Number Businesses in Range (Standardized)')
# ax[0].set_ylabel('Number of Reviews (Standardized)')
# ax[0].legend()

#ax[1].scatter(data.pr_2, data.rv_2)
# ax[1].set_title('\$$ by ZIP', fontsize=22)
# ax[1].set_xlabel('Number Businesses in Range (Standardized)')
# ax[1].set_ylabel('Number of Reviews (Standardized)')
# ax[1].legend()

In [None]:
predicted_data_col.unique()

In [None]:
predicted_data_col = bmd_df['kmeans5raw_og_features']

display(bmd_df.groupby('kmeans5raw_og_features').agg([max,min,np.mean,np.median]))

In [None]:
# Hyper parameters chosen:
kmeans4raw_og_features = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)

model5raw_og_features = kmeans4raw_og_features.fit(bmd_df[raw_og_features])
bmd_df['kmeans4raw_og_features'] = model5raw_og_features.labels_
print("Final model silhuete", silhouette_score(bmd_df[raw_og_features], model5raw_og_features.labels_))
print("Final model Inertia", model5raw_og_features.inertia_)

print("\nFinal model Cluster Count\n")

print(bmd_df['kmeans4raw_og_features'].value_counts(sort=False))

print("\n\nFinal model Term Deposit per cluster:")
display(bmd_df.groupby(['kmeans4raw_og_features'])['target'].agg([max,min,np.mean,np.median]))

In [None]:
bmd_df.columns

In [None]:
bmd_df_target = bmd_df[raw_og_features].join(bmd_df2, how='left')

In [None]:
fig, ax = plt.subplots(2,2,figsize=(14,8),sharey=False)
ax = ax.ravel()
predicted_data_col = bmd_df['kmeans4raw_og_features']
ax[0].scatter(bmd_df[predicted_data_col==0].balance, bmd_df[predicted_data_col==0].age, label=str(0))
ax[0].legend(loc='center')
ax[1].scatter(bmd_df[predicted_data_col==1].balance, bmd_df[predicted_data_col==1].age, label=str(1))
ax[1].legend(loc='center')
ax[2].scatter(bmd_df[predicted_data_col==2].balance, bmd_df[predicted_data_col==2].age, label=str(2))
ax[2].legend(loc='center')
ax[3].scatter(bmd_df[predicted_data_col==3].balance, bmd_df[predicted_data_col==3].age, label=str(3))
ax[3].legend(loc='center')


In [None]:
fig, ax = plt.subplots(2,2,figsize=(14,8),sharey=False)
ax = ax.ravel()
predicted_data_col = bmd_df['kmeans4raw_og_features']
ax[0].hist(bmd_df[predicted_data_col==0].age, label=str(0))
ax[0].legend(loc='center')
ax[1].hist(bmd_df[predicted_data_col==1].age, label=str(1))
ax[1].legend(loc='center')
ax[2].hist(bmd_df[predicted_data_col==2].age, label=str(2))
ax[2].legend(loc='center')
ax[3].hist(bmd_df[predicted_data_col==3].age, label=str(3))
ax[3].legend(loc='center')


In [None]:
predicted_data_col = bmd_df['kmeans5raw_og_features']

display(bmd_df.groupby('kmeans5raw_og_features').agg([max,min,np.mean,np.median]))

In [None]:
bmd_df.shape

In [None]:
predicted_data_col = bmd_df['kmeans4raw_og_features']

grouped_bmd_df = bmd_df.groupby('kmeans4raw_og_features').agg([min,max,np.median])
grouped_bmd_df.shape

In [None]:
bmd_df.columns[50:40]

In [None]:
grouped_bmd_df[['age_group_Gen X (35-49)', 'age_group_Baby Boomers (50-71)', 'age_group_Silent Gen (72-89)', 'age_group_GI Gen (90+)']]