In [75]:
import pandas as pd
import numpy as np

# Read bank.csv from the course's GitHub repository into a DataFrame.
campaign = pd.read_csv(
    'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv'
)

# Goals 
1. the core task we're interested in is identifying those customers most likely to subscribe to a term deposit

 

2. actionable patterns in data

 

3. does frequent contact have an adverse effect?

 

4. separate models for historic economic indicators

 

5. the model should be saved to a file

 

6. run the model on a different set of data and make target predictions

In [76]:
# Show the dtype pandas inferred for each column of the loaded frame.
campaign.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [77]:
# Preview the first five rows (head() defaults to n=5).
campaign.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [78]:
# Keep only the rows whose target column `y` is the string 'yes', then count
# how many of those rows fall on each day of the week.
# This has to run while `y` still holds 'yes'/'no' strings.
yes = campaign.loc[campaign['y'].eq('yes')]
yes.loc[:, 'day_of_week'].value_counts()

day_of_week
thu    958
wed    865
tue    857
fri    767
mon    761
Name: count, dtype: int64

In [79]:
# Recode every literal 'no'/'yes' string in the frame (not only the target
# `y`) as 0/1. The result is rebound instead of using inplace=True so the
# frame shown by earlier cells is not mutated behind the reader's back.
campaign = campaign.replace({'no': 0, 'yes': 1})

# Pin the target dtype explicitly rather than relying on replace() to downcast
# the object column implicitly (pandas 2.2 deprecates that downcast).
campaign['y'] = campaign['y'].astype('int64')

In [80]:
# NOTE(review): dataclass_transform is not referenced in any cell visible here;
# this looks like an accidental auto-import — confirm before removing.
from typing_extensions import dataclass_transform
# The block below is a bare triple-quoted string, not executable code: nothing
# inside it runs, and because it is the cell's last expression its text is
# echoed back as the cell output. Its axis labels and chart title talk about
# survivors and passengers, wording unrelated to the columns shown in this
# notebook.
'''import altair as alt

day = campaign.groupby(["day_of_week","y"])["day_of_week"].count()
day.head(15)

alt.Chart(day).mark_bar().transform_calculate().encode(
    alt.X('y:N', title='Survived', axis=alt.Axis(labelFontSize=14, labelAngle=0)),
    alt.Y('', title='Total Survivors'),
    column=alt.Column('day_of_week:N', title=None)
).properties(
    title='How Did Passenger Gender Affect Survival',
    width=100
)'''

'import altair as alt\n\nday = campaign.groupby(["day_of_week","y"])["day_of_week"].count()\nday.head(15)\n\nalt.Chart(day).mark_bar().transform_calculate().encode(\n    alt.X(\'y:N\', title=\'Survived\', axis=alt.Axis(labelFontSize=14, labelAngle=0)),\n    alt.Y(\'\', title=\'Total Survivors\'),\n    column=alt.Column(\'day_of_week:N\', title=None)\n).properties(\n    title=\'How Did Passenger Gender Affect Survival\',\n    width=100\n)'

In [81]:
# Tally the values (0, 1 and the "unknown" placeholder) in the three yes/no
# columns before the placeholder is converted to NaN.
check = campaign[["default", "housing", "loan"]]
# A bare expression in the middle of a cell is evaluated and thrown away;
# display() is what actually renders this table.
display(check.apply(pd.Series.value_counts))

# Treat the literal string "unknown" as missing data in every column.
campaign = campaign.replace("unknown", np.nan)

# Missing-value count per column. isna() is required here: comparing with
# `== np.nan` is always False because NaN never compares equal to anything.
campaign.isna().sum()

age                  0
job                294
marital             69
education         1535
default           7725
housing            894
loan               894
contact              0
month                0
day_of_week          0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64

In [82]:
# `default` and `poutcome` are left out of training: each has either too little
# usable data relative to its missing values or too lopsided a spread of
# values to be worth keeping.
# NOTE(review): SimpleImputer is imported here but not used in this cell.
from sklearn.impute import SimpleImputer

# errors="ignore" turns the drop into a no-op when this cell is re-run after
# the columns are already gone, instead of raising KeyError.
campaign = campaign.drop(columns=["default", "poutcome"], errors="ignore")

# housing/loan hold 0, 1 and NaN at this point; float can represent the NaNs.
campaign = campaign.astype({"housing": float, "loan": float})

campaign.dtypes

age                 int64
job                object
marital            object
education          object
housing           float64
loan              float64
contact            object
month              object
day_of_week        object
campaign            int64
pdays               int64
previous            int64
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                   int64
dtype: object

In [83]:

# Ordinal-encode education as 0..6 in the order listed below. Entries that
# were "unknown" are already NaN and stay NaN. The explicit astype(float)
# makes the column float64 whether or not replace() downcasts the object
# column on its own, and leaves already-encoded values untouched on a re-run.
campaign["education"] = (
    campaign["education"]
    .replace({
        "illiterate": 0,
        "basic.4y": 1,
        "basic.6y": 2,
        "basic.9y": 3,
        "high.school": 4,
        "professional.course": 5,
        "university.degree": 6,
    })
    .astype(float)
)
# display() is needed to render this mid-cell table; a bare expression that is
# not the cell's last line is silently discarded.
display(campaign["education"].value_counts())

# One-hot encode the nominal columns in a single pass. The default prefix is
# the source column name, so this yields the same job_*/marital_*/contact_*
# columns, in the same order, as three separate calls. Rows whose value is NaN
# get all-False indicators because dummy_na defaults to False.
# Filtering on the columns still present lets the cell be re-run after the
# originals have been encoded away. (An ordinal coding of `job` was tried
# earlier and replaced by this one-hot encoding.)
one_hot_columns = [
    col for col in ("job", "marital", "contact") if col in campaign.columns
]
if one_hot_columns:
    campaign = pd.get_dummies(campaign, columns=one_hot_columns)

campaign.dtypes


age                    int64
education            float64
housing              float64
loan                 float64
contact               object
month                 object
day_of_week           object
campaign               int64
pdays                  int64
previous               int64
emp.var.rate         float64
cons.price.idx       float64
cons.conf.idx        float64
euribor3m            float64
nr.employed          float64
y                      int64
job_admin.              bool
job_blue-collar         bool
job_entrepreneur        bool
job_housemaid           bool
job_management          bool
job_retired             bool
job_self-employed       bool
job_services            bool
job_student             bool
job_technician          bool
job_unemployed          bool
marital_divorced        bool
marital_married         bool
marital_single          bool
dtype: object