In [1]:
import pandas as pd
import numpy as np

# Training data for the bank marketing campaign, fetched straight from the
# course repository on GitHub (requires network access).
BANK_CSV_URL = 'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv'
campaign = pd.read_csv(BANK_CSV_URL)

# Goals 
1. the core task we're interested in is identifying those customers most likely to subscribe to a term deposit

 

2. actionable patterns in data

 

3. does frequent contact have an adverse effect

 

4. separate models for historic economic indicators

 

5. the model should be saved to a file

 

6. run the model on a different set of data and make target predictions

In [2]:
# Show the dtype pandas inferred for each column of the freshly loaded frame.
campaign.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [3]:
# Preview the first five rows (five is head()'s default row count).
campaign.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
# Keep only the clients whose outcome column `y` is 'yes', then count how many
# of those rows fall on each day of the week.
subscribed_mask = campaign['y'].eq('yes')
yes = campaign.loc[subscribed_mask]
yes.day_of_week.value_counts()

thu    958
wed    865
tue    857
fri    767
mon    761
Name: day_of_week, dtype: int64

In [5]:
# Recode every cell that is exactly 'no' / 'yes' as 0 / 1. The mapping is
# applied frame-wide, so it affects all yes/no columns, not just the target.
yes_no_codes = {'no': 0, 'yes': 1}
campaign = campaign.replace(to_replace=yes_no_codes)

In [6]:
from typing_extensions import dataclass_transform
# Disabled chart experiment, parked inside a bare string literal instead of
# being live code. Because the string is the cell's last expression, the string
# itself is what the cell displays.
# NOTE(review): the titles below talk about passengers and survival, which does
# not match this bank dataset -- presumably copied from another notebook;
# confirm before reviving this chart, or delete the cell.
'''import altair as alt

day = campaign.groupby(["day_of_week","y"])["day_of_week"].count()
day.head(15)

alt.Chart(day).mark_bar().transform_calculate().encode(
    alt.X('y:N', title='Survived', axis=alt.Axis(labelFontSize=14, labelAngle=0)),
    alt.Y('', title='Total Survivors'),
    column=alt.Column('day_of_week:N', title=None)
).properties(
    title='How Did Passenger Gender Affect Survival',
    width=100
)'''

'import altair as alt\n\nday = campaign.groupby(["day_of_week","y"])["day_of_week"].count()\nday.head(15)\n\nalt.Chart(day).mark_bar().transform_calculate().encode(\n    alt.X(\'y:N\', title=\'Survived\', axis=alt.Axis(labelFontSize=14, labelAngle=0)),\n    alt.Y(\'\', title=\'Total Survivors\'),\n    column=alt.Column(\'day_of_week:N\', title=None)\n).properties(\n    title=\'How Did Passenger Gender Affect Survival\',\n    width=100\n)'

In [7]:
# Look at the value mix of the three yes/no columns. This tally is computed but
# not displayed, because only a cell's last expression is rendered.
yes_no_columns = ["default", "housing", "loan"]
check = campaign[yes_no_columns]
check.apply(pd.Series.value_counts)

# Treat the literal string "unknown" as a missing value everywhere.
campaign = campaign.replace(to_replace="unknown", value=np.nan)

# Count missing cells per column. isna() is used because comparing with
# `== np.nan` never matches (NaN is not equal to itself).
campaign.isna().sum()

age                  0
job                294
marital             69
education         1535
default           7725
housing            894
loan               894
contact              0
month                0
day_of_week          0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64

In [8]:
# `default` and `poutcome` are dropped: compared with how much of them is
# missing, or how lopsided their values are, they carry too little usable
# signal to be worth training on.
campaign = campaign.drop(columns=["default", "poutcome"])

# After the yes/no recode and the unknown -> NaN step, these two columns are
# cast to float (the dtype listing below confirms float64).
campaign = campaign.astype({"housing": float, "loan": float})
campaign.dtypes

age                 int64
job                object
marital            object
education          object
housing           float64
loan              float64
contact            object
month              object
day_of_week        object
campaign            int64
pdays               int64
previous            int64
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                   int64
dtype: object

In [9]:

# Ordinal-encode education from least (1) to most (7) schooling.
education_rank = {
    "illiterate": 1,
    "basic.4y": 2,
    "basic.6y": 3,
    "basic.9y": 4,
    "high.school": 5,
    "professional.course": 6,
    "university.degree": 7,
}
campaign["education"] = campaign["education"].replace(education_rank)
campaign["education"].value_counts()

# One-hot encode the unordered categoricals. Dummy columns are appended in the
# order listed, each prefixed with its source column name (job_*, marital_*,
# contact_*).
campaign = pd.get_dummies(campaign, columns=["job", "marital", "contact"])


In [10]:
# Replace weekday and month abbreviations with their calendar position.
weekday_numbers = {"mon": 1, "tue": 2, "wed": 3, "thu": 4, "fri": 5}
month_numbers = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
    "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
}
campaign['day_of_week'] = campaign['day_of_week'].replace(weekday_numbers)
campaign['month'] = campaign['month'].replace(month_numbers)

# Print both tallies so the two distributions appear in one cell output.
for encoded_column in ('day_of_week', 'month'):
    print(campaign[encoded_column].value_counts())


4    7742
1    7657
3    7347
2    7287
5    7036
Name: day_of_week, dtype: int64
5     12370
7      6445
8      5555
6      4817
11     3698
4      2369
10      653
9       508
3       496
12      158
Name: month, dtype: int64


In [11]:
# Re-check dtypes after encoding, to see which columns are now numeric.
campaign.dtypes

age                    int64
education            float64
housing              float64
loan                 float64
month                  int64
day_of_week            int64
campaign               int64
pdays                  int64
previous               int64
emp.var.rate         float64
cons.price.idx       float64
cons.conf.idx        float64
euribor3m            float64
nr.employed          float64
y                      int64
job_admin.             uint8
job_blue-collar        uint8
job_entrepreneur       uint8
job_housemaid          uint8
job_management         uint8
job_retired            uint8
job_self-employed      uint8
job_services           uint8
job_student            uint8
job_technician         uint8
job_unemployed         uint8
marital_divorced       uint8
marital_married        uint8
marital_single         uint8
contact_cellular       uint8
contact_telephone      uint8
dtype: object

In [12]:
from sklearn.impute import SimpleImputer
# Fill the gaps left by the unknown -> NaN step.
# Note: despite its name, `imp_mean` imputes with the column *median*.
loan_housing = ['loan', 'housing']
imp_mean = SimpleImputer(strategy='median', missing_values=np.nan)
campaign[loan_housing] = imp_mean.fit_transform(campaign[loan_housing])

# Education gets its most frequent level.
imp_mode = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
campaign[["education"]] = imp_mode.fit_transform(campaign[["education"]])

# Every column should now report zero missing values.
campaign.isna().sum()

age                  0
education            0
housing              0
loan                 0
month                0
day_of_week          0
campaign             0
pdays                0
previous             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
job_admin.           0
job_blue-collar      0
job_entrepreneur     0
job_housemaid        0
job_management       0
job_retired          0
job_self-employed    0
job_services         0
job_student          0
job_technician       0
job_unemployed       0
marital_divorced     0
marital_married      0
marital_single       0
contact_cellular     0
contact_telephone    0
dtype: int64

In [13]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Fixed seed so that the train/test split, the fitted tree and the reported
# score are the same on every Restart & Run All.
RANDOM_STATE = 42

# Columns fed to the model.
# NOTE(review): `previous` is present in the frame but not listed here --
# confirm the omission is intentional.
features = [
    "age",
    "education",
    "housing",
    "loan",
    "month",
    "day_of_week",
    "campaign",
    "pdays",
    "emp.var.rate",
    "cons.price.idx",
    "cons.conf.idx",
    "euribor3m",
    "nr.employed",
    "job_admin.",
    "job_blue-collar",
    "job_entrepreneur",
    "job_housemaid",
    "job_management",
    "job_retired",
    "job_self-employed",
    "job_services",
    "job_student",
    "job_technician",
    "job_unemployed",
    "marital_divorced",
    "marital_married",
    "marital_single",
    "contact_cellular",
    "contact_telephone",
]

# get_dummies only encodes object/category columns. The selected features were
# already made numeric in earlier cells, so this is expected to pass them
# through unchanged; it is kept so X is built exactly as before.
X = pd.get_dummies(campaign[features], drop_first=True)
y = campaign['y']

# Hold out 30% for testing. Only a small share of clients subscribed, so the
# split is stratified on y to keep the same yes/no ratio in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Build the decision tree (seeded: tie-breaking between equally good splits is
# otherwise random).
clf = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Train it
clf.fit(X_train, y_train)

# Test it: mean accuracy on the held-out 30%.
clf.score(X_test, y_test)

0.8429098102688607

In [14]:
import pickle
# Persist the trained tree to disk (the stated goal is a model saved to a
# file), and keep the serialized bytes in `s` for cells that still use them.
MODEL_PATH = 'decision_tree_model.pkl'
s = pickle.dumps(clf)
with open(MODEL_PATH, 'wb') as model_file:
    model_file.write(s)

# Reload from the file just written to prove the saved artefact is usable.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open(MODEL_PATH, 'rb') as model_file:
    clf2 = pickle.load(model_file)

In [21]:
from pathlib import Path

# Score the holdout set with the trained model and write the predictions to
# out.csv. The holdout lives in its own frame so the training frame `campaign`
# is not overwritten.
holdout = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank_holdout_test.csv')
clf2 = pickle.loads(s)

# --- Same cleaning steps that were applied to the training data -------------
holdout = holdout.replace({'no': 0, 'yes': 1})
holdout = holdout.replace("unknown", np.nan)

holdout = holdout.drop(["default", "poutcome"], axis=1)
holdout["housing"] = holdout["housing"].astype(float)
holdout["loan"] = holdout["loan"].astype(float)

holdout["education"] = holdout["education"].replace(["illiterate","basic.4y","basic.6y","basic.9y","high.school","professional.course","university.degree"],[1,2,3,4,5,6,7])
holdout = pd.get_dummies(holdout, prefix=["job"], columns=["job"])
holdout = pd.get_dummies(holdout, prefix=["marital"], columns=["marital"])
holdout = pd.get_dummies(holdout, prefix=["contact"], columns=["contact"])

holdout['day_of_week'] = holdout['day_of_week'].replace(["mon","tue","wed","thu","fri"],[1,2,3,4,5])
holdout['month'] = holdout['month'].replace(["jan","feb","mar","apr", "may","jun", "jul", "aug", "sep", "oct", "nov", "dec"],[1,2,3,4,5,6,7,8,9,10,11,12])

# Impute with the imputers that were fitted on the *training* data
# (`imp_mean`, `imp_mode`), so the holdout rows get exactly the fill values the
# model was trained with. Re-fitting on the holdout would let its own
# distribution change the preprocessing.
holdout[['loan','housing']] = imp_mean.transform(holdout[['loan','housing']])
holdout[["education"]] = imp_mode.transform(holdout[["education"]])

# A category that never occurs in the holdout produces no dummy column, which
# would make `holdout[features]` raise KeyError. Add any such one-hot column
# as all zeros.
for feature_name in features:
    is_dummy = feature_name.startswith(("job_", "marital_", "contact_"))
    if is_dummy and feature_name not in holdout.columns:
        holdout[feature_name] = 0

filepath = Path('out.csv')
filepath.parent.mkdir(exist_ok=True)

# Select columns in the training order before predicting.
predict = clf2.predict(holdout[features])
df = pd.DataFrame(predict, columns=["predictions"])

df.to_csv(filepath, index=False)

Does it make sense to call students at all?

In [16]:
# Does it make sense to call students at all?
import pandas as pd

# Reload the untouched training data so `job` is still a single text column.
BANK_CSV_URL = 'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv'
campaign = pd.read_csv(BANK_CSV_URL)

# Subscribers only, tallied by job. These are raw counts of 'yes' rows, not
# conversion rates per job.
subscribed_mask = campaign['y'].eq('yes')
yes = campaign.loc[subscribed_mask]
yes.job.value_counts()

admin.           1234
technician        659
blue-collar       568
retired           387
management        303
services          293
student           249
self-employed     137
unemployed        134
entrepreneur      113
housemaid          96
unknown            35
Name: job, dtype: int64

In [17]:
import altair as alt

# Bar chart of subscriber counts per job: one bar per `job` category, with bar
# height equal to the number of rows in `yes` (clients who subscribed).
alt.Chart(yes).mark_bar().encode(
    x=alt.X('job:N', axis=alt.Axis(labelFontSize=14, labelAngle=0)),
    y=alt.Y('count()', title='Number of People'),
).properties(
    # Title typo fixed ("Aubscribed" -> "Subscribed") and grammar tidied.
    title="The Job of Clients Who Subscribed to a Term Deposit"
)