# Programming tasks: part 9
Required datasets: `bank-full-encoded.csv`, `pima.csv`, `SAheart2.csv`.

In [35]:
import numpy as np
import matplotlib.pyplot as plt
import pandas
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

## Task 1

### a) 
Load `bank-full-encoded.csv` and split the dataset in training (70%) and test (30%) sets.

In [2]:
dataframe = pandas.read_csv('bank-full-encoded.csv', header=0)
dataframe.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,40,4,1,2,0,3036,1,0,2,4,8,261,0,0,0,3,0
1,26,9,2,1,0,945,1,0,2,4,8,151,0,0,0,3,0
2,15,2,1,1,0,918,1,1,2,4,8,76,0,0,0,3,0
3,29,1,1,3,0,2420,1,0,2,4,8,92,0,0,0,3,0
4,15,11,2,3,0,917,0,0,2,4,8,198,0,0,0,3,0


In [3]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   age        45211 non-null  int64
 1   job        45211 non-null  int64
 2   marital    45211 non-null  int64
 3   education  45211 non-null  int64
 4   default    45211 non-null  int64
 5   balance    45211 non-null  int64
 6   housing    45211 non-null  int64
 7   loan       45211 non-null  int64
 8   contact    45211 non-null  int64
 9   day        45211 non-null  int64
 10  month      45211 non-null  int64
 11  duration   45211 non-null  int64
 12  campaign   45211 non-null  int64
 13  pdays      45211 non-null  int64
 14  previous   45211 non-null  int64
 15  poutcome   45211 non-null  int64
 16  y          45211 non-null  int64
dtypes: int64(17)
memory usage: 5.9 MB


In [4]:
data = dataframe.values
X, y = data[:,:-1], data[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.7)

### b) 
Fit a naive Bayes model to the training set and calculate the AUC score.




In [5]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
pred = gnb.predict_proba(X_test)[:,1]
print(100*round(roc_auc_score(y_test, pred), 4))

81.74


### c) 
Discretize the training set. Use the bin edges from the training set to discretize the test set. Fit the categorical Naive Bayes model to the discretized training set. Calculate the AUC score.

In [6]:
disc = KBinsDiscretizer(n_bins = 10, strategy = 'uniform', encode = 'ordinal')
colt = ColumnTransformer([('disc', disc, [0]+[5]+[9]+list(range(11,15)))], remainder='passthrough')  # transform columns 0,5,9 and 11-14
cnb = CategoricalNB()
pipe = Pipeline([('colt', colt), ('cnb', cnb)])
pipe.fit(X_train, y_train)  # fit_transform the ColumnTransformer, then fit the CategoricalNB
cnb_probs = pipe.predict_proba(X_test)[:,1]    # fit the ColumnTransformer, predict_proba on the CategoricalNB
print(round(100*roc_auc_score(y_test, cnb_probs), 2))

87.46


In [7]:
pipe['cnb'].classes_

array([0, 1])

In [8]:
disc2 = pipe['colt'].transformers_
disc2

[('disc',
  KBinsDiscretizer(encode='ordinal', n_bins=10, strategy='uniform'),
  [0, 5, 9, 11, 12, 13, 14]),
 ('remainder', 'passthrough', [1, 2, 3, 4, 6, 7, 8, 10, 15])]

## Task 2



### a) 
Load `pima.csv` and split the dataset in training (70%) and test (30%) sets.

In [9]:
from pandas.io.parsers.readers import read_csv
dataframe = pandas.read_csv('pima.csv', header=0)
dataframe.head()

Unnamed: 0,times,glucose,blood,skin,insulin,BMI,pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [11]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   times     768 non-null    int64  
 1   glucose   768 non-null    int64  
 2   blood     768 non-null    int64  
 3   skin      768 non-null    int64  
 4   insulin   768 non-null    int64  
 5   BMI       768 non-null    float64
 6   pedigree  768 non-null    float64
 7   age       768 non-null    int64  
 8   class     768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [12]:
data = dataframe.values
X,y = data[:,:-1], data[:,-1]

X_train,X_test,y_train,y_test = train_test_split(X,y,train_size = 0.7,test_size = 0.3)

Recall task 2 from part 8. The `pima.csv` contains missing values marked as 0 in the `blood` column (which contains patients' blood pressure). Before continuing we will create copies of training and test sets and change 0 values to numpy NaN.

In [20]:
X_train2 = X_train.copy()
X_test2 = X_test.copy()

X_train2[:,2][X_train2[:,2]==0] = np.nan
X_test2[:,2][X_test2[:,2]==0] = np.nan

### b)
Consider two methods of replacing 0 values in the `blood` column: using the mean of other observations vs replacing them with the value 50. Transform the original dataset and train a decision tree with max number of observations in a leaf equal to [4,5,...,10] and a max depth of [6,...,12]. Using a pipeline find the best imputing method and tree parameters.

In [21]:
imputer = SimpleImputer()
tree = DecisionTreeClassifier()
pipe = Pipeline(steps = [('imputer', imputer), ('tree', tree)])

param_grid = {'imputer__strategy':['mean','constant'],
              'imputer__fill_value':[50],
              'tree__min_samples_leaf':[4,5,6,7,8,9,10],
              'tree__max_depth':[6,7,8,9,10,11,12]}

In [22]:
search = GridSearchCV(pipe,param_grid,n_jobs=-1)
search.fit(X_train2,y_train)

GridSearchCV(estimator=Pipeline(steps=[('imputer', SimpleImputer()),
                                       ('tree', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'imputer__fill_value': [50],
                         'imputer__strategy': ['mean', 'constant'],
                         'tree__max_depth': [6, 7, 8, 9, 10, 11, 12],
                         'tree__min_samples_leaf': [4, 5, 6, 7, 8, 9, 10]})

In [23]:
search.best_params_

{'imputer__fill_value': 50,
 'imputer__strategy': 'mean',
 'tree__max_depth': 12,
 'tree__min_samples_leaf': 10}

### c) 

Using the optimal parameters found in the previous step, replace the missing values in the training set and fit a decision tree model to it.


In [26]:
search.fit(X_train2,y_train)
print(search.score(X_test2, y_test))

0.7662337662337663


For comparison we will train a decision tree on the original dataset and using default parameters.

In [28]:
tree = DecisionTreeClassifier()
tree.fit(X_train,y_train)
print(tree.score(X_test, y_test))

0.6926406926406926


We can see that by using a pipline we got considerably better results. 

## Task 3

### a)

Load `SAheart2.csv` and split the dataset into training (70%) and test sets.

In [29]:
dataframe = pandas.read_csv('SAheart2.csv')
dataframe.head()

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
0,160,12.0,5.73,23.11,Present,49,25.3,97.2,52,1
1,144,0.01,4.41,28.61,Absent,55,28.87,2.06,63,1
2,118,0.08,3.48,32.28,,52,29.14,3.81,46,0
3,170,7.5,6.41,38.03,Present,51,31.99,24.26,58,1
4,134,,3.5,27.78,Present,60,25.99,57.34,49,1


In [30]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sbp        462 non-null    int64  
 1   tobacco    452 non-null    float64
 2   ldl        462 non-null    float64
 3   adiposity  462 non-null    float64
 4   famhist    452 non-null    object 
 5   typea      462 non-null    int64  
 6   obesity    462 non-null    float64
 7   alcohol    462 non-null    float64
 8   age        462 non-null    int64  
 9   chd        462 non-null    int64  
dtypes: float64(5), int64(4), object(1)
memory usage: 36.2+ KB


We can instantly spot some missing values, particularly in `tobacco` and `famhist` columns.

In [49]:
data = dataframe.values
X, y = data[:,:-1], data[:,-1]
y = y.astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

### b)

Create a pipeline, which, for discrete variables, will replace missing values with the most frequent class and perform one-hot encoding. For continous variables replace missing values using mean. Use logistic regression.

In [50]:
imputer_con = SimpleImputer()     # initialize imputer for continous variables, this method uses mean as a default strategy
imputer_disc = SimpleImputer(strategy='most_frequent')  # initialize imputer for discrete varaibles

encoder_disc = OneHotEncoder(sparse=False, drop='first')

pipe_disc = Pipeline(steps=[('imputer', imputer_disc), ('encoder', encoder_disc)])

colt = ColumnTransformer([('continous',imputer_con,list(range(0,4))+list(range(5,9))),
                          ('discrete',pipe_disc,[4])])

model = LogisticRegression(penalty = 'none', max_iter=10000)

pipe = Pipeline(steps=[('prepr', colt), ('model', model)])

### c)

Using the pipline created in the previous step transform the data and fit a logistic model to it. Calculate accuracy and AUC score.

In [51]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('prepr',
                 ColumnTransformer(transformers=[('continous', SimpleImputer(),
                                                  [0, 1, 2, 3, 5, 6, 7, 8]),
                                                 ('discrete',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(drop='first',
                                                                                 sparse=False))]),
                                                  [4])])),
                ('model', LogisticRegression(max_iter=10000, penalty='none'))])

In [54]:
print('Model accuracy:', pipe.score(X_test, y_test))

pred = pipe.predict_proba(X_test)
print('AUC score:', roc_auc_score(y_test, pred[:,1]))       

Model accuracy: 0.7194244604316546
AUC score: 0.7341573033707866
