In [1]:
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor

import warnings
warnings.filterwarnings("ignore")

# 1.

In [2]:
boston = pd.read_csv("data/Boston.csv")
print(boston.shape)
boston.head()

(506, 14)


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [3]:
# scaling
scaler = MinMaxScaler()
scaler.fit(boston)
boston = scaler.transform(boston)
boston

array([[0.00000000e+00, 1.80000000e-01, 6.78152493e-02, ...,
        1.00000000e+00, 8.96799117e-02, 4.22222222e-01],
       [2.35922539e-04, 0.00000000e+00, 2.42302053e-01, ...,
        1.00000000e+00, 2.04470199e-01, 3.68888889e-01],
       [2.35697744e-04, 0.00000000e+00, 2.42302053e-01, ...,
        9.89737254e-01, 6.34657837e-02, 6.60000000e-01],
       ...,
       [6.11892474e-04, 0.00000000e+00, 4.20454545e-01, ...,
        1.00000000e+00, 1.07891832e-01, 4.20000000e-01],
       [1.16072990e-03, 0.00000000e+00, 4.20454545e-01, ...,
        9.91300620e-01, 1.31070640e-01, 3.77777778e-01],
       [4.61841693e-04, 0.00000000e+00, 4.20454545e-01, ...,
        1.00000000e+00, 1.69701987e-01, 1.53333333e-01]])

In [4]:
X_boston = boston[:, :-1]
y_boston = boston[:, -1]
print(X_boston.shape, y_boston.shape)

(506, 13) (506,)


## (a)

In [5]:
# split
X_boston_train, X_boston_test, y_boston_train, y_boston_test = train_test_split(X_boston, y_boston, random_state=42, test_size=0.3)
print(f"X_train: {X_boston_train.shape}, X_test: {X_boston_test.shape}")
print(f"y_train: {y_boston_train.shape}, y_test: {y_boston_test.shape}\n")

# fit
boston_lr = LinearRegression()
boston_lr.fit(X_boston_train, y_boston_train)

lr_train = boston_lr.predict(X_boston_train)
lr_test = boston_lr.predict(X_boston_test)

lr_train_mse = ((y_boston_train - lr_train)**2).mean()
lr_test_mse = ((y_boston_test - lr_test)**2).mean()

print(f"LR train mse: {round(lr_train_mse, 5)}")
print(f"LR test mse: {round(lr_test_mse, 5)}")

X_train: (354, 13), X_test: (152, 13)
y_train: (354,), y_test: (152,)

LR train mse: 0.01113
LR test mse: 0.01063


## (b)

In [6]:
# get T0
boston_reg_dt = DecisionTreeRegressor()
boston_reg_dt.fit(X_boston_train, y_boston_train)

path = boston_reg_dt.cost_complexity_pruning_path(X_boston_train, y_boston_train)

ccp_alphas, impurities = path.ccp_alphas, path.impurities

# floating point issue
ccp_alphas = np.abs(ccp_alphas)

# get cp
regs = []
for ccp_alpha in ccp_alphas:
    reg_dt = DecisionTreeRegressor(random_state=42, ccp_alpha=ccp_alpha)
    reg_dt.fit(X_boston_train, y_boston_train)
    regs.append(reg_dt)
    

regs = regs[:-1]
ccp_alphas = ccp_alphas[:-1]

train_mse = []
test_mse = []

for reg in regs:
    boston_train_pred = reg.predict(X_boston_train)
    boston_test_pred = reg.predict(X_boston_test)
    train_mse.append(((y_boston_train-boston_train_pred)**2).mean())
    test_mse.append(((y_boston_test-boston_test_pred)**2).mean())
    

# get best alpha (min test mse)
best_alpha = round(ccp_alphas[test_mse.index(min(test_mse))], 5)

best_rt = DecisionTreeRegressor(random_state=42, ccp_alpha=best_alpha)
best_rt.fit(X_boston_train, y_boston_train)

rt_train_pred = best_rt.predict(X_boston_train)
rt_test_pred = best_rt.predict(X_boston_test)

# mse
best_rt_train_mse = ((y_boston_train-rt_train_pred)**2).mean()
best_rt_test_mse = ((y_boston_test-rt_test_pred)**2).mean()

print(f"RT train mse: {round(best_rt_train_mse, 5)}")
print(f"RT test mse: {round(best_rt_test_mse, 5)}")

RT train mse: 0.00195
RT test mse: 0.00421


## (c)

In [7]:
print(f"LR test mse: {round(lr_test_mse, 5)}")
print(f"RT test mse: {round(best_rt_test_mse, 5)}")

LR test mse: 0.01063
RT test mse: 0.00421


# 2.

In [8]:
pima = pd.read_csv("data/pima.csv")
pima.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
pima.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [10]:
X_pima = pima.drop(["Outcome"], axis=1)
y_pima = pima["Outcome"]
print(X_pima.shape, y_pima.shape)

(768, 8) (768,)


## (a)

In [11]:
# TODO: scaling
# split
X_pima_train, X_pima_test, y_pima_train, y_pima_test = train_test_split(X_pima, y_pima, random_state=42, test_size=0.3)
print(f"X_train: {X_pima_train.shape}, X_test: {X_pima_test.shape}")
print(f"y_train: {y_pima_train.shape}, y_test: {y_pima_test.shape}\n")

# fit
pima_lr = LogisticRegression()
pima_lr.fit(X_pima_train, y_pima_train)

lr_train = pima_lr.predict(X_pima_train)
lr_test = pima_lr.predict(X_pima_test)

lr_train_acc = accuracy_score(y_pima_train, lr_train)*100
lr_test_acc = accuracy_score(y_pima_test, lr_test)*100

print(f"LR train acc: {round(lr_train_acc, 2)}")
print(f"LR test acc: {round(lr_test_acc, 2)}")


# get coef
coef = pd.DataFrame({"features": X_pima_train.columns, "Coef": pima_lr.coef_[0]})
coef

X_train: (537, 8), X_test: (231, 8)
y_train: (537,), y_test: (231,)

LR train acc: 78.21
LR test acc: 74.03


Unnamed: 0,features,Coef
0,Pregnancies,0.056544
1,Glucose,0.035585
2,BloodPressure,-0.010878
3,SkinThickness,-0.001791
4,Insulin,-0.000989
5,BMI,0.107612
6,DiabetesPedigreeFunction,0.523939
7,Age,0.035826


## (b)

In [12]:
# LDA
lda = LinearDiscriminantAnalysis()
lda.fit(X_pima_train, y_pima_train)

# QDA
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_pima_train, y_pima_train)

# NB
nb = GaussianNB()
nb.fit(X_pima_train, y_pima_train)


GaussianNB()

## (c)

In [13]:
# LDA
lda_train = lda.predict(X_pima_train)
lda_test = lda.predict(X_pima_test)

lda_train_acc = accuracy_score(y_pima_train, lda_train)*100
lda_test_acc = accuracy_score(y_pima_test, lda_test)*100



# QDA
qda_train = qda.predict(X_pima_train)
qda_test = qda.predict(X_pima_test)

qda_train_acc = accuracy_score(y_pima_train, qda_train)*100
qda_test_acc = accuracy_score(y_pima_test, qda_test)*100



# LDA
nb_train = nb.predict(X_pima_train)
nb_test = nb.predict(X_pima_test)

nb_train_acc = accuracy_score(y_pima_train, nb_train)*100
nb_test_acc = accuracy_score(y_pima_test, nb_test)*100


print(f"LDA train acc: {round(lda_train_acc, 2)}")
print(f"QDA train acc: {round(qda_train_acc, 2)}")
print(f"NB train acc: {round(nb_train_acc, 2)}\n")


print(f"LDA test acc: {round(lda_test_acc, 2)}")
print(f"QDA test acc: {round(qda_test_acc, 2)}")
print(f"NB test acc: {round(nb_test_acc, 2)}\n")


print("The best classification method in terms of accuracy for Pima dataset is LDA.\nBecause LDA has the smallest test error.")

LDA train acc: 77.84
QDA train acc: 76.54
NB train acc: 76.72

LDA test acc: 73.16
QDA test acc: 76.62
NB test acc: 74.46

The best classification method in terms of accuracy for Pima dataset is LDA.
Because LDA has the smallest test error.
