# Import

In [60]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import Pipeline

# Quiz 1, Iris data frame

In [2]:
iris = load_iris()

In [3]:
# Data load
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


# Quiz2, boxplot with plotly express

In [4]:
# Transform the data format
iris_melted = iris_df.melt(var_name='Features', value_name='Values')

# make boxplot
boxplot = px.box(iris_melted, x='Features', y='Values')
boxplot.show()

# Quiz3, Standard scaler

In [5]:
# Create scaler instance
ss = StandardScaler()

# Data study and transfrom
iris_scaled = ss.fit_transform(iris_df)

# Scaled data to DataFrame
iris_scaled_df = pd.DataFrame(iris_scaled, columns=iris.feature_names)

iris_scaled_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444


# Quiz4, boxplot of scaled data

In [6]:
# Transform the data format
iris_ss_melted = iris_scaled_df.melt(var_name='Features', value_name='Values')

# make boxplot
boxplot = px.box(iris_ss_melted, x='Features', y='Values')
boxplot.show()

# Quiz5, split data

In [7]:
X = iris_scaled_df
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

print("Train data size:", X_train.shape)
print("Test data size:", X_test.shape)

Train data size: (120, 4)
Test data size: (30, 4)


# Quiz6, Training and evaluating accuracy

In [8]:
# Create model instance
rf = RandomForestClassifier(random_state=13)
dt = DecisionTreeClassifier(random_state=13)
lr = LogisticRegression(random_state=13)
knn = KNeighborsClassifier(n_neighbors=3)

In [9]:
# Train RandomForest
rf.fit(X_train, y_train)

In [10]:
# Train DecisionTree
dt.fit(X_train, y_train)

In [11]:
# Train LogisticRegression
lr.fit(X_train, y_train)

In [12]:
# Train kNN
knn.fit(X_train, y_train)

In [13]:
# Predict test data about each model
y_pred_rf = rf.predict(X_test)
y_pred_dt = dt.predict(X_test)
y_pred_lr = lr.predict(X_test)
y_pred_knn = knn.predict(X_test)

# Evaluate accuracy of each model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

print(f"Random Forest Accuracy: {accuracy_rf}")
print(f"Decision Tree Accuracy: {accuracy_dt}")
print(f"Logistic Regression Accuracy: {accuracy_lr}")
print(f"KNN Accuracy: {accuracy_knn}")

Random Forest Accuracy: 0.9666666666666667
Decision Tree Accuracy: 0.9333333333333333
Logistic Regression Accuracy: 0.9666666666666667
KNN Accuracy: 0.9666666666666667


# Quiz7, Sorting accuracy datas to the data frame

In [14]:
# Predict and Evaluate accuracy about train data of each model
train_acc_rf = accuracy_score(y_train, rf.predict(X_train))
train_acc_dt = accuracy_score(y_train, dt.predict(X_train))
train_acc_lr = accuracy_score(y_train, lr.predict(X_train))
train_acc_knn = accuracy_score(y_train, knn.predict(X_train))

# Predict and Evaluate accuracy about test data of each model
test_acc_rf = accuracy_score(y_test, rf.predict(X_test))
test_acc_dt = accuracy_score(y_test, dt.predict(X_test))
test_acc_lr = accuracy_score(y_test, lr.predict(X_test))
test_acc_knn = accuracy_score(y_test, knn.predict(X_test))


In [15]:
# Sorting accuracy datas to the dataframe
result = pd.DataFrame({
    'Model': ['Random Forest', 'Decision Tree', 'Logistic Regression', 'KNN'],
    'Train Accuracy': [train_acc_rf, train_acc_dt, train_acc_lr, train_acc_knn],
    'Test Accuracy': [test_acc_rf, test_acc_dt, test_acc_lr, test_acc_knn]
})

result

Unnamed: 0,Model,Train Accuracy,Test Accuracy
0,Random Forest,1.0,0.966667
1,Decision Tree,1.0,0.933333
2,Logistic Regression,0.975,0.966667
3,KNN,0.95,0.966667


# Quiz8, kFolding and cross validation

In [16]:
# 5겹 k-Fold 설정
kf = KFold(n_splits=5, shuffle=True, random_state=13)

# Calculate cross valudation score
scores_rf = cross_val_score(rf, X, y, cv=kf)
scores_dt = cross_val_score(dt, X, y, cv=kf)
scores_lr = cross_val_score(lr, X, y, cv=kf)
scores_knn = cross_val_score(knn, X, y, cv=kf)

# mean and std
result['CV Mean'] = [scores_rf.mean(), scores_dt.mean(), scores_lr.mean(), scores_knn.mean()]
result['CV Std'] = [scores_rf.std(), scores_dt.std(), scores_lr.std(), scores_knn.std()]

result

Unnamed: 0,Model,Train Accuracy,Test Accuracy,CV Mean,CV Std
0,Random Forest,1.0,0.966667,0.953333,0.033993
1,Decision Tree,1.0,0.933333,0.96,0.024944
2,Logistic Regression,0.975,0.966667,0.953333,0.033993
3,KNN,0.95,0.966667,0.946667,0.033993


# Quiz9, boxplot of CV score

In [17]:
# Create boxplot datas
data = [
    go.Box(y=scores_rf, name="Random Forest"),
    go.Box(y=scores_dt, name="Decision Tree"),
    go.Box(y=scores_lr, name="Logistic Regression"),
    go.Box(y=scores_knn, name="KNN")
]

# Set plot layouts
layout = go.Layout(
    title="Model Cross Validation Scores",
    yaxis_title="Accuracy",
    xaxis_title="Model",
    showlegend=False
)

# ploting 
fig = go.Figure(data=data, layout=layout)
fig.show()

# Quiz10, CV score of each fold

In [18]:
models = {
    "Random Forest": RandomForestClassifier(random_state=13),
    "Decision Tree": DecisionTreeClassifier(random_state=13),
    "Logistic Regression": LogisticRegression(random_state=13),
    "KNN": KNeighborsClassifier(n_neighbors=3)
}

# Initialize
foldCV = []

# repeat evaluating each fold
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # repeat evaluating each model
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        
        # save result
        foldCV.append({
            "Fold": fold + 1,
            "Model": name,
            "Accuracy": score
        })

# make data frame
foldCV_df = pd.DataFrame(foldCV)

foldCV_df


Unnamed: 0,Fold,Model,Accuracy
0,1,Random Forest,0.966667
1,1,Decision Tree,0.933333
2,1,Logistic Regression,0.966667
3,1,KNN,0.966667
4,2,Random Forest,0.966667
5,2,Decision Tree,0.966667
6,2,Logistic Regression,0.933333
7,2,KNN,0.933333
8,3,Random Forest,0.9
9,3,Decision Tree,0.933333


# Quiz11, Pipeline(standard scaler -> decision tree) and cross validation

In [19]:
iris = load_iris()
X = iris.data
y = iris.target

# Set pipeline
estimators = [('scaler', StandardScaler()),
               ('clf', DecisionTreeClassifier(max_depth=2, random_state=13))]

pipe = Pipeline(estimators)

# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, stratify=y)

# 5-fold cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5)  

In [20]:
print("Cross-validation scores:", scores)
print("Average cross-validation score: {:.2f}".format(np.mean(scores)))

Cross-validation scores: [0.875      0.91666667 0.95833333 0.95833333 1.        ]
Average cross-validation score: 0.94


# Quiz12, Four pipelines

## Error case

In [32]:
# Set pipeline
estimators = [('scaler', StandardScaler()),
               ('clf1', DecisionTreeClassifier(max_depth=2, random_state=13)),
               ('clf2', RandomForestClassifier(random_state=13))]

pipe = Pipeline(estimators)
pipe

In [33]:
# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, stratify=y)

# 5-fold cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5)

print("Cross-validation scores:", scores)
print("Average cross-validation score: {:.2f}".format(np.mean(scores)))

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/j-hyungjun/venv/MLDL_venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/j-hyungjun/venv/MLDL_venv/lib/python3.10/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/j-hyungjun/venv/MLDL_venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
  File "/home/j-hyungjun/venv/MLDL_venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 388, in _fit
    self._validate_steps()
  File "/home/j-hyungjun/venv/MLDL_venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 258, in _validate_steps
    raise TypeError(
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'DecisionTreeClassifier(max_depth=2, random_state=13)' (type <class 'sklearn.tree._classes.DecisionTreeClassifier'>) doesn't


## use ensemble, votingclassifier

In [49]:
voting_clf = VotingClassifier(
    estimators=[('dt', DecisionTreeClassifier(max_depth=2, random_state=13)),
                ('rf', RandomForestClassifier(random_state=13))],
    voting='soft')

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', voting_clf)
])

pipe

In [50]:
# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, stratify=y)

# 5-fold cross-validation
scores_DTRF = cross_val_score(pipe, X_train, y_train, cv=5)

print("Cross-validation scores:", scores_DTRF)
print("Average cross-validation score: {:.2f}".format(np.mean(scores_DTRF)))

Cross-validation scores: [0.875      0.91666667 0.95833333 0.95833333 1.        ]
Average cross-validation score: 0.94


In [51]:
voting_clf = VotingClassifier(
    estimators=[('dt1', DecisionTreeClassifier(max_depth=2, random_state=13)),
                ('dt2', DecisionTreeClassifier(random_state=13))],
    voting='soft')

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', voting_clf)
])

pipe

In [52]:
# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, stratify=y)

# 5-fold cross-validation
scores_DTDT = cross_val_score(pipe, X_train, y_train, cv=5)

print("Cross-validation scores:", scores_DTDT)
print("Average cross-validation score: {:.2f}".format(np.mean(scores_DTDT)))

Cross-validation scores: [0.875      0.91666667 0.875      0.91666667 1.        ]
Average cross-validation score: 0.92


In [53]:
voting_clf = VotingClassifier(
    estimators=[('dt', DecisionTreeClassifier(max_depth=2, random_state=13)),
                ('lr', LogisticRegression(random_state=13))],
    voting='soft')

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', voting_clf)
])

pipe

In [54]:
# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, stratify=y)

# 5-fold cross-validation
scores_DTLR = cross_val_score(pipe, X_train, y_train, cv=5)

print("Cross-validation scores:", scores_DTLR)
print("Average cross-validation score: {:.2f}".format(np.mean(scores_DTLR)))

Cross-validation scores: [0.875      0.91666667 0.95833333 0.95833333 1.        ]
Average cross-validation score: 0.94


In [55]:
voting_clf = VotingClassifier(
    estimators=[('dt', DecisionTreeClassifier(max_depth=2, random_state=13)),
                ('kNN', KNeighborsClassifier(n_neighbors=3))],
    voting='soft')

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', voting_clf)
])

pipe

In [56]:
# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, stratify=y)

# 5-fold cross-validation
scores_DTkNN = cross_val_score(pipe, X_train, y_train, cv=5)

print("Cross-validation scores:", scores_DTkNN)
print("Average cross-validation score: {:.2f}".format(np.mean(scores_DTkNN)))

Cross-validation scores: [0.91666667 0.91666667 1.         0.91666667 1.        ]
Average cross-validation score: 0.95


In [57]:
print("DT-RF CV scores:", scores_DTRF)
print("Average DT-RF CV scores: {:.2f}".format(np.mean(scores_DTRF)))
print("DT-DT CV scores:", scores_DTDT)
print("Average DT-DT CV scores: {:.2f}".format(np.mean(scores_DTDT)))
print("DT-LR CV scores:", scores_DTLR)
print("Average DT-LR CV scores: {:.2f}".format(np.mean(scores_DTLR)))
print("DT-kNN CV scores:", scores_DTkNN)
print("Average DT-kNN CV scores: {:.2f}".format(np.mean(scores_DTkNN)))

DT-RF CV scores: [0.875      0.91666667 0.95833333 0.95833333 1.        ]
Average DT-RF CV scores: 0.94
DT-DT CV scores: [0.875      0.91666667 0.875      0.91666667 1.        ]
Average DT-DT CV scores: 0.92
DT-LR CV scores: [0.875      0.91666667 0.95833333 0.95833333 1.        ]
Average DT-LR CV scores: 0.94
DT-kNN CV scores: [0.91666667 0.91666667 1.         0.91666667 1.        ]
Average DT-kNN CV scores: 0.95


# Quiz13, GridSearchCV

In [58]:
# load data
iris = load_iris()
X = iris.data
y = iris.target

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

In [59]:
# Set pipelines
pipeline_dt = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', DecisionTreeClassifier())
])

pipeline_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier())
])

pipeline_knn = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', KNeighborsClassifier())
])

In [61]:
# Set variable grid for gridSearchCV
param_grid_dt = {
    'clf__max_depth': [3, 5, 7, 9, None],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

param_grid_rf = {
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [3, 5, 7, None],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

param_grid_knn = {
    'clf__n_neighbors': [3, 5, 7, 9],
    'clf__weights': ['uniform', 'distance'],
    'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

In [62]:
# GridSearchCV object
grid_search_dt = GridSearchCV(pipeline_dt, param_grid_dt, cv=5, n_jobs=-1)
grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, n_jobs=-1)
grid_search_knn = GridSearchCV(pipeline_knn, param_grid_knn, cv=5, n_jobs=-1)

In [63]:
#train
grid_search_dt.fit(X_train, y_train)
grid_search_rf.fit(X_train, y_train)
grid_search_knn.fit(X_train, y_train)

In [66]:
# Print the best parameters for each model
print("Decision Tree Best Parameters:", grid_search_dt.best_params_)
print("Random Forest Best Parameters:", grid_search_rf.best_params_)
print("kNN Best Parameters:", grid_search_knn.best_params_)

# Evaluate the models on test data
print("Decision Tree Test Accuracy:", grid_search_dt.score(X_test, y_test))
print("Random Forest Test Accuracy:", grid_search_rf.score(X_test, y_test))
print("kNN Test Accuracy:", grid_search_knn.score(X_test, y_test))

Decision Tree Best Parameters: {'clf__max_depth': 7, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2}
Random Forest Best Parameters: {'clf__max_depth': 3, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 10, 'clf__n_estimators': 50}
kNN Best Parameters: {'clf__algorithm': 'auto', 'clf__n_neighbors': 7, 'clf__weights': 'uniform'}
Decision Tree Test Accuracy: 0.9333333333333333
Random Forest Test Accuracy: 0.9666666666666667
kNN Test Accuracy: 0.9666666666666667
