In [6]:
import pandas as pd
import numpy as np
import gzip
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

#### Function to display the Image 

#### Loading the Data

In [7]:
from sklearn.model_selection import train_test_split
# 구글드라이브 연결
from google.colab import drive
drive.mount('/gdrive')
# 데이터파일 불러오기
df = pd.read_csv('/gdrive/My Drive/3-2_machine_learning/image-10k.csv', header=None)

X = df.iloc[:, 1:].values     # 데이터샘플
y = df.iloc[:, 0].values      # 타겟레이블


x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1, stratify=y)

images = ['T-shirt', 'Trouser', 'Sweater', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Boot']

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


### Preparing the Data 

In [8]:
scaler=StandardScaler()
X_train_scaled=scaler.fit_transform(x_train.astype(np.float64))

## Training with Stochastic Gradient Descent 

In [9]:
sgd_clf=SGDClassifier(random_state=42)
sgd_clf.fit(X_train_scaled,y_train)

SGDClassifier(random_state=42)

In [11]:
y_train_predict=sgd_clf.predict(x_train[0].reshape(1,-1))

In [12]:
y_train[0]

0

In [16]:
print(y_train_predict[0])
#showImage(x_train[0])

2


In [17]:
y_train_predict=sgd_clf.predict(X_train_scaled)

In [None]:
sgd_accuracy=accuracy_score(y_train,y_train_predict)
sgd_precision=precision_score(y_train,y_train_predict,average='weighted')
sgd_recall=recall_score(y_train,y_train_predict,average='weighted')
sgd_f1_score=f1_score(y_train,y_train_predict,average='weighted')

In [None]:
print(sgd_accuracy,sgd_precision,sgd_recall,sgd_f1_score)

0.9048 0.9058837567126912 0.9048 0.904266314745271


## Training with Softmax (Logistic) Regression

In [None]:
log_clf=LogisticRegression(multi_class="multinomial",solver='lbfgs',C=10,random_state=42)
log_clf.fit(X_train_scaled,y_train)

In [None]:
y_train_predict=log_clf.predict(X_train_scaled)

In [None]:
log_accuracy=accuracy_score(y_train,y_train_predict)
log_precision=precision_score(y_train,y_train_predict,average='weighted')
log_recall=recall_score(y_train,y_train_predict,average='weighted')
log_f1_score=f1_score(y_train,y_train_predict,average='weighted')

In [None]:
print(log_accuracy,log_precision,log_recall,log_f1_score)

## Training with Decision Trees 

In [None]:
dec_tree_clf=DecisionTreeClassifier(max_depth=50,random_state=42)
dec_tree_clf.fit(X_train,y_train)

In [None]:
y_train_predict=dec_tree_clf.predict(X_train)

In [None]:
dec_tree_accuracy=accuracy_score(y_train,y_train_predict)
dec_tree_precision=precision_score(y_train,y_train_predict,average='weighted')
dec_tree_recall=recall_score(y_train,y_train_predict,average='weighted')
dec_tree_f1_score=f1_score(y_train,y_train_predict,average='weighted')

In [None]:
print(dec_tree_accuracy,dec_tree_precision,dec_tree_recall,dec_tree_f1_score)

## Training with Random Forest

In [None]:
rnd_clf=RandomForestClassifier(n_estimators=100,max_depth=50,random_state=42)
rnd_clf.fit(X_train,y_train)

In [None]:
y_train_predict=rnd_clf.predict(X_train)

In [None]:
rnd_accuracy=accuracy_score(y_train,y_train_predict)
rnd_precision=precision_score(y_train,y_train_predict,average='weighted')
rnd_recall=recall_score(y_train,y_train_predict,average='weighted')
rnd_f1_score=f1_score(y_train,y_train_predict,average='weighted')

In [None]:
print(rnd_accuracy,rnd_precision,rnd_recall,rnd_f1_score)

## Training with Voting Classifier 

In [None]:
log_clf_ens=LogisticRegression(multi_class="multinomial",solver='lbfgs',C=10,random_state=42)
rnd_clf_ens=RandomForestClassifier(n_estimators=100,max_depth=50,random_state=42)

In [None]:
voting_clf=VotingClassifier(
    estimators=[('lr',log_clf_ens),('rf',rnd_clf_ens)],
    voting='soft')

In [None]:
voting_clf.fit(X_train_scaled,y_train)

In [None]:
y_train_predict=voting_clf.predict(X_train_scaled)

In [None]:
voting_accuracy=accuracy_score(y_train,y_train_predict)
voting_precision=precision_score(y_train,y_train_predict,average='weighted')
voting_recall=recall_score(y_train,y_train_predict,average='weighted')
voting_f1_score=f1_score(y_train,y_train_predict,average='weighted')

In [None]:
print(voting_accuracy,voting_precision,voting_recall,voting_f1_score)

## Cross Validation with SGD 

In [None]:
def display_scores(scores):
    print(scores)
    print(scores.mean())
    print(scores.std())
    
sgd_clf = SGDClassifier(random_state=42) 

sgd_cv_scores = cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy") 
display_scores(sgd_cv_scores)
sgd_cv_accuracy = sgd_cv_scores.mean()

y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)

In [None]:
confusion_matrix(y_train, y_train_pred)
sgd_cv_precision = precision_score(y_train, y_train_pred, average='weighted')
sgd_cv_recall = recall_score(y_train, y_train_pred, average='weighted')
sgd_cv_f1_score = f1_score(y_train, y_train_pred, average='weighted')

print("SGD CV Accuracy: ", sgd_cv_accuracy)
print("SGD CV Precision: ", sgd_cv_precision)
print("SGD CV Recall: ", sgd_cv_recall)
print("SGD CV F1 Score: ", sgd_cv_f1_score)

## Cross Validation with Softmax Regression

In [None]:
log_clf=LogisticRegression(multi_class="multinomial",solver="lbfgs",C=10,random_state=42)

In [None]:
log_cv_scores=cross_val_score(log_clf,X_train_scaled,y_train,cv=3,scoring="accuracy")
display_scores(log_cv_scores)

In [None]:
log_cv_accuracy=log_cv_scores.mean()

In [None]:
y_train_pred=cross_val_predict(log_clf,X_train_scaled,y_train,cv=3)

In [None]:
confusion_matrix(y_train,y_train_pred)

In [None]:
log_cv_precision = precision_score(y_train, y_train_pred, average='weighted')
log_cv_recall = recall_score(y_train, y_train_pred, average='weighted')
log_cv_f1_score = f1_score(y_train, y_train_pred, average='weighted')

print("Log CV Accuracy: ", log_cv_accuracy)
print("Log CV Precision: ", log_cv_precision)
print("Log CV Recall: ", log_cv_recall)
print("Log CV F1 Score: ", log_cv_f1_score)

## Cross Validation with Decision Trees 

In [None]:
dec_tree_clf=DecisionTreeClassifier(max_depth=50,random_state=42)

In [None]:
dec_tree_cv_scores=cross_val_score(dec_tree_clf,X_train_scaled,y_train,cv=3,scoring="accuracy")
display_scores(dec_tree_cv_scores)

In [None]:
dec_tree_cv_accuracy=dec_tree_cv_scores.mean()

In [None]:
y_train_pred=cross_val_predict(dec_tree_clf,X_train_scaled,y_train,cv=3)

In [None]:
confusion_matrix(y_train,y_train_pred)

In [None]:
dec_tree_cv_precision = precision_score(y_train, y_train_pred, average='weighted')
dec_tree_cv_recall = recall_score(y_train, y_train_pred, average='weighted')
dec_tree_cv_f1_score = f1_score(y_train, y_train_pred, average='weighted')

print(dec_tree_cv_accuracy)
print(dec_tree_cv_precision)
print(dec_tree_cv_recall)
print(dec_tree_cv_f1_score)

## Cross Validation with Random Forest 

In [None]:
rnd_clf=RandomForestClassifier(n_estimators=20,max_depth=10,random_state=42)

In [None]:
rnd_cv_scores=cross_val_score(rnd_clf,X_train_scaled,y_train,cv=3,scoring="accuracy")
display_scores(rnd_cv_scores)

In [None]:
rnd_cv_accuracy=rnd_cv_scores.mean()

In [None]:
y_train_pred=cross_val_predict(rnd_clf,X_train_scaled,y_train,cv=3)

In [None]:
confusion_matrix(y_train,y_train_pred)

In [None]:
rnd_cv_precision = precision_score(y_train, y_train_pred, average='weighted')
rnd_cv_recall = recall_score(y_train, y_train_pred, average='weighted')
rnd_cv_f1_score = f1_score(y_train, y_train_pred, average='weighted')

print(rnd_cv_accuracy)
print(rnd_cv_precision)
print(rnd_cv_recall)
print(rnd_cv_f1_score)

## Cross Validation with Voting Classifier 

In [None]:
voting_clf = VotingClassifier(
    estimators=[('lr',log_clf_ens),('rf',rnd_clf_ens)],
    voting='soft')

In [None]:
voting_cv_scores=cross_val_score(voting_clf,X_train_scaled,y_train,cv=3,scoring="accuracy")
display_scores(voting_cv_scores)

In [None]:
voting_cv_accuracy=voting_cv_scores.mean()

In [None]:
y_train_pred=cross_val_predict(voting_clf,X_train_scaled,y_train,cv=3)

In [None]:
confusion_matrix(y_train,y_train_pred)
voting_cv_precision = precision_score(y_train, y_train_pred, average='weighted')
voting_cv_recall = recall_score(y_train, y_train_pred, average='weighted')
voting_cv_f1_score = f1_score(y_train, y_train_pred, average='weighted')

print(voting_cv_accuracy)
print(voting_cv_precision)
print(voting_cv_recall)
print(voting_cv_f1_score)

In [None]:
 #As we see, **Voting classifier** gives us the best results. So we choose Voting Classifier as our final model.

#### Dimensionality Reduction using PCA

In [None]:
pca=PCA(n_components=0.99)

In [None]:
X_train_reduced=pca.fit_transform(X_train)

In [None]:
pca.n_components_

In [None]:
np.sum(pca.explained_variance_ratio_)


In [None]:
X_train_recovered=pca.inverse_transform(X_train_reduced)

##### Checking whether we lost any significant information by PCA

In [None]:
def plot_digits(instances, images_per_row=5, **options):
    size = 28
    images_per_row = min(len(instances), images_per_row)
    images = [instance.reshape(size,size) for instance in instances]
    n_rows = (len(instances) - 1) // images_per_row + 1
    row_images = []
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))
    for row in range(n_rows):
        rimages = images[row * images_per_row : (row + 1) * images_per_row]
        row_images.append(np.concatenate(rimages, axis=1))
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap = matplotlib.cm.binary, **options)
    plt.axis("off")

plt.figure(figsize=(7, 4))
plt.subplot(121)
# Plotting 'original' image
plot_digits(X_train[::2100])
plt.title("Original", fontsize=16)
plt.subplot(122)
# Plotting the corresponding 'recovered' image
plot_digits(X_train_recovered[::2100])
plt.title("Compressed", fontsize=16)
plt.show()

#### Tuning Hyperparameters

In [None]:
param_grid = [
    {
        "lr__multi_class":["multinomial"],
        "lr__solver":["lbfgs"],
        "lr__C":[5],
        "rf__n_estimators":[20],
        "rf__max_depth":[10,15],
    }]

In [None]:
voting_clf_grid_search = VotingClassifier(
    estimators=[('lr',log_clf_ens),('rf',rnd_clf_ens)],
    voting='soft')

In [None]:
grid_search=GridSearchCV(voting_clf_grid_search,param_grid,cv=3,scoring='neg_mean_squared_error')

In [None]:
grid_search.fit(X_train_reduced,y_train)

In [None]:
grid_search.best_params_

In [None]:
final_model=grid_search.best_estimator_

In [None]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
X_test_reduced=pca.transform(X_test)

In [None]:
y_test_predict=final_model.predict(X_test_reduced)

In [None]:
confusion_matrix(y_test,y_test_predict)

In [None]:
final_accuracy=accuracy_score(y_test,y_test_predict)
final_precision = precision_score(y_test, y_test_predict, average='weighted')
final_recall = recall_score(y_test, y_test_predict, average='weighted')
final_f1_score = f1_score(y_test, y_test_predict, average='weighted')

print(final_accuracy)
print(final_precision)
print(final_recall)
print(final_f1_score)