## Ahmad Hojatimalekshah

### Collaborators: Arash Modaresi, Amir Kazemzadeh, Ali Nazari

# CS534 Homework 3

### Loading the libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets, svm
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split , StratifiedKFold,KFold
from sklearn.svm import SVC

### Loading iris data

In [2]:
# Fetch the iris data set: feature matrix and integer class labels
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Relabel class 2 as class 0, leaving a binary target (0 or 1).
# y is the same array object as iris.target, so this edits it in place.
y[y == 2] = 0

### Splitting into train and test sets

We will split the training data into training and validation sets in the main code.

In [3]:
# Hold out 30% of the samples as the test set.
# From here on X and y refer to the remaining (training) part only.
X, X_test, y, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

### Defining the parameters grid

In [4]:
# Candidate values for C, gamma and the polynomial degree
c = np.array([0.01, 0.1, 1, 10])
g = np.array([0.0001, 0.001, 0.01, 0.1, 1])
d = np.array([1, 2, 3, 4, 5])

# Coordinate grids covering every (C, gamma, degree) combination
cv, gv, dv = np.meshgrid(c, g, d)

# One row per combination; the columns are C, gamma and degree
parm = np.column_stack([grid.ravel() for grid in (cv, gv, dv)])

## Different Classifiers and their accuracy
#### We use K-fold cross validation with number of splits equal n_splits
#### We also standardize the train, validation and test features

- n_splits : number of splits for K-Fold cross validation
- ind : list of (train index, validation index) pairs produced by K-Fold
- df2 : mean accuracy over the folds
- X_train_std: standardized train features
- X_test_std: standardized test features
- X_valid_std: standardized validation features
- X_polyf_train: training data transformed to polynomial features
- X_polyf_valid: validation data transformed to polynomial features
- y_train: train targets (0,1)
- y_test: test targets (0,1)
- y_valid: validation targets (0,1)

### Function for inner k-fold cross validation

In [5]:
def acc_(n_splits,X,y,parm):
    """Evaluate three SVM variants per parameter row with K-fold cross validation.

    For every row of ``parm`` three classifiers are trained and scored on
    each fold:

    * ``rbf``    -- SVC with an RBF kernel (uses C and gamma),
    * ``poly``   -- SVC with a polynomial kernel (uses C, gamma and degree),
    * ``linear`` -- SVC with a linear kernel trained on explicit
      ``PolynomialFeatures`` of the given degree (uses C and degree).

    Features are standardized with statistics fitted on the training part of
    each fold only.

    Parameters
    ----------
    n_splits : int
        Number of folds of the shuffled ``KFold`` (``random_state=1234``).
    X : numpy array
        Feature matrix; it is indexed with the integer index arrays that
        ``KFold.split`` returns.
    y : numpy array
        Targets aligned with the rows of ``X``.
    parm : 2-D array
        One parameter combination per row, in the order (C, gamma, degree).

    Returns
    -------
    pandas.DataFrame
        One row per row of ``parm`` with the columns ``C``, ``gamma``,
        ``degree``, ``rbf``, ``poly`` and ``linear``.  The last three columns
        hold the validation accuracy averaged over all folds.
    """
    columns = ['C', 'gamma', 'degree', 'rbf', 'poly', 'linear']

    # Shuffled K-Fold; the same folds are reused for every parameter row
    kfold = KFold(n_splits=n_splits, random_state=1234, shuffle=True)

    # The raw and standardized fold data do not depend on the parameters,
    # so they are prepared once instead of once per parameter row.
    folds = []
    for train_idx, valid_idx in kfold.split(X, y):
        X_train, X_valid = X[train_idx], X[valid_idx]
        scaler = StandardScaler()
        folds.append({
            'X_train': X_train,
            'X_valid': X_valid,
            'X_train_std': scaler.fit_transform(X_train),
            'X_valid_std': scaler.transform(X_valid),
            'y_train': y[train_idx],
            'y_valid': y[valid_idx],
        })

    rows = []
    for params in parm:
        C = float(params[0])
        gamma = float(params[1])
        # SVC and PolynomialFeatures expect an integer degree
        degree = int(params[2])

        clf_rbf = SVC(kernel='rbf', C=C, gamma=gamma)
        clf_poly = SVC(kernel='poly', C=C, gamma=gamma, degree=degree)
        # gamma is not passed here: it is not used by the linear kernel
        clf_polyf = SVC(kernel='linear', C=C)

        # Explicit polynomial feature expansion for the linear classifier
        poly = PolynomialFeatures(degree=degree)

        # One (rbf, poly, linear) accuracy triple per fold
        fold_scores = []
        for fold in folds:
            y_train, y_valid = fold['y_train'], fold['y_valid']

            # The expansion is fitted on the training part and only applied
            # to the validation part.
            X_polyf_train = poly.fit_transform(fold['X_train'])
            X_polyf_valid = poly.transform(fold['X_valid'])

            # Separate scaler: the polynomial features live in another space
            poly_scaler = StandardScaler()
            X_polyf_train_std = poly_scaler.fit_transform(X_polyf_train)
            X_polyf_valid_std = poly_scaler.transform(X_polyf_valid)

            clf_rbf.fit(fold['X_train_std'], y_train)
            clf_poly.fit(fold['X_train_std'], y_train)
            clf_polyf.fit(X_polyf_train_std, y_train)

            fold_scores.append((
                clf_rbf.score(fold['X_valid_std'], y_valid),
                clf_poly.score(fold['X_valid_std'], y_valid),
                clf_polyf.score(X_polyf_valid_std, y_valid),
            ))

        # Average each classifier's accuracy over the folds (axis 0 = folds)
        mean_rbf, mean_poly, mean_polyf = np.mean(fold_scores, axis=0)

        rows.append({
            'C': C,
            'gamma': gamma,
            'degree': float(degree),  # kept as float, like the other columns
            'rbf': mean_rbf,
            'poly': mean_poly,
            'linear': mean_polyf,
        })

    # Build the frame in one go; DataFrame.append no longer exists in pandas 2
    return pd.DataFrame(rows, columns=columns)

### K-fold cross validation

In [6]:
# number of splits for K-Fold cross validation
n_splits = 3           # number of folds for inner k-fold
outer_fold_num = 5     # number of folds for outer k-fold

# Shuffled K-Fold for the outer loop
skf = KFold(n_splits=outer_fold_num, random_state=1234, shuffle=True)

# (train index, validation index) pairs of the outer folds
ind = list(skf.split(X, y))

# Define empty dataframe for saving the results
df_output = pd.DataFrame()
result_columns = ['kernel', 'C', 'gamma', 'degree', 'accuracy']

# One result record per outer fold; turned into df_result after the loop
outer_results = []

# Outer k-fold cross validation
for train_idx, valid_idx in ind:

    # Train and validation data for this outer fold
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    # Inner k-fold cross validation result (one row per parameter combination)
    df = acc_(n_splits, X_train, y_train, parm)

    # Pick the kernel with the highest inner accuracy first, then the row
    # where THAT kernel reaches it, so kernel and parameters belong together.
    df_cols = df[['rbf', 'poly', 'linear']].astype(float)
    best_kernel = df_cols.max().idxmax()
    row = df_cols[best_kernel].idxmax()

    best_C = float(df.loc[row, 'C'])
    best_gamma = float(df.loc[row, 'gamma'])
    best_degree = int(df.loc[row, 'degree'])  # SVC / PolynomialFeatures need an int

    # Build the classifier that matches the selected column
    if best_kernel == 'rbf':
        clf = SVC(kernel='rbf', C=best_C, gamma=best_gamma)
    elif best_kernel == 'poly':
        clf = SVC(kernel='poly', C=best_C, gamma=best_gamma, degree=best_degree)
    else:
        # 'linear' column: linear SVM on explicit polynomial features
        clf = SVC(kernel='linear', C=best_C)

        # Fit the expansion on the training part, only apply it to validation
        poly = PolynomialFeatures(degree=best_degree)
        X_train = poly.fit_transform(X_train)
        X_valid = poly.transform(X_valid)

    # Standardize with statistics of the training part only
    sc = StandardScaler()
    X_train_std = sc.fit_transform(X_train)
    X_valid_std = sc.transform(X_valid)

    # Fit the selected classifier and score it on the outer validation part
    clf.fit(X_train_std, y_train)
    fold_accuracy = clf.score(X_valid_std, y_valid)

    # Keep the model parameters and accuracy of this outer fold
    outer_results.append({'kernel': best_kernel, 'C': best_C, 'gamma': best_gamma,
                          'degree': best_degree, 'accuracy': fold_accuracy})

# Build the frame in one go; DataFrame.append no longer exists in pandas 2
df_result = pd.DataFrame(outer_results, columns=result_columns)


### The optimum parameters and classifier

In [7]:
# Accuracy reached in each outer fold
df_col = df_result['accuracy']

# Row (kernel, C, gamma, degree, accuracy) of the outer fold with the highest accuracy
best_fold = df_col.idxmax()
M = df_result.iloc[best_fold, :]

### Let's see the prediction for the test dataset by the best classifier and the optimum parameters

In [8]:
# Refit the selected model on the whole training split and score it on the
# held-out test split.

# Read the selected parameters by label instead of by position
best_kernel = M['kernel']
best_degree = int(M['degree'])  # SVC / PolynomialFeatures need an int

X_final_train, X_final_test = X, X_test
if best_kernel == 'linear':
    # 'linear' was selected as a linear SVM on polynomial features, so the
    # same expansion has to be applied before the final fit as well.
    poly = PolynomialFeatures(degree=best_degree)
    X_final_train = poly.fit_transform(X)
    X_final_test = poly.transform(X_test)

# Standardize with statistics of the training split only
sc = StandardScaler()
X_train_std = sc.fit_transform(X_final_train)
X_test_std = sc.transform(X_final_test)

clf = SVC(kernel=best_kernel, C=float(M['C']), gamma=float(M['gamma']), degree=best_degree)
clf.fit(X_train_std, y)

# test accuracy
clf.score(X_test_std, y_test)

0.9333333333333333