In [2]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [4]:
# Load the labelled dataset from a CSV file in the working directory and
# preview its first rows (the bare last expression is rendered by Jupyter).
DATA_PATH = 'data_with_labels.csv'
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,State,Year,Slab,MultiBeam,GirderFloorBeamSystem,TeaBeam,BoxBeamMultiple,ArchDeck,Culvert,TrussThru,Frame,BoxBeamSingle,TrussDeck,Label
0,ALABAMA,1992,0.326781,0.257628,0.292597,0.417426,0.299461,0.850491,0.13438,0.887495,0.851453,0.014134,0.805998,1
1,ALASKA,1992,0.08247,0.268559,0.115272,0.172852,0.264056,0.141922,1.0,0.49467,0.137405,0.0,0.030212,1
2,ARIZONA,1992,0.183139,0.177047,0.096338,0.224856,0.137023,0.320507,0.061438,0.927453,0.197519,0.087231,0.41382,1
3,ARKANSAS,1992,0.17086,0.329185,0.22203,0.323595,0.192091,0.369497,0.054384,0.672889,0.131139,0.0,0.586634,1
4,CALIFORNIA,1992,0.218799,0.497345,0.885631,0.344593,0.342176,0.623282,0.126319,0.941477,0.0,0.0,0.621819,1


In [5]:
# Show the dimensions of the loaded frame as (number of rows, number of columns).
data.shape

(1144, 14)

In [6]:
# Target vector: the 'Label' column as a NumPy array.
y = np.array(data['Label'])

# Feature matrix: every column from position 2 up to, but excluding, the last.
# NOTE(review): this positional slice assumes the first two columns are
# non-feature identifiers and the final column is the label -- confirm against
# the CSV schema if the file layout ever changes.
X = np.array(data.iloc[:, 2:-1])

# Hold out 25% of the rows as a test set; the fixed random_state keeps the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [7]:
# Candidate hyper-parameter values explored by the search:
# neighbour counts 2..8, two vote-weighting schemes and two distance metrics.
grid_params = {
    "n_neighbors": list(range(2, 9)),
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
}

# Exhaustive search over the grid with 10-fold cross-validation,
# using every available CPU core (n_jobs=-1).
gridsearch = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_params,
    cv=10,
    n_jobs=-1,
)

# Run the search on the training split only; the test split stays unseen.
# fit() returns the search object itself, so gs_fitted refers to the same
# object as gridsearch.
gs_fitted = gridsearch.fit(X_train, y_train)

# Report the best mean cross-validated score and the parameters that achieved it.
print('best score:', gs_fitted.best_score_)
print('best params:', gs_fitted.best_params_)

best score: 0.9195759233926128
best params: {'metric': 'manhattan', 'n_neighbors': 2, 'weights': 'distance'}


Fit a model with the best parameters found by the grid search

In [8]:
# Pull the winning hyper-parameters out of the fitted grid search.
best_params = gs_fitted.best_params_
neighs = best_params['n_neighbors']
metric = best_params['metric']
weighs = best_params['weights']

# Train a fresh k-NN classifier on the training split with those settings.
# NOTE(review): the grid search leaves `refit` at its default, so
# gs_fitted.best_estimator_ should already hold an equivalent fitted model --
# this explicit refit is kept for clarity rather than necessity.
gsknn = KNeighborsClassifier(n_neighbors=neighs, weights=weighs, metric=metric)
gsknn.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = gsknn.predict(X_test)

# Accuracy: the fraction of test samples whose predicted label equals the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.916083916083916


In [9]:
# F1 score of the test-set predictions (f1_score's default averaging),
# printed as a percentage.
f1_Score = f1_score(y_test, y_pred)
f1_percent = f1_Score*100
print("f1-Score:", f1_percent, '%')

f1-Score: 91.48936170212765 %
