<a href="https://colab.research.google.com/github/aruna20200/aruna_akula/blob/master/KNN_Implemebtation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd 
import numpy as np 
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
np.random.seed(42)

In [3]:
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",sep = ';')
y = df.pop('quality')

In [15]:
df.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6
1,6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9...
2,8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;1...
3,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...
4,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...


In [4]:
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
dtype: int64

In [5]:
for i in df.columns:
    df[i] = df[i].fillna(np.mean(df[i]))
train, test, y_train, y_test = train_test_split(df,y, test_size = 0.2)

In [6]:
lr = LogisticRegression()
def model_fit(alg,train,test,y_train,y_test,if_cv = True,cv_folds = 5):
    alg.fit(train,y_train)
    
    #performing Cross Validation
    if if_cv: 
        cv_score = cross_val_score(alg,train,y_train,cv = cv_folds,scoring = 'f1_macro')
    
    predictions = alg.predict(test)
    
    #Print 
    print("\n Model Report")
    print(classification_report(y_test,predictions))
    
    if if_cv: 
        print("CV REPORT :- Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g"%(np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))
    
    print("Accuracy: ",accuracy_score(y_test,predictions))
    print('-'*100)
    
    confu = pd.DataFrame(confusion_matrix(y_test,predictions))
    print(confu)

In [7]:
confu = model_fit(lr,train, test, y_train, y_test)


 Model Report
              precision    recall  f1-score   support

           3       1.00      0.20      0.33         5
           4       0.00      0.00      0.00        25
           5       0.46      0.32      0.38       291
           6       0.46      0.82      0.59       432
           7       0.60      0.02      0.03       192
           8       0.00      0.00      0.00        35

    accuracy                           0.46       980
   macro avg       0.42      0.23      0.22       980
weighted avg       0.46      0.46      0.38       980

CV REPORT :- Mean - 0.1438456 | Std - 0.006476701 | Min - 0.1321111 | Max - 0.1499328
Accuracy:  0.45918367346938777
----------------------------------------------------------------------------------------------------
   0  1   2    3  4  5
0  1  0   2    2  0  0
1  0  0   8   17  0  0
2  0  0  93  197  1  0
3  0  0  78  353  1  0
4  0  0  20  169  3  0
5  0  0   1   34  0  0


In [8]:
train_scale = StandardScaler(train)
test_scale = StandardScaler(test)

In [9]:
def KNN_predicts(train, test,y_train,y_test,scaler,neighbours, metric ='manhattan', weights = 'uniform'):
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)
    KNN = KNeighborsClassifier(n_neighbors =neighbours,metric = metric,weights = weights,n_jobs= 4)
    KNN.fit(train_scaled,y_train)
    predictions = KNN.predict(test_scaled)
    

    
    print("Accuracy: ",accuracy_score(y_test,predictions))
    print('-'*100)
    return KNN

In [10]:
KNN_predicts(train, test, y_train, y_test, StandardScaler(), 1)

Accuracy:  0.6479591836734694
----------------------------------------------------------------------------------------------------


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=4, n_neighbors=1, p=2,
                     weights='uniform')

In [11]:
for k in range(1,11):
    print('Accuracy score on kNN using n_neighbours = {0}:'.format(2**k), end = ' ')
    KNN_predicts(train, test, y_train, y_test, StandardScaler(), 2**k)

Accuracy score on kNN using n_neighbours = 2: Accuracy:  0.5724489795918367
----------------------------------------------------------------------------------------------------
Accuracy score on kNN using n_neighbours = 4: Accuracy:  0.5551020408163265
----------------------------------------------------------------------------------------------------
Accuracy score on kNN using n_neighbours = 8: Accuracy:  0.5438775510204081
----------------------------------------------------------------------------------------------------
Accuracy score on kNN using n_neighbours = 16: Accuracy:  0.5418367346938775
----------------------------------------------------------------------------------------------------
Accuracy score on kNN using n_neighbours = 32: Accuracy:  0.5520408163265306
----------------------------------------------------------------------------------------------------
Accuracy score on kNN using n_neighbours = 64: Accuracy:  0.5387755102040817
------------------------------------

In [12]:
for weights in ['uniform','distance']:
    print('Accuracy score on kNN using n_neighbours = {0}:'.format(weights), end = ' ')
    KNN_predicts(train, test, y_train, y_test, StandardScaler(),5,metric ='chebyshev',weights = weights)

Accuracy score on kNN using n_neighbours = uniform: Accuracy:  0.5244897959183673
----------------------------------------------------------------------------------------------------
Accuracy score on kNN using n_neighbours = distance: Accuracy:  0.6459183673469387
----------------------------------------------------------------------------------------------------


In [13]:
##Feature Engineering

In [14]:
def create_poly(train,test, degree):
    poly = PolynomialFeatures(degree = degree)
    train_poly = poly.fit_transform(train)
    test_poly = poly.fit_transform(test)
    return train_poly, test_poly

In [15]:
for degree in [1,2,3]:
    train_poly,test_poly = create_poly(train,test,degree)
    print('Polynomial degree',degree)
    Knn = KNN_predicts(train_poly,test_poly,y_train,y_test,StandardScaler(),5,metric ='chebyshev',weights = 'distance')
    print('-'*100)

Polynomial degree 1
Accuracy:  0.6459183673469387
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Polynomial degree 2
Accuracy:  0.6673469387755102
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Polynomial degree 3
Accuracy:  0.6551020408163265
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------


In [16]:
Knn = KNN_predicts(train_poly,test_poly,y_train,y_test,StandardScaler(),5,metric ='chebyshev',weights = 'distance')

Accuracy:  0.6551020408163265
----------------------------------------------------------------------------------------------------


In [17]:
Knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='chebyshev',
                     metric_params=None, n_jobs=4, n_neighbors=5, p=2,
                     weights='distance')

In [18]:
train_poly,test_poly = create_poly(train,test,2)

In [19]:
from sklearn.externals import joblib 
joblib.dump(Knn,'Model.pkl')

['Model.pkl']

In [20]:
knn_new_model = joblib.load('Model.pkl')

In [21]:
def feature_eng(df):
    df['feat1'] = df['total sulfur dioxide'] / df['free sulfur dioxide']
    df['feat2'] = df['pH'] * df['fixed acidity']
    return df

In [22]:
train = feature_eng(train)
test = feature_eng(test)

In [23]:
print("The score after feature engineering: ")
KNN_predicts(train_poly,test_poly,y_train,y_test,StandardScaler(),5,metric ='chebyshev',weights = 'distance')

The score after feature engineering: 
Accuracy:  0.6673469387755102
----------------------------------------------------------------------------------------------------


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='chebyshev',
                     metric_params=None, n_jobs=4, n_neighbors=5, p=2,
                     weights='distance')