### Importing necessary libraries

In [5]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Loading the dataset using pandas

In [6]:
# Resolve the dataset location from the DATASET_PATH environment variable so the
# notebook can run on other machines; when it is not set, fall back to the
# original author's local path so existing behaviour is unchanged.
DATASET_PATH = os.environ.get(
    "DATASET_PATH", r'C:\Users\astha\OneDrive\Desktop\sample\dataset.csv'
)
data = pd.read_csv(DATASET_PATH)

### Dropping the unused column and displaying the first 5 rows of data

In [7]:
# Remove the 'Unnamed: 3' column (presumably an empty trailing column in the
# CSV -- verify against the file) and preview the first five rows.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# pandas 2.0 rejects with a TypeError. `errors='ignore'` keeps the cell
# re-runnable once the column has already been dropped.
data = data.drop(columns=['Unnamed: 3'], errors='ignore')
print(data.head(5))

                                   CLASS1  CLASS2  \
0  desc                                    manner   
1                                    ENTY  cremat   
2  desc                                    manner   
3                                    ENTY  animal   
4                      abb              R     exp   

                                           QUESTIONS  
0  How did serfdom develop in and then leave Russ...  
1   What films featured the character Popeye Doyle ?  
2  How can I find               a list of celebri...  
3  What fowl gr                  abs the spotligh...  
4                    What is the full form of .com ?  


### Applying Label Encoder

In [8]:
from sklearn import preprocessing

# Use one encoder per label column. Refitting a single LabelEncoder on CLASS2
# discards the CLASS1 mapping, which makes it impossible to turn predicted
# CLASS1 codes back into their original names with inverse_transform().
class1_encoder = preprocessing.LabelEncoder()
class2_encoder = preprocessing.LabelEncoder()
data['CLASS1'] = class1_encoder.fit_transform(data['CLASS1'])
data['CLASS2'] = class2_encoder.fit_transform(data['CLASS2'])

# Backward compatibility: `label_encoder` previously ended up fitted on CLASS2.
label_encoder = class2_encoder

### Applying Tf-Idf Vectorizer
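#### In the previous step, the label encoder replaced each categorical value with a number between 0 and the number of classes minus 1; for example, a variable with 5 distinct classes is encoded as 0, 1, 2, 3 and 4. In this step, the TF-IDF vectorizer converts the question text into numeric feature vectors.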

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build a TF-IDF vectorizer with default settings and fit it on every question,
# producing one sparse feature row per question in `x`.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so test-set vocabulary/IDF statistics leak into training -- consider
# fitting on the training questions only.
v = TfidfVectorizer()
x = v.fit_transform(data['QUESTIONS'])

### Extracting Features and Target Variables

In [10]:
# Target matrix: every column except the question text (the encoded CLASS1 and
# CLASS2 labels), one row per question.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# pandas 2.0 rejects with a TypeError.
y = np.array(data.drop(columns=['QUESTIONS']))

In [11]:
# Keep the raw question text under its own name. Assigning it to `x` would
# overwrite the TF-IDF feature matrix when the notebook is run top to bottom,
# so the train/test split and the KNN classifier would receive raw strings
# instead of numeric features.
questions = np.array(data['QUESTIONS'])

In [12]:
# Show the encoded target matrix: one row per question, with the CLASS1 code in
# the first column and the CLASS2 code in the second.
print(y)

[[ 5 24]
 [ 0  9]
 [ 5 24]
 ...
 [ 3 41]
 [ 3 41]
 [ 0 10]]


In [16]:
print(x)
# After applying the TF-IDF vectorizer the text is converted into numeric data so that it can be processed.
# Assuming `x` still holds the TF-IDF matrix, each printed line reads
# (question row, vocabulary column) followed by the TF-IDF weight.

  (0, 6492)	0.4037062902888527
  (0, 4329)	0.4066817455759092
  (0, 7509)	0.3909461231545298
  (0, 514)	0.18684999050433973
  (0, 3841)	0.13368636039675932
  (0, 2261)	0.4397889648165346
  (0, 6688)	0.4608959763409549
  (0, 2294)	0.18975685693610375
  (0, 3722)	0.15543569906373914
  (1, 2437)	0.5146818044794617
  (1, 5805)	0.4654273778494601
  (1, 1542)	0.3535079432208928
  (1, 7498)	0.0962428806450149
  (1, 2959)	0.4340948738501773
  (1, 3013)	0.4340948738501773
  (1, 8102)	0.08586453915803502
  (2, 5077)	0.3844431027906946
  (2, 6158)	0.42649920616178494
  (2, 1493)	0.5038459072304332
  (2, 5297)	0.1440827944264803
  (2, 4432)	0.4413887662675721
  (2, 3019)	0.3082765415509709
  (2, 1370)	0.26802397400893385
  (2, 3722)	0.18435308046260268
  (3, 4952)	0.3730978582068181
  :	:
  (15448, 7770)	0.43960675020172413
  (15448, 7848)	0.3634440803628635
  (15448, 3972)	0.13970912837689312
  (15448, 5297)	0.14851087200800386
  (15448, 8102)	0.09399910067697755
  (15448, 3841)	0.163430403600344

### Splitting the data into training and testing data
#### Here the test set contains 30% of the data and the training set contains 70% of the data

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(x, y, test_size=0.3, random_state=0)
x_train, x_test, y_train, y_test = split_parts

### Training the Model
#### Here K Nearest Neighbours Model is imported and trained on training data

In [18]:
from sklearn.neighbors import KNeighborsClassifier

# 5 nearest neighbours, Minkowski metric with p=2 (i.e. Euclidean distance).
knn_params = {"n_neighbors": 5, "metric": "minkowski", "p": 2}
classifier = KNeighborsClassifier(**knn_params)

# Fit on the training split; left as the final expression so the fitted
# estimator is displayed as the cell output.
classifier.fit(x_train, y_train)

KNeighborsClassifier()

### Predicting the result using test dataset

In [19]:
# Predict the label codes for every row of the held-out test set; the result
# has one row per test question and one column per target (CLASS1, CLASS2).
y_pred= classifier.predict(x_test) 

### Here the numbers represent the encoded classes. The first column is the prediction for class 1 and the second column is the prediction for class 2

In [20]:
# Show the predicted label codes: the first column is CLASS1 and the second
# column is CLASS2.
print(y_pred)

[[ 1 20]
 [ 0  6]
 [ 0 16]
 ...
 [ 1 43]
 [ 0 16]
 [ 3 12]]


In [21]:
# Print the predicted CLASS1 / CLASS2 codes for a handful of test-set rows.
# A single loop replaces five copy-pasted print statements; the printed text
# is unchanged.
for row in (0, 1, 2, 10, 150):
    class_one, class_two = y_pred[row]
    print("Class one is", class_one, "Class two is", class_two)  # prediction for this test-set row

Class one is 1 Class two is 20
Class one is 0 Class two is 6
Class one is 0 Class two is 16
Class one is 5 Class two is 12
Class one is 3 Class two is 7


### Saving the Model

In [22]:
import pickle

# Persist the fitted classifier (same file name and contents as before).
with open("pickle_model", "wb") as file:
    pickle.dump(classifier, file)

# Also persist the fitted TF-IDF vectorizer. The classifier only accepts
# feature vectors built with this exact vocabulary, so without it the saved
# model cannot be applied to new question text.
with open("pickle_vectorizer", "wb") as file:
    pickle.dump(v, file)

# NOTE: only unpickle these files from a trusted location -- pickle.load can
# execute arbitrary code.