In [None]:
# Modules required to build our machine learning model. We use the K-Nearest Neighbours algorithm here.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# This line is to show plots inside the jupyter notebook.
%matplotlib inline 

In [None]:
# Read the whole CSV file into the "data" DataFrame.
data = pd.read_csv("data/data.csv")

# Model inputs: only the "ITIN_YIELD" and "DISTANCE" columns are kept as features.
source_data = data.loc[:, ['ITIN_YIELD', 'DISTANCE']]

# Prediction target: the "ITIN_FARE" column.
predicted_column = data.loc[:, 'ITIN_FARE']

# Preview the first five rows of the complete table (all columns, not just the selected ones).
data.head(n=5)

In [7]:
# Split the features ("source_data") and the target ("predicted_column") into
# training and testing subsets:
#   X_train / X_test -> training / testing rows of "source_data"
#   y_train / y_test -> training / testing values of "predicted_column"
# No test_size is passed, so scikit-learn's default split proportion is used;
# random_state=42 fixes the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    source_data,
    predicted_column,
    random_state=42,
)

# NOTE(review): "ITIN_FARE" looks like a numeric fare amount, and a classifier treats
# every distinct value as its own class. A regressor (e.g. KNeighborsRegressor) may be
# the better fit -- confirm before relying on the accuracy figures.

# Build a 1-nearest-neighbour classifier and train it on the training subset.
# fit() hands back the estimator itself, so "knn" holds the trained model.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# Keep the fitted estimator as the last expression so the notebook displays its parameters.
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [8]:
# Report the shape (rows, columns) of the full feature table and of its
# training and testing parts.

for template, frame in [
    ("Source data shape and size: {}", source_data),
    ("Training source_data shape and size: {}", X_train),
    ("Testing source_data shape and size: {}", X_test),
]:
    print(template.format(frame.shape))

Source data shape and size: (3529905, 2)
Training source_data shape and size: (2647428, 2)
Testing source_data shape and size: (882477, 2)


In [9]:
# Ask the trained model to estimate the fare for one hand-written itinerary.
# The sample is built as a DataFrame that reuses the training column labels, so each
# value is explicitly tied to its feature (ITIN_YIELD=0.2306, DISTANCE=850.0) and the
# input has the same labelled layout the model was fitted on. Passing a bare array
# instead hides the column order and makes newer scikit-learn releases warn about
# missing feature names.
# NOTE(review): 0.2306 * 850.0 is about 196.0, which is exactly the fare predicted for
# this sample. If ITIN_YIELD is fare per unit of distance, the target is a direct
# function of the two features (target leakage) -- confirm against the data dictionary.

X_new = pd.DataFrame([[0.2306, 850.0]], columns=X_train.columns)
prediction = knn.predict(X_new)
print("Prediction: {}".format(prediction))

Prediction: [ 196.]


In [10]:
# Run the trained model on every row of the held-out test features and keep the
# predicted values in "y_pred"; the printout only shows an abbreviated view of them.

y_pred = knn.predict(X_test)
report = "Test set predictions:\n {}"
print(report.format(y_pred))

Test set predictions:
 [ 479.  273.  357. ...,  688.  148.  612.]


In [12]:
# Score the model on the data it was trained on and on the held-out test data.
# With n_neighbors=1 each training row is normally its own nearest neighbour, so a
# near-perfect training score is expected; the test score is the meaningful figure.

for template, features, target in [
    ("Accuracy on training set: {:.3f}", X_train, y_train),
    ("Accuracy on test set: {:.3f}", X_test, y_test),
]:
    print(template.format(knn.score(features, target)))

Accuracy on training set: 0.998
Accuracy on test set: 0.644
