# Assignment 1 - NumPy

**Name, bhagya raj, Student no**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Provide local path and file names for analysis

In [2]:
# Local directory that holds the data files. The loading cells build the full
# file name with plain string concatenation (path + file name), so this value
# must end with a path separator.
path=r'D:\Assignment_1_NumPy_-1676463244/'

# File name of the classification data set used in Part 1
class_file_name = "class1.csv"

# File name of the regression data set used in Part 2
regr_file_name  = "regr2.csv"

## Part 1 - KNN classification

### 1.1 KNN classification algorithm

In this section you should write a function ``knn_classify(test, train, k)`` that takes train and test data as numpy ndarrays, and a k-value as an integer, and returns the class-values of the test data.

In [3]:
def knn_classify(test, train, k, labels=None):
    """Classify one sample with inverse-distance-weighted k-nearest neighbours.

    Parameters
    ----------
    test : array_like
        Feature vector of the single sample to classify.
    train : array_like
        Training features, one row per training sample.
    k : int
        Number of nearest neighbours that take part in the vote (at least 1).
    labels : array_like, optional
        Class label of every row of ``train``. When omitted, the module-level
        ``train_y`` array is used so that existing
        ``knn_classify(test, train, k)`` calls keep working.

    Returns
    -------
    The class label (an element of ``np.unique(labels)``) that collects the
    largest total neighbour weight. Ties go to the smallest label.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if labels is None:
        # Backward-compatible fallback to the notebook's global training labels.
        labels = train_y
    labels = np.asarray(labels)

    # Euclidean distance from the test sample to every training sample,
    # computed in one vectorised step instead of a Python loop.
    train = np.asarray(train, dtype=float)
    diff = (train - np.asarray(test, dtype=float)).reshape(len(train), -1)
    distances = np.sqrt(np.sum(diff ** 2, axis=1))

    # One stable argsort yields both the neighbour indices and their distances.
    nearest = np.argsort(distances, kind="stable")[:k]
    near_dist = distances[nearest]
    near_labels = labels[nearest]

    exact = near_dist == 0
    if exact.any():
        # An exact match has infinite inverse-distance weight, so only the
        # zero-distance neighbours vote. This avoids dividing by zero and
        # relying on inf/NaN arithmetic to pick the winner.
        weights = exact.astype(float)
    else:
        # Inverse-distance weights. Normalising them would not change which
        # class has the largest total, so the raw weights are summed.
        weights = 1.0 / near_dist

    # Total weight collected by every class present in the labelled data.
    classes = np.unique(labels)
    votes = np.array([weights[near_labels == c].sum() for c in classes])
    return classes[np.argmax(votes)]
    

### 1.2 Data Analysis

In this section you should read the data. Then split it randomly into train (60%), validation (20%), and test (20%) data. Use the train and validation data to find the k-value that gives the best classification result. Then use this k-value to classify the test data and report your findings: the k-value and the percentage of correct predictions.

In [4]:
# Load the classification data from the configured location. The [1:] slice
# drops the first row of the file (presumably the CSV header - TODO confirm).
data = np.genfromtxt(path + class_file_name, delimiter=',')[1:]

# Split boundaries for 60 % train / 20 % validation / 20 % test.
# NOTE(review): the rows are split in file order, not randomly as the task
# description asks; shuffle `data` with a seeded generator before slicing if
# the file is ordered in any way.
tr_split = int(0.6 * len(data))
val_split = int(0.8 * len(data))

# Column 0 holds the class label, the remaining columns are the features.
# Training dataset
train = data[:tr_split]
train_x = train[:, 1:]
train_y = train[:, 0].astype(int)

# Validation dataset
val = data[tr_split:val_split]
val_x = val[:, 1:]
val_y = val[:, 0].astype(int)

# Testing dataset
test = data[val_split:]
test_x = test[:, 1:]
test_y = test[:, 0].astype(int)


# KNN classification: score every candidate k on the validation data.
# knn_classify reads the training labels from the module-level train_y.
validation_result = []
for k in range(2, 40):
    val_predict = np.array([knn_classify(v, train_x, k) for v in val_x])

    # Percentage of validation samples that were classified correctly
    val_pc = (np.count_nonzero(val_predict == val_y) / len(val_predict)) * 100
    validation_result.append([val_pc, k])

# Rank the candidates once, after the loop. The sort is stable, so among equal
# scores the smallest k stays first.
validation_result.sort(key=lambda x: x[0], reverse=True)
best_validation_result, best_k = validation_result[0]

# Predict the test data with the best k
test_predict = np.array([knn_classify(t, train_x, best_k) for t in test_x])

# Percentage of test samples that were classified correctly
test_pc = (np.count_nonzero(test_predict == test_y) / len(test_predict)) * 100


#Results
print(f'Percentage Correct Prediction of Validation Data = {np.round(best_validation_result,2)} %')
print(f'Percentage Correct Prediction of Test Data = {np.round(test_pc,2)} %')
print(f'Best K Nearest Neighbor value (k) = {np.round(best_k,1)}\n')

Percentage Correct Prediction of Validation Data = 85.0 %
Percentage Correct Prediction of Test Data = 87.5 %
Best K Nearest Neighbor value (k) = 2



## Part 2 - KNN and linear regression

### 2.1 KNN regression algorithm

In this section you should write a function ``knn_regression(test, train, k)`` that takes test and train data, and a k-value, and returns the regression (fitted) values of the responses of the test data.

In [5]:
def knn_regression(test, train, k, labels=None):
    """Predict the response of one sample with inverse-distance-weighted KNN.

    Parameters
    ----------
    test : array_like
        Feature vector of the single sample to predict.
    train : array_like
        Training features, one row per training sample.
    k : int
        Number of nearest neighbours used for the prediction (at least 1).
    labels : array_like, optional
        Response value of every row of ``train``. When omitted, the
        module-level ``train_y`` array is used so that existing
        ``knn_regression(test, train, k)`` calls keep working.

    Returns
    -------
    The predicted response, rounded to two decimals.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if labels is None:
        # Backward-compatible fallback to the notebook's global training responses.
        labels = train_y
    labels = np.asarray(labels)

    # Euclidean distance from the test sample to every training sample.
    train = np.asarray(train, dtype=float)
    diff = (train - np.asarray(test, dtype=float)).reshape(len(train), -1)
    distances = np.sqrt(np.sum(diff ** 2, axis=1))

    # One stable argsort yields both the neighbour indices and their distances.
    nearest = np.argsort(distances, kind="stable")[:k]
    near_dist = distances[nearest]
    near_vals = labels[nearest]

    exact = near_dist == 0
    if exact.any():
        # A zero distance would give an infinite weight. The limit of the
        # inverse-distance average is the mean of the exactly matching
        # neighbours, so only those are averaged; mixing in the other
        # neighbours would make the prediction jump at training points.
        prediction = near_vals[exact].mean()
    else:
        # Inverse-distance-weighted average of the neighbour responses.
        weights = 1.0 / near_dist
        prediction = np.dot(weights, near_vals) / weights.sum()
    return round(prediction, 2)

### 2.2 Linear regression algorithm

In [6]:
def linear_regression(train,test,y):
    """Fit ordinary least squares on ``train``/``y`` and predict for ``test``.

    Parameters
    ----------
    train : ndarray
        Training design matrix, one row per sample. Any intercept column of
        ones must already be included by the caller.
    test : ndarray
        Design matrix of the samples to predict, with the same columns as
        ``train``.
    y : ndarray
        Training responses, either one-dimensional or a single column.

    Returns
    -------
    ndarray
        One-dimensional array with one prediction per row of ``test``.
    """
    # Solve the least-squares problem directly instead of forming the inverse
    # of train.T @ train: this is numerically more stable and still returns a
    # (minimum-norm) solution when the columns are collinear, where the
    # explicit inverse raises LinAlgError.
    beta, *_ = np.linalg.lstsq(train, y, rcond=None)

    # Predictions are the product of the test design matrix and the coefficients.
    predict = np.dot(test, beta)

    return predict.reshape(len(predict),)

### 2.3 Data Analysis

In [7]:
# Load the regression data from the configured location. The [1:] slice drops
# the first row of the file (presumably the CSV header - TODO confirm).
regr_data = np.genfromtxt(path + regr_file_name, delimiter=',')[1:]

# Split boundaries for 60 % train / 20 % validation / 20 % test.
# NOTE(review): the rows are split in file order, not randomly; shuffle
# `regr_data` with a seeded generator before slicing if the file is ordered.
tr_split = int(0.6 * len(regr_data))
val_split = int(0.8 * len(regr_data))

# Column 0 holds the response, the remaining columns are the features.
# Training dataset
train = regr_data[:tr_split]
train_x = train[:, 1:]
train_y = train[:, 0]

# Validation dataset
val = regr_data[tr_split:val_split]
val_x = val[:, 1:]
val_y = val[:, 0]

# Testing dataset
test = regr_data[val_split:]
test_x = test[:, 1:]
test_y = test[:, 0]


# KNN regression: score every candidate k on the validation data.
# knn_regression reads the training responses from the module-level train_y.
validation_result = []
for k in range(2, 15):
    val_predict = np.array([knn_regression(v, train_x, k) for v in val_x])

    # Residual Sum of Squares (RSS) on the validation data
    val_rss = np.sum(np.square(val_predict - val_y))
    validation_result.append([val_rss, k])

# Rank the candidates once, after the loop. The sort is stable, so among equal
# RSS values the smallest k stays first.
validation_result.sort(key=lambda x: x[0])
best_validation_result, best_k = validation_result[0]

# Predict the test data with the best k
test_predict = np.array([knn_regression(t, train_x, best_k) for t in test_x])

# Residual Sum of Squares (RSS) on the test data
test_rss = np.sum(np.square(test_predict - test_y))


# Linear regression

# Prepend a column of ones (intercept term) to the features and turn the
# training responses into a column vector.
lr_train_x = np.append(np.ones((len(train_x), 1)), train_x, axis=1)
lr_train_y = np.array(train_y).reshape((len(train_y), 1))
lr_test_x = np.append(np.ones((len(test_x), 1)), test_x, axis=1)

# Fit on the training data and predict the test data
lr_prediction = linear_regression(lr_train_x, lr_test_x, lr_train_y)

# Residual Sum of Squares (RSS) of the linear regression on the test data
lr_prediction_rss = np.sum(np.square(lr_prediction - test_y))

#Results
print('---K Nearest Neighbour Regression:')
print(f'Residual Sum of Square (RSS) of Validation Data = {np.round(best_validation_result,2)} ')
print(f'Residual Sum of Square (RSS) of Test Data = {np.round(test_rss,2)} ')
print(f'Best K Nearest Neighbor value (k) = {np.round(best_k,1)}\n')
print('---Linear Regression:')
print(f'Residual Sum of Square (RSS) of Test Data = {np.round(lr_prediction_rss,2)} ')

---K Nearest Neighbour Regression:
Residual Sum of Square (RSS) of Validation Data = 0.04 
Residual Sum of Square (RSS) of Test Data = 0.05 
Best K Nearest Neighbor value (k) = 3

---Linear Regression:
Residual Sum of Square (RSS) of Test Data = 0.1 
