In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier



In [None]:
# Load the Parkinson's UPDRS CSV from the local ./data folder into a DataFrame.
DATA_PATH = './data/parkinsons_updrs.data.csv'
data = pd.read_csv(DATA_PATH)


In [None]:
# Summarise the loaded frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Show the column labels of the frame as loaded from the CSV.
data.columns

In [None]:
# Narrow the frame to the 14 columns used in the rest of the notebook.
# The column order matters: later cells index this frame by position.
selected_columns = [
    "index", "subject#", "age", "sex", "test_time",
    "motor_UPDRS", "total_UPDRS",
    "Jitter(%)", "Shimmer", "NHR", "HNR", "RPDE", "DFA", "PPE",
]
data = data.loc[:, selected_columns]


In [None]:
# Draw one histogram per listed column on a single 20x20 inch figure.
hist_columns = [
    'motor_UPDRS', 'total_UPDRS',
    'Jitter(%)', 'Shimmer', 'NHR', 'HNR', 'RPDE', 'DFA', 'PPE',
]
data.hist(column=hist_columns, figsize=(20, 20))


In [None]:
# Count missing values per column.
data.isnull().sum()

In [None]:
# Display the column labels of the dataframe (shown as the cell's output, not printed)
data.columns

In [None]:
# A1.
# Evaluate the intraclass spread and the interclass distance between two classes
# in the dataset (the rows of subject# 1 and of subject# 2), using only the
# following features:
# motor_UPDRS, total_UPDRS, Jitter(%), Shimmer, NHR, HNR, RPDE, DFA, PPE

spread_features = ['motor_UPDRS', 'total_UPDRS', 'Jitter(%)', 'Shimmer',
                   'NHR', 'HNR', 'RPDE', 'DFA', 'PPE']

# Select the feature columns by name. Converting the whole row to an array
# would let the non-feature columns (index, subject#, age, sex, test_time)
# contribute to the centroids and distances.
class1_data = np.array(data.loc[data['subject#'] == 1, spread_features])
class2_data = np.array(data.loc[data['subject#'] == 2, spread_features])

# Fail with a clear message instead of silently producing NaN centroids.
if len(class1_data) == 0 or len(class2_data) == 0:
    raise ValueError('Both subject# 1 and subject# 2 must have at least one row')

# Centroid of a class = per-feature mean over its rows.
class1_centroid = np.mean(class1_data, axis=0)
class2_centroid = np.mean(class2_data, axis=0)

# Intraclass spread = mean Euclidean distance of the rows to their own centroid.
intraclass_spread_class1 = np.mean(np.linalg.norm(class1_data - class1_centroid, axis=1))
intraclass_spread_class2 = np.mean(np.linalg.norm(class2_data - class2_centroid, axis=1))

# Interclass distance = Euclidean distance between the two centroids.
interclass_distance = np.linalg.norm(class1_centroid - class2_centroid)

print('Class 1 Centroid:', class1_centroid)
print('Class 2 Centroid:', class2_centroid)
print('Intraclass Spread Class 1:', intraclass_spread_class1)
print('Intraclass Spread Class 2:', intraclass_spread_class2)
print('Interclass Distance:', interclass_distance)


In [None]:
# Pull two score columns out as column vectors of shape (n_rows, 1).
feature_vector_1 = np.array(data['motor_UPDRS']).reshape(-1, 1)
feature_vector_2 = np.array(data['total_UPDRS']).reshape(-1, 1)

# Report the mean of each vector (a length-1 array because of the column shape).
print('FV-1 >> Mean >> ', np.mean(feature_vector_1, axis=0))
print('FV-2 >> Mean >> ', np.mean(feature_vector_2, axis=0))

In [None]:
# Centroid of each single-feature vector = mean over the row axis.
centroid_vector_1 = feature_vector_1.mean(axis=0)
centroid_vector_2 = feature_vector_2.mean(axis=0)

print('FV-1 >> Centroid >> ', centroid_vector_1)
print('FV-2 >> Centroid >> ', centroid_vector_2)

In [None]:
# Spread = average row-wise Euclidean distance of each vector to its own centroid.
intraclass_spread_1 = np.linalg.norm(feature_vector_1 - centroid_vector_1, axis=1).mean()
intraclass_spread_2 = np.linalg.norm(feature_vector_2 - centroid_vector_2, axis=1).mean()

print('FV-1 >> Intraclass Spread >> ', intraclass_spread_1)
print('FV-2 >> Intraclass Spread >> ', intraclass_spread_2)

In [None]:
# Distance between the two centroids (Euclidean norm of their difference).
centroid_gap = centroid_vector_1 - centroid_vector_2
interclass_distance = np.linalg.norm(centroid_gap)

print('Interclass Distance:', interclass_distance)

In [None]:
# A2.
# Histogram, mean and variance of the motor_UPDRS column.

num_bins = 10
motor_scores = data['motor_UPDRS']

# Bin counts and bin edges as plain arrays (the plot below bins the data again).
hist, bins = np.histogram(motor_scores, bins=num_bins)

# Summary statistics; np.var is kept so the variance uses numpy's default ddof.
mean = np.mean(motor_scores)
variance = np.var(motor_scores)

print("Mean >> ", mean)
print("Variance >> ", variance)

# Draw the histogram with the same number of bins.
plt.hist(motor_scores, bins=num_bins, alpha=0.5, color='blue', edgecolor='black')
plt.title('Histogram of motor_UPDRS')
plt.xlabel('motor_UPDRS')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

In [None]:
from scipy.spatial import distance

In [None]:
# A3. Take any two feature vectors from your dataset. Calculate the Minkowski distance with r from 1
# to 10. Make a plot of the distance and observe the nature of this graph.

vector1 = np.array(data['motor_UPDRS'])
vector2 = np.array(data['total_UPDRS'])

r_values = range(1, 11)  # r = 1 to 10

# Keep the results under their own name. Binding them to `distance` would
# replace the imported scipy.spatial.distance module with a list, so running
# this cell a second time would fail on `distance.minkowski`.
minkowski_distances = [distance.minkowski(vector1, vector2, p=r) for r in r_values]

plt.plot(r_values, minkowski_distances, marker='o', linestyle='-')
plt.xlabel('r')
plt.ylabel('Minkowski Distance')
plt.title('Minkowski Distance vs. r')
plt.grid(True)
plt.show()



In [None]:
from sklearn.model_selection import train_test_split



In [None]:

# Feature matrix: every column after the two identifier columns.
# Columns are selected by label. The previous positional slice `2:13` stopped
# one column short on this 14-column frame and silently dropped 'PPE'.
feature_columns = ['age', 'sex', 'test_time', 'motor_UPDRS', 'total_UPDRS',
                   'Jitter(%)', 'Shimmer', 'NHR', 'HNR', 'RPDE', 'DFA', 'PPE']
X = data[feature_columns].values

# Target labels: the subject identifier (the column at position 1).
y = data['subject#'].values

In [None]:
# Inspect the feature matrix.
X

In [None]:
# Number of target labels (one per row of X).
y.shape

In [None]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Train a kNN classifier (k = 3) using the training set obtained from the exercise above.
# The following code is provided for help:
# >>> import numpy as np 
# >>> from sklearn.neighbors import KNeighborsClassifier 
# >>> neigh = KNeighborsClassifier(n_neighbors=3) 
# >>> neigh.fit(X, y) 

In [None]:
# A5.
# Fit a k-nearest-neighbours classifier with k = 3 on the training split.

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)

In [None]:
# A6.
# Score the fitted classifier on the held-out test split
# (for scikit-learn classifiers, `score` is the mean accuracy).

model.score(X_test, y_test)

In [None]:
# A7.
# Predicted class labels for the test samples.
model.predict(X_test)

In [None]:
# A8. 
# Make k = 1 to implement NN classifier and compare the results with kNN (k = 3). Vary k from 1 to 
# 11 and make an accuracy plot.


In [None]:
# Nearest-neighbour (k = 1) classifier, kept under its own name so it can be
# compared with the k = 3 model.
model_1 = KNeighborsClassifier(n_neighbors=1)
model_1.fit(X_train, y_train)

In [None]:
# Test-split score of the k = 1 classifier.
model_1.score(X_test, y_test)

In [None]:
# Predicted class labels for the test samples from the k = 1 classifier.
model_1.predict(X_test)

In [None]:
# Vary k from 1 to 11 and make an accuracy plot.
k_values = range(1, 12)

accu_scores = []

for k in k_values:
    # Use a sweep-specific name. Rebinding `model` here would silently replace
    # the k = 3 classifier trained earlier, and every later cell that reads
    # `model` would then evaluate the last classifier of this loop (k = 11).
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train, y_train)
    accuracy = knn_k.score(X_test, y_test)
    accu_scores.append(accuracy)

# Plot test accuracy against k.
plt.figure(figsize=(12, 6))
plt.plot(k_values, accu_scores, marker='o', linestyle='--')
plt.title('Accuracy vs. K Value')
plt.xlabel('K values')
plt.ylabel('Accuracy')
plt.xticks(k_values)
plt.grid(True)
plt.show()

In [None]:
# A9.
# Evaluate the confusion matrix for your classification problem. From the confusion matrix, derive
# the other performance metrics (precision, recall and F1-score) for both training and test
# data. Based on your observations, infer the model's learning outcome (underfit / regular fit / overfit).

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [None]:
# Predict on both splits with whichever classifier is currently bound to `model`.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# NOTE: with average='micro' on a single-label problem, precision, recall and
# F1 all reduce to plain accuracy, so the three numbers per split will match.
micro = {'average': 'micro'}

# ---- Training split ----
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
precision_train = precision_score(y_train, y_train_pred, **micro)
recall_train = recall_score(y_train, y_train_pred, **micro)
f1_score_train = f1_score(y_train, y_train_pred, **micro)

# ---- Test split ----
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, **micro)
recall_test = recall_score(y_test, y_test_pred, **micro)
f1_score_test = f1_score(y_test, y_test_pred, **micro)

# Report: training block first, then test block.
print('Confusion Matrix - Training Data')
print(confusion_matrix_train)
print('Precision - Training Data', precision_train)
print('Recall - Training Data', recall_train)
print('F1 score - Training Data', f1_score_train)

print('Confusion Matrix - Test Data')
print(confusion_matrix_test)
print('Precision - Test Data', precision_test)
print('Recall - Test Data', recall_test)
print('F1 score - Test Data', f1_score_test)




