https://drive.google.com/file/d/1kKFB8IPHhAFmo59KrUAgVVU4nSCpgYVC/view?usp=drive_link

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
cd /content/gdrive/My Drive/CSE4020_ML

/content/gdrive/My Drive/CSE4020_ML


In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Algorithm functions

In [25]:
def naive_bayes(X_train, X_test, y_train, y_test):
    nb_classifier = GaussianNB()
    nb_classifier.fit(X_train, y_train)
    y_pred = nb_classifier.predict(X_test)

    print("Naïve Bayes Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))

In [32]:
def decision_tree_gini(X_train, X_test, y_train, y_test, depth):
    dt_classifier = DecisionTreeClassifier(criterion='gini', max_depth=depth)
    dt_classifier.fit(X_train, y_train)
    y_pred = dt_classifier.predict(X_test)

    print("Decision Tree Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))

In [37]:
def decision_tree_entropy(X_train, X_test, y_train, y_test, depth):
    dt_classifier = DecisionTreeClassifier(criterion='entropy', max_depth=depth)
    dt_classifier.fit(X_train, y_train)
    y_pred = dt_classifier.predict(X_test)

    print("Decision Tree Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))

In [43]:
def logistic_regression(X_train, X_test, y_train, y_test, iterations):
    lr_classifier = LogisticRegression(max_iter=iterations)
    lr_classifier.fit(X_train, y_train)
    y_pred = lr_classifier.predict(X_test)

    print("Logistic Regression Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))

In [49]:
def knn_eucildean(X_train, X_test, y_train, y_test, k):
    knn_classifier = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
    knn_classifier.fit(X_train, y_train)
    y_pred = knn_classifier.predict(X_test)

    print("KNN Classification Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))

In [50]:
def knn_manhattan(X_train, X_test, y_train, y_test, k):
    knn_classifier = KNeighborsClassifier(n_neighbors=k, metric='manhattan')
    knn_classifier.fit(X_train, y_train)
    y_pred = knn_classifier.predict(X_test)

    print("KNN Classification Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))

# Task 1 - Heart Dataset

In [17]:
data = pd.read_csv("heart.csv")
data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [18]:
data.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [19]:
imputer = SimpleImputer(strategy='mean')
data_imputed = imputer.fit_transform(data)
data_imputed = pd.DataFrame(data_imputed, columns=data.columns)
data_imputed.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1.0,39.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,195.0,106.0,70.0,26.97,80.0,77.0,0.0
1,0.0,46.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,250.0,121.0,81.0,28.73,95.0,76.0,0.0
2,1.0,48.0,1.0,1.0,20.0,0.0,0.0,0.0,0.0,245.0,127.5,80.0,25.34,75.0,70.0,0.0
3,0.0,61.0,3.0,1.0,30.0,0.0,0.0,1.0,0.0,225.0,150.0,95.0,28.58,65.0,103.0,1.0
4,0.0,46.0,3.0,1.0,23.0,0.0,0.0,0.0,0.0,285.0,130.0,84.0,23.1,85.0,85.0,0.0


In [21]:
X = data_imputed.drop('TenYearCHD', axis=1)
y = data_imputed['TenYearCHD']

In [22]:
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)
X_norm = pd.DataFrame(X_norm, columns=X.columns)
X_norm

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
0,1.153192,-1.234951,2.007088,-0.988271,-0.757974,-1.758438e-01,-0.077033,-0.671101,-0.162477,-0.941346,-1.195907,-1.082625,0.286943,0.342744,-0.217517
1,-0.867158,-0.418257,0.020905,-0.988271,-0.757974,-1.758438e-01,-0.077033,-0.671101,-0.162477,0.299595,-0.515187,-0.158988,0.719325,1.590275,-0.261311
2,1.153192,-0.184916,-0.972187,1.011868,0.925835,-1.758438e-01,-0.077033,-0.671101,-0.162477,0.186782,-0.220209,-0.242955,-0.113502,-0.073099,-0.524078
3,-0.867158,1.331800,1.013996,1.011868,1.767740,-1.758438e-01,-0.077033,1.490089,-0.162477,-0.264469,0.800871,1.016549,0.682474,-0.904786,0.921141
4,-0.867158,-0.418257,1.013996,1.011868,1.178407,-1.758438e-01,-0.077033,-0.671101,-0.162477,1.089284,-0.106755,0.092912,-0.663807,0.758588,0.132840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4233,1.153192,0.048425,-0.972187,1.011868,-0.673784,-1.758438e-01,-0.077033,1.490089,-0.162477,1.721036,2.116930,0.764648,0.041271,-0.821617,0.176634
4234,1.153192,0.165095,1.013996,1.011868,2.862216,-1.758438e-01,-0.077033,-0.671101,-0.162477,-0.670595,-0.265590,-0.242955,-1.496634,-0.904786,-0.611667
4235,-0.867158,-0.184916,0.020905,1.011868,0.925835,-2.059023e-17,-0.077033,-0.671101,-0.162477,0.254470,-0.061374,-0.914691,-0.934046,0.675419,0.176634
4236,-0.867158,-0.651598,-0.972187,1.011868,0.504883,-1.758438e-01,-0.077033,-0.671101,-0.162477,-0.602907,-0.265590,0.344813,-1.631754,0.841756,0.000000


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.2, random_state=42)

In [24]:
naive_bayes(X_train, X_test, y_train, y_test)

Naïve Bayes Confusion Matrix:
[[676  48]
 [100  24]]
Accuracy: 0.8254716981132075
Precision: 0.7924933540815665
Recall: 0.8254716981132075



In [33]:
decision_tree_gini(X_train, X_test, y_train, y_test,2)

Decision Tree Confusion Matrix:
[[723   1]
 [124   0]]
Accuracy: 0.8525943396226415
Precision: 0.7287819384731906
Recall: 0.8525943396226415


In [38]:
decision_tree_entropy(X_train, X_test, y_train, y_test,2)

Decision Tree Confusion Matrix:
[[724   0]
 [124   0]]
Accuracy: 0.8537735849056604
Precision: 0.7289293342826628
Recall: 0.8537735849056604


  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
decision_tree_gini(X_train, X_test, y_train, y_test,4)

Decision Tree Confusion Matrix:
[[696  28]
 [106  18]]
Accuracy: 0.8419811320754716
Precision: 0.7981497241310178
Recall: 0.8419811320754716


In [41]:
decision_tree_entropy(X_train, X_test, y_train, y_test,4)

Decision Tree Confusion Matrix:
[[695  29]
 [104  20]]
Accuracy: 0.8431603773584906
Precision: 0.8023283580794823
Recall: 0.8431603773584906


In [44]:
logistic_regression(X_train, X_test, y_train, y_test, 200)

Logistic Regression Confusion Matrix:
[[718   6]
 [115   9]]
Accuracy: 0.8573113207547169
Precision: 0.823641532084532
Recall: 0.8573113207547169


In [48]:
logistic_regression(X_train, X_test, y_train, y_test, 25)

Logistic Regression Confusion Matrix:
[[718   6]
 [115   9]]
Accuracy: 0.8573113207547169
Precision: 0.823641532084532
Recall: 0.8573113207547169


In [51]:
knn_eucildean(X_train, X_test, y_train, y_test, 3)

KNN Classification Confusion Matrix:
[[676  48]
 [106  18]]
Accuracy: 0.8183962264150944
Precision: 0.7779246160392711
Recall: 0.8183962264150944


In [52]:
knn_manhattan(X_train, X_test, y_train, y_test, 3)

KNN Classification Confusion Matrix:
[[669  55]
 [104  20]]
Accuracy: 0.8125
Precision: 0.7778999568779648
Recall: 0.8125


In [54]:
knn_eucildean(X_train, X_test, y_train, y_test, 8)

KNN Classification Confusion Matrix:
[[711  13]
 [118   6]]
Accuracy: 0.8455188679245284
Precision: 0.7784240713078415
Recall: 0.8455188679245284


> Observation:

*   For the **decision tree algorithm**, using the entropy instead of gini index to calculate information gain allows for higher accuarcy, precision and recall. A lower max depth of 2 provides better accuracy and recall but lower precision than a higher max depth of 4.
*   For the **logistic regression algorithm**, increasing the number of iterations from 25 to 200 seems to have no significant effect on accuracy, precision or recall and so 25 iterations seems an sufficient amount
* For the **kNN classiffication algorithm**, the eucilidian distance metric allows for higher accuracy and recall but lower precision than the manhattan distance metric. Increasing the k value from 3 to 8, improves all performance metrics and hence is a more suitable hyperparameter.







# Task 2 - Music Emotion Dataset

In [55]:
data = pd.read_csv("music.csv")
data.head()

Unnamed: 0,Class,_RMSenergy_Mean,_Lowenergy_Mean,_Fluctuation_Mean,_Tempo_Mean,_MFCC_Mean_1,_MFCC_Mean_2,_MFCC_Mean_3,_MFCC_Mean_4,_MFCC_Mean_5,...,_Chromagram_Mean_9,_Chromagram_Mean_10,_Chromagram_Mean_11,_Chromagram_Mean_12,_HarmonicChangeDetectionFunction_Mean,_HarmonicChangeDetectionFunction_Std,_HarmonicChangeDetectionFunction_Slope,_HarmonicChangeDetectionFunction_PeriodFreq,_HarmonicChangeDetectionFunction_PeriodAmp,_HarmonicChangeDetectionFunction_PeriodEntropy
0,relax,0.052,0.591,9.136,130.043,3.997,0.363,0.887,0.078,0.221,...,0.426,1.0,0.008,0.101,0.316,0.261,0.018,1.035,0.593,0.97
1,relax,0.125,0.439,6.68,142.24,4.058,0.516,0.785,0.397,0.556,...,0.002,1.0,0.0,0.984,0.285,0.211,-0.082,3.364,0.702,0.967
2,relax,0.046,0.639,10.578,188.154,2.775,0.903,0.502,0.329,0.287,...,0.184,0.746,0.016,1.0,0.413,0.299,0.134,1.682,0.692,0.963
3,relax,0.135,0.603,10.442,65.991,2.841,1.552,0.612,0.351,0.011,...,0.038,1.0,0.161,0.757,0.422,0.265,0.042,0.354,0.743,0.968
4,relax,0.066,0.591,9.769,88.89,3.217,0.228,0.814,0.096,0.434,...,0.004,0.404,1.0,0.001,0.345,0.261,0.089,0.748,0.674,0.957


In [59]:
X = data.drop('Class', axis=1)
y = data['Class']

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)


label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)
X_train

array([[0.29691211, 0.75062344, 0.34491078, ..., 0.03884624, 0.56349206,
        0.76315789],
       [0.152019  , 0.63341646, 0.14420709, ..., 0.18004187, 0.42063492,
        0.71052632],
       [0.17814727, 0.69326683, 0.12837396, ..., 0.04861596, 0.11111111,
        0.71052632],
       ...,
       [0.56057007, 0.75561097, 0.17014325, ..., 0.16515469, 0.65873016,
        0.78947368],
       [0.53444181, 0.50124688, 0.07393818, ..., 0.47825076, 0.86772487,
        0.73684211],
       [0.24940618, 0.6159601 , 0.26021613, ..., 0.40358223, 0.82539683,
        0.71052632]])

In [60]:
naive_bayes(X_train, X_test, y_train, y_test)

Naïve Bayes Confusion Matrix:
[[11  5  1  1]
 [ 1 17  0  0]
 [ 0  1 24  1]
 [ 3  2  4  9]]
Accuracy: 0.7625
Precision: 0.7710564263322884
Recall: 0.7625


In [62]:
decision_tree_gini(X_train, X_test, y_train, y_test,4)

Decision Tree Confusion Matrix:
[[ 8  2  0  8]
 [ 1 16  0  1]
 [ 0  0 14 12]
 [ 0  2  5 11]]
Accuracy: 0.6125
Precision: 0.6968174342105262
Recall: 0.6125


In [63]:
decision_tree_entropy(X_train, X_test, y_train, y_test,4)

Decision Tree Confusion Matrix:
[[ 8  3  5  2]
 [ 1 16  0  1]
 [ 0  0 11 15]
 [ 0  2  2 14]]
Accuracy: 0.6125
Precision: 0.6684771825396826
Recall: 0.6125


In [65]:
decision_tree_gini(X_train, X_test, y_train, y_test,8)

Decision Tree Confusion Matrix:
[[11  3  1  3]
 [ 2 13  1  2]
 [ 2  0 13 11]
 [ 0  2  6 10]]
Accuracy: 0.5875
Precision: 0.6152289377289376
Recall: 0.5875


In [64]:
decision_tree_entropy(X_train, X_test, y_train, y_test,8)

Decision Tree Confusion Matrix:
[[11  1  1  5]
 [ 0 17  0  1]
 [ 1  0 15 10]
 [ 2  2  6  8]]
Accuracy: 0.6375
Precision: 0.6646266233766234
Recall: 0.6375


In [66]:
logistic_regression(X_train, X_test, y_train, y_test, 200)

Logistic Regression Confusion Matrix:
[[12  2  1  3]
 [ 1 17  0  0]
 [ 0  0 21  5]
 [ 2  2  5  9]]
Accuracy: 0.7375
Precision: 0.7340382819794584
Recall: 0.7375


In [74]:
logistic_regression(X_train, X_test, y_train, y_test, 50)

Logistic Regression Confusion Matrix:
[[12  2  1  3]
 [ 1 17  0  0]
 [ 0  0 22  4]
 [ 2  2  5  9]]
Accuracy: 0.75
Precision: 0.7440625
Recall: 0.75


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [75]:
knn_eucildean(X_train, X_test, y_train, y_test, 3)

KNN Classification Confusion Matrix:
[[13  4  1  0]
 [ 1 17  0  0]
 [ 8  5  7  6]
 [ 5  4  5  4]]
Accuracy: 0.5125
Precision: 0.5008333333333334
Recall: 0.5125


In [76]:
knn_manhattan(X_train, X_test, y_train, y_test, 3)

KNN Classification Confusion Matrix:
[[12  4  2  0]
 [ 1 16  1  0]
 [ 6  2  7 11]
 [ 3  5  6  4]]
Accuracy: 0.4875
Precision: 0.458248106060606
Recall: 0.4875


In [77]:
knn_eucildean(X_train, X_test, y_train, y_test, 8)

KNN Classification Confusion Matrix:
[[12  5  1  0]
 [ 1 17  0  0]
 [ 5  5 10  6]
 [ 5  4  5  4]]
Accuracy: 0.5375
Precision: 0.5339034011220196
Recall: 0.5375


In [78]:
knn_manhattan(X_train, X_test, y_train, y_test, 8)

KNN Classification Confusion Matrix:
[[13  5  0  0]
 [ 0 18  0  0]
 [ 6  3 11  6]
 [ 4  5  5  4]]
Accuracy: 0.575
Precision: 0.5712565743338008
Recall: 0.575


> Observation:

*   For the **decision tree algorithm**, using gini and enrtropy metric provide similar values for accuracy and recall although the entropy metric gives higher precision. Increasing the max depth from 4 to 8 allows for better performace metrics in entropy based decision tree yet worse performance metrics for the gini index based algorithm.
*   For the **logistic regression algorithm**, decresing the number of iterations from 200 to 50 seems to cause issues and hence is an insufficent amount of iteration for an optimal algorithm
* For the **kNN classiffication algorithm**, the eucilidian distance metric allows for higher accuracy and recall when the k value is 3 but manhattan distance metric is bettern when k value is 8.





