In [34]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier, VotingClassifier, AdaBoostClassifier

In [35]:
df = pd.read_csv("letter-recognition.data.txt", header=None)
print(df.head())


  0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16
0  T   2   8   3   5   1   8  13   0   6   6  10   8   0   8   0   8
1  I   5  12   3   7   2  10   5   5   4  13   3   9   2   8   4  10
2  D   4  11   6   8   6  10   6   2   6  10   3   7   3   7   3   9
3  N   7  11   6   6   3   5   9   4   6   4   4  10   6  10   2   8
4  G   2   1   3   1   1   8   6   6   6   6   5   9   1   7   5  10


In [36]:
X = df.iloc[:, 1:]
Y = df.iloc[:, 0]

In [37]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=30)

In [38]:
# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
dt_pred = dt.predict(X_test)
print("\n--- Decision Tree ---")
print("Confusion Matrix:\n", confusion_matrix(Y_test, dt_pred))
print("\nClassification Report:\n", classification_report(Y_test, dt_pred))
print("Accuracy:", accuracy_score(Y_test, dt_pred))



--- Decision Tree ---
Confusion Matrix:
 [[209   0   0   0   0   2   2   2   0   0   0   2   3   1   0   1   0   0
    1   2   2   0   0   0   1   1]
 [  1 186   0   3   0   3   2   3   1   3   4   1   1   0   3   2   0   2
    5   0   0   2   1   1   3   1]
 [  0   2 186   0   6   0  10   0   1   0   1   0   0   0   5   0   1   2
    1   3   0   1   0   0   1   0]
 [  0   5   0 186   0   2   1   5   0   0   1   0   0   2   5   2   1   1
    5   0   0   0   0   0   2   1]
 [  0   1   8   0 183   6   5   2   1   2   2   1   0   0   0   1   3   0
    4   1   1   0   0   8   0   3]
 [  0   1   0   0   4 186   0   2   0   1   1   0   1   0   0   7   0   2
    2   7   0   1   2   5   3   0]
 [  0   1   4   2   3   5 189   1   1   0   3   1   1   0   3   1   5   3
    3   0   0   3   1   0   2   2]
 [  0   3   0   5   3   3   3 159   0   1   7   1   1   2   3   1   2   7
    2   0   1   0   0   1   1   0]
 [  1   2   0   0   1   0   0   0 221   4   0   1   0   0   0   2   0   0
    1   0   

In [39]:
# KNN
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
knn_pred = knn.predict(X_test)
print("\n--- KNN ---")
print("Confusion Matrix:\n", confusion_matrix(Y_test, knn_pred))
print("\nClassification Report:\n", classification_report(Y_test, knn_pred))
print("Accuracy:", accuracy_score(Y_test, knn_pred))


--- KNN ---
Confusion Matrix:
 [[227   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0
    0   1   0   0   0   0   0   0]
 [  0 217   0   1   2   0   0   1   0   1   0   0   0   0   0   0   0   4
    0   0   0   0   0   2   0   0]
 [  0   0 206   0   1   0   3   0   0   0   0   1   0   0   3   0   0   1
    0   0   0   2   1   2   0   0]
 [  0   1   0 210   0   0   0   3   0   0   0   0   0   2   1   0   0   1
    0   0   0   0   0   1   0   0]
 [  0   2   4   0 214   1   3   0   0   0   1   0   0   0   0   0   0   0
    2   0   0   1   0   1   0   3]
 [  0   0   0   1   1 203   0   1   2   0   0   0   0   1   0   9   0   0
    0   4   0   2   1   0   0   0]
 [  0   2   0   3   3   0 214   1   0   0   0   0   0   0   7   0   1   1
    0   0   0   0   1   1   0   0]
 [  0   5   1   4   0   0   2 186   0   0   4   0   0   0   0   0   0   4
    0   0   0   0   0   0   0   0]
 [  0   0   1   0   0   0   0   0 230   4   0   0   0   0   0   1   0   0
    0   0   0   0   0 

In [40]:
# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, Y_train)
nb_pred = nb.predict(X_test)
print("\n--- Naive Bayes ---")
print("Confusion Matrix:\n", confusion_matrix(Y_test, nb_pred))
print("\nClassification Report:\n", classification_report(Y_test, nb_pred))
print("Accuracy:", accuracy_score(Y_test, nb_pred))


--- Naive Bayes ---
Confusion Matrix:
 [[191   0   0   2   0   0   0   2   0   2   1   0   7   2   0   0   2   3
    9   0   2   0   1   2   3   0]
 [  0 151   0  10   0   0   4   5  26   2   5   0   6   0   1   0   1  10
    2   0   0   0   3   2   0   0]
 [  0   0 161   0   9   0  11   0   0   0  21   0   3   0   3   0   3   1
    5   2   1   0   0   0   0   0]
 [  2  16   0 154   0   1   0   1  12   5   1   0   1   1  11   2   0   5
    7   0   0   0   0   0   0   0]
 [  0   3   3   0  72   1  43   1  18   0  12   0   0   0   0   0  13   1
    9   1   1   0   1  47   1   5]
 [  0  13   0  12   0 157   5   1   1   0   0   0   1   4   0   9   6   1
    2   4   0   0   4   3   2   0]
 [  3   5  28  11   0   1 118   1   5   0   7   0   7   0   5   0  22   5
    9   0   0   0   6   1   0   0]
 [  2   7   0  18   0   2   3  66   4   0   9   0  10   0  33   0   2  14
    1   0   2   0   4  26   3   0]
 [  0   5   0  15   3   1   0   0 188  10   0   1   0   0   0   1   1   0
   11   0   0 

In [41]:
# Bagging
bagging = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, random_state=30)
bagging.fit(X_train, Y_train)
bag_pred = bagging.predict(X_test)
print("\n--- Bagging ---")
print("Confusion Matrix:\n", confusion_matrix(Y_test, bag_pred))
print("\nClassification Report:\n", classification_report(Y_test, bag_pred))
print("Accuracy:", accuracy_score(Y_test, bag_pred))


--- Bagging ---
Confusion Matrix:
 [[223   0   0   0   0   0   0   0   0   0   0   2   1   0   0   0   0   0
    2   0   0   0   0   0   1   0]
 [  0 216   0   0   0   2   2   2   0   0   0   0   1   0   0   0   0   0
    0   0   0   2   0   3   0   0]
 [  0   0 201   0   6   0   7   0   0   0   0   0   0   0   4   0   0   1
    0   0   0   0   0   1   0   0]
 [  0   2   0 208   0   0   0   1   0   0   0   0   0   0   2   1   0   1
    2   0   0   0   0   0   2   0]
 [  0   0   4   0 211   1   3   0   0   0   3   0   0   0   0   1   1   0
    3   0   0   0   0   3   0   2]
 [  0   3   0   0   1 199   0   1   0   1   0   0   0   1   0  10   0   0
    2   5   0   1   1   0   0   0]
 [  0   2   2   4   5   0 213   1   0   0   0   0   0   0   3   0   3   0
    0   0   0   1   0   0   0   0]
 [  0   2   0   4   0   1   2 183   0   0   4   0   0   0   3   0   0   6
    1   0   0   0   0   0   0   0]
 [  0   1   0   1   1   1   0   0 224   6   0   0   0   0   0   2   0   0
    0   0   0   0 

In [42]:
# Boosting (AdaBoost)
boosting = AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, random_state=30)
boosting.fit(X_train, Y_train)
boost_pred = boosting.predict(X_test)
print("\n--- Boosting ---")
print("Confusion Matrix:\n", confusion_matrix(Y_test, boost_pred))
print("\nClassification Report:\n", classification_report(Y_test, boost_pred))
print("Accuracy:", accuracy_score(Y_test, boost_pred))


--- Boosting ---




Confusion Matrix:
 [[211   0   0   0   0   0   1   2   0   1   1   2   2   1   0   1   0   1
    3   0   2   0   0   0   1   0]
 [  1 184   0   3   0   3   2   3   0   3   4   0   0   0   4   2   3   1
    6   0   0   2   1   1   5   0]
 [  0   2 187   0   5   0  13   0   0   0   2   0   0   1   3   0   2   2
    0   1   1   1   0   0   0   0]
 [  0   3   0 186   0   4   1   3   0   0   1   0   0   3   5   2   1   2
    6   0   0   0   0   0   1   1]
 [  0   1   7   0 189   4   5   2   0   2   3   0   0   0   0   3   1   1
    4   1   0   0   0   7   0   2]
 [  0   1   0   0   2 186   0   3   4   1   0   0   0   1   3   6   0   1
    1   7   0   1   2   1   3   2]
 [  0   1   3   2   2   3 193   0   1   0   3   1   1   1   2   0   5   2
    5   1   1   3   1   0   1   2]
 [  1   1   1   5   4   2   1 162   0   0   7   3   0   1   2   0   2   6
    3   1   2   0   0   0   1   1]
 [  1   2   0   1   1   0   0   0 218   4   0   1   0   0   0   2   0   0
    2   0   0   0   0   1   1   2]

In [43]:
# Voting
voting = VotingClassifier(estimators=[('dt', DecisionTreeClassifier()), 
                                      ('knn', KNeighborsClassifier()), 
                                      ('nb', GaussianNB())], voting='hard')
voting.fit(X_train, Y_train)
vote_pred = voting.predict(X_test)
print("\n--- Voting ---")
print("Confusion Matrix:\n", confusion_matrix(Y_test, vote_pred))
print("\nClassification Report:\n", classification_report(Y_test, vote_pred))
print("Accuracy:", accuracy_score(Y_test, vote_pred))


--- Voting ---
Confusion Matrix:
 [[223   0   0   0   0   0   0   0   0   0   0   1   1   2   0   0   0   0
    0   0   0   0   0   0   2   0]
 [  0 220   0   1   1   0   0   2   0   0   0   0   0   0   0   0   0   2
    0   0   0   0   0   2   0   0]
 [  0   2 208   0   2   0   4   0   0   0   0   0   0   0   3   0   0   0
    0   0   0   1   0   0   0   0]
 [  0   9   0 206   0   0   0   0   0   0   0   0   0   0   2   1   0   0
    1   0   0   0   0   0   0   0]
 [  0   3   7   0 203   3   4   0   0   0   2   0   0   0   0   0   0   0
    2   0   0   0   0   5   0   3]
 [  0   9   0   8   2 195   0   1   0   0   0   0   0   0   0   5   0   0
    0   3   0   1   1   0   0   0]
 [  4   4   2   8   4   4 200   0   0   0   1   0   0   0   2   0   1   1
    2   0   0   0   1   0   0   0]
 [  4   8   1   9   0   1   5 169   0   0   3   0   1   0   0   0   1   3
    0   0   0   0   0   0   1   0]
 [  0   5   1   3   1   0   0   0 223   2   0   0   0   0   0   1   0   0
    0   0   0   0  

Q1: Ensemble Method by manipulation of Dataset (Bagged Decision Trees)

In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

df = pd.read_csv('letter-recognition.data.txt', header=None)
X = df.iloc[:, 1:]
Y = df.iloc[:, 0]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=30)

dtree = DecisionTreeClassifier(criterion="entropy", random_state=30)
dtree.fit(X_train, Y_train)
y_pred_dt = dtree.predict(X_test)
print("\nDecision Tree Accuracy:", accuracy_score(Y_test, y_pred_dt))

num_trees = 5
model = BaggingClassifier(estimator=DecisionTreeClassifier(criterion="entropy", random_state=30),
                          n_estimators=num_trees, random_state=30)
model.fit(X_train, Y_train)
y_pred_bag = model.predict(X_test)
print("\nBagging Accuracy:", accuracy_score(Y_test, y_pred_bag))

scores_dt = cross_val_score(dtree, X_train, Y_train, cv=5, scoring='accuracy')
scores_bag = cross_val_score(model, X_train, Y_train, cv=5, scoring='accuracy')
print("\nDecision Tree CV Accuracy:", scores_dt.mean())
print("\nBagging CV Accuracy:", scores_bag.mean())



Decision Tree Accuracy: 0.8651666666666666

Bagging Accuracy: 0.892

Decision Tree CV Accuracy: 0.8633571428571427

Bagging CV Accuracy: 0.8875


Q2: Ensemble Method by manipulation of Classifiers (Voting Classifier)

In [45]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

dt = DecisionTreeClassifier()
knn3 = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
knn5 = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn5_man = KNeighborsClassifier(n_neighbors=5, metric='manhattan')
nb = GaussianNB()

voting = VotingClassifier(estimators=[
    ('dt', dt),
    ('knn3', knn3),
    ('knn5', knn5),
    ('knn5_man', knn5_man),
    ('nb', nb)], voting='hard')

voting.fit(X_train, Y_train)
y_pred_vote = voting.predict(X_test)
print("\nVoting Classifier Accuracy:", accuracy_score(Y_test, y_pred_vote))

scores_vote = cross_val_score(voting, X_train, Y_train, cv=5, scoring='accuracy')
print("\nVoting CV Accuracy:", scores_vote.mean())



Voting Classifier Accuracy: 0.9483333333333334

Voting CV Accuracy: 0.9454285714285714


Q3: Manipulating the Features

In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

df = pd.read_csv('letter-recognition.data.txt', header=None)
X = df.iloc[:, 1:]
Y = df.iloc[:, 0]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=30)

np.random.seed(30)
for i in range(5):
    feature_set = np.random.choice(X.columns, 10, replace=False)
    dt = DecisionTreeClassifier(max_depth=5, random_state=30)
    dt.fit(X_train[feature_set], Y_train)
    y_pred = dt.predict(X_test[feature_set])
    print(f"\nDecision Tree Accuracy with Feature Set {i+1}:", accuracy_score(Y_test, y_pred))



Decision Tree Accuracy with Feature Set 1: 0.3665

Decision Tree Accuracy with Feature Set 2: 0.3125

Decision Tree Accuracy with Feature Set 3: 0.331

Decision Tree Accuracy with Feature Set 4: 0.3225

Decision Tree Accuracy with Feature Set 5: 0.3405


Q4: Manipulating the Classes

In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

df = pd.read_csv('letter-recognition.data.txt', header=None)
X = df.iloc[:, 1:]
Y = df.iloc[:, 0]

np.random.seed(30)
for i in range(5):
    random_vector = np.random.choice([0, 1], size=len(Y))
    Y_binary = ['Group1' if val == 0 else 'Group2' for val in random_vector]
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y_binary, test_size=0.30, random_state=30)
    
    dt = DecisionTreeClassifier(max_depth=5, random_state=30)
    dt.fit(X_train, Y_train)
    y_pred = dt.predict(X_test)
    
    print(f"\nDecision Tree Accuracy for Random Class Split {i+1}:", accuracy_score(Y_test, y_pred))


Decision Tree Accuracy for Random Class Split 1: 0.5053333333333333

Decision Tree Accuracy for Random Class Split 2: 0.5155

Decision Tree Accuracy for Random Class Split 3: 0.5018333333333334

Decision Tree Accuracy for Random Class Split 4: 0.5055

Decision Tree Accuracy for Random Class Split 5: 0.49433333333333335
