In [1]:
from sklearn.datasets import fetch_openml

In [2]:
# Download (or load from the local scikit-learn cache) the 784-pixel MNIST
# dataset from OpenML. `as_frame=True` is requested explicitly because later
# cells read the pixel names from `X_train.columns`, which requires the
# features to be a pandas DataFrame rather than a NumPy array.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)

In [3]:
# Show which fields the returned dataset object exposes
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

# Q.1

In [4]:
from sklearn.model_selection import train_test_split
import numpy as np

In [5]:
# Separate the feature matrix (X) from the label column (y)
X, y = mnist['data'], mnist['target']

In [6]:
# Split sizes as fractions of the full dataset: 50% train, 10% validation,
# 10% test (the remaining 30% of the rows is not assigned to any split)
n_samples = X.shape[0]
n_train, n_val, n_test = (int(frac * n_samples) for frac in (0.5, 0.1, 0.1))

In [7]:
# Carve out three consecutive, non-overlapping row ranges (no shuffling):
#   [0, n_train)            -> training
#   [n_train, val_stop)     -> validation
#   [val_stop, test_stop)   -> test
val_stop = n_train + n_val
test_stop = val_stop + n_test

X_train, y_train = X[:n_train], y[:n_train]
X_val, y_val = X[n_train:val_stop], y[n_train:val_stop]
X_test, y_test = X[val_stop:test_stop], y[val_stop:test_stop]

In [8]:
# Sanity check: (rows, features) of the training split
X_train.shape

(35000, 784)

In [9]:
# Sanity check: (rows, features) of the validation split
X_val.shape

(7000, 784)

In [10]:
# Sanity check: (rows, features) of the test split
X_test.shape

(7000, 784)

# Q2

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest with otherwise-default hyperparameters; the seed is fixed for
# reproducibility and n_jobs=-1 is passed to parallelise the work.
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Fit on the training split only (validation/test rows stay unseen)
rf_clf.fit(X_train, y_train)

In [12]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble with otherwise-default hyperparameters; the seed is fixed
# for reproducibility and n_jobs=-1 is passed to parallelise the work.
bag_clf = BaggingClassifier(random_state=42, n_jobs=-1)

# Fit on the training split only (validation/test rows stay unseen)
bag_clf.fit(X_train, y_train)

In [13]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with default hyperparameters and a fixed seed
dt_clf = DecisionTreeClassifier(random_state=42)

# Fit on the training split only (validation/test rows stay unseen)
dt_clf.fit(X_train, y_train)

# Q.3

In [14]:
def rank_top_features(model, feature_names, k=10):
    """Return the k most important features of a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : index/array of names, aligned with the model's features.
    k : number of top features to keep (default 10).

    Returns
    -------
    (importances, top_indices, top_names) where ``top_indices`` are the
    positions of the k largest importances in descending order and
    ``top_names`` are the matching entries of ``feature_names``.
    """
    importances = model.feature_importances_
    # argsort is ascending, so reverse it before taking the first k
    top_indices = np.argsort(importances)[::-1][:k]
    return importances, top_indices, feature_names[top_indices]


def print_ranking(heading, features):
    """Print a heading followed by a 1-based numbered list of feature names."""
    print(heading)
    for rank, feature in enumerate(features, start=1):
        print(f"{rank}. {feature}")


# Decision Tree: importances, top-ten indices and the matching pixel names
dt_feature_importances, sorted_dt_feature_indices, top_ten_dt_features = (
    rank_top_features(dt_clf, X_train.columns)
)
print_ranking("Top ten variable importance ranking from Decision Tree classifier:",
              top_ten_dt_features)

# Random Forest: same ranking procedure, so the two lists are comparable
rf_feature_importances, sorted_rf_feature_indices, top_ten_rf_features = (
    rank_top_features(rf_clf, X_train.columns)
)
print_ranking("\nTop ten variable importance ranking from Random Forest classifier:",
              top_ten_rf_features)


Top ten variable importance ranking from Decision Tree classifier:
1. pixel490
2. pixel351
3. pixel436
4. pixel543
5. pixel433
6. pixel212
7. pixel271
8. pixel156
9. pixel598
10. pixel235

Top ten variable importance ranking from Random Forest classifier:
1. pixel351
2. pixel379
3. pixel462
4. pixel438
5. pixel407
6. pixel434
7. pixel410
8. pixel378
9. pixel489
10. pixel382


In [15]:
# Features that appear in BOTH top-ten lists (Random Forest and Decision Tree)
set(top_ten_rf_features) & set(top_ten_dt_features)

{'pixel351'}

__Interpretation:__

- For both classifiers, pixel351 appears in the top two, suggesting it's crucial for making predictions.
- Only pixel351 appears in both top-ten lists (ranked 1st by the Random Forest and 2nd by the Decision Tree); the other nine pixels in each list differ. For example, pixel490 is the highest-contributing feature for the Decision Tree but does not appear in the Random Forest's top ten.
- The rankings provide information about the importance of each pixel in predicting the target variable.
- Features at the top of the list have a higher impact on the model's decisions.

# Q.4

In [16]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Majority-vote (hard voting) ensemble over the three base models
voting_clf = VotingClassifier(
    estimators=[('rf', rf_clf), ('bag', bag_clf), ('dt', dt_clf)],
    voting='hard',
    n_jobs=-1,
    verbose=True,
)
voting_clf.fit(X_train, y_train)

# Score the ensemble and each individual model on the validation split,
# keeping both the raw predictions and the resulting accuracy
voting_val_preds = voting_clf.predict(X_val)
voting_val_acc = accuracy_score(y_val, voting_val_preds)

rf_val_preds = rf_clf.predict(X_val)
rf_val_acc = accuracy_score(y_val, rf_val_preds)

dt_val_preds = dt_clf.predict(X_val)
dt_val_acc = accuracy_score(y_val, dt_val_preds)

bag_val_preds = bag_clf.predict(X_val)
bag_val_acc = accuracy_score(y_val, bag_val_preds)

# Report in the same order as before: ensemble first, then the base models
print("Hard voting validation accuracy:", voting_val_acc)
print("Randomforest validation accuracy:", rf_val_acc)
print("Decision Tree validation accuracy:", dt_val_acc)
print("Bagging validation accuracy:", bag_val_acc)

Hard voting validation accuracy: 0.9485714285714286
Randomforest validation accuracy: 0.9678571428571429
Decision Tree validation accuracy: 0.8502857142857143
Bagging validation accuracy: 0.9337142857142857


# Q5 
__Does the ensemble outperform the individual classifiers?__

- No. The hard-voting ensemble (94.86%) performed better than the Decision Tree (85.03%) and Bagging (93.37%) classifiers, but the Random Forest classifier is still the best with 96.79% validation accuracy.

# Q6 and Q7
- The Decision Tree classifier has the lowest validation accuracy (85.03%), so it is removed from the hard-voting ensemble.

In [17]:
# Rebuild the hard-voting ensemble without the Decision Tree.
# NOTE: this rebinds `voting_clf`, replacing the three-model ensemble.
voting_clf = VotingClassifier(estimators=[('rf', rf_clf),
                                          ('bag', bag_clf)],
                                          voting='hard',
                                          n_jobs=-1,verbose=True)

# Train the VotingClassifier on the training data
voting_clf.fit(X_train, y_train)

# Predictions on the held-out TEST split. These are stored under `*_test_*`
# names so they are not mislabelled as validation results and do not
# overwrite the validation predictions/accuracies computed earlier.
voting_test_preds = voting_clf.predict(X_test)
rf_test_preds = rf_clf.predict(X_test)
bag_test_preds = bag_clf.predict(X_test)

# Calculate the test accuracy of the ensemble and of each remaining model
voting_test_acc = accuracy_score(y_test, voting_test_preds)
rf_test_acc = accuracy_score(y_test, rf_test_preds)
bag_test_acc = accuracy_score(y_test, bag_test_preds)
print("Hard voting Testing accuracy:", voting_test_acc)
print("Randomforest Testing accuracy:", rf_test_acc)
print("Bagging Testing accuracy:", bag_test_acc)

Hard voting Testing accuracy: 0.9384285714285714
Randomforest Testing accuracy: 0.959
Bagging Testing accuracy: 0.9274285714285714


# Q8

On the test set, the hard-voting ensemble (93.84% accuracy) outperforms the Bagging classifier (92.74%) by about 1.1 percentage points, but Random Forest is still the best with 95.9% accuracy.

In [None]:
# NOTE(review): stray literal in an unexecuted cell; it is not referenced by
# any cell above and looks like leftover scratch — candidate for removal.
56