In [179]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import keras

In [102]:
np.random.seed(9)

## PART 1

In [103]:
df = keras.datasets.mnist.load_data(path="mnist.npz")

In [104]:
train, test = df[0], df[1]

In [105]:
train_x, train_y = train[0], train[1]

In [106]:
test_x, test_y = test[0], test[1]

In [107]:
train_x = train_x.reshape(-1, 784)
test_x = test_x.reshape(-1, 784)

In [108]:
print(train_x.shape)
print(test_x.shape)

(60000, 784)
(10000, 784)


In [109]:
clf = DecisionTreeClassifier()
clf.fit(train_x, train_y)

In [110]:
np.round(clf.score(test_x, test_y) * 100, 2)

87.87

## PART 2

In [111]:
# Select 50% of data with 70% of features.

In [112]:
train_x.shape

(60000, 784)

In [113]:
len_data = np.int64(train_x.shape[0] * 0.5)
len_features = np.int64(np.round(train_x.shape[1] * 0.7))

In [114]:
data_idx = np.random.choice(train_x.shape[0], size = len_data, replace=False)
feature_idx = np.random.choice(train_x.shape[1], size = len_features, replace=False)

In [115]:
# Turning to functions
def get_data_feature(X, y):
    len_data = np.int64(X.shape[0] * 0.5)
    len_features = np.int64(np.round(X.shape[1] * 0.7))
    
    data_idx = np.random.choice(X.shape[0], size = len_data, replace=False)
    feature_idx = np.random.choice(X.shape[1], size = len_features, replace=False)

    return X[:, feature_idx][data_idx], y[data_idx], data_idx, feature_idx

In [116]:
# Checking just to be sure:
print(data_idx.shape, feature_idx.shape)

(30000,) (549,)


In [117]:
def classify_score(train_x, train_y, test_x ,test_y):
    clf = DecisionTreeClassifier()
    clf.fit(train_x, train_y)
    return np.round(clf.score(test_x, test_y) * 100, 2), clf

In [118]:
# MAJOR PROBLEM WITH TESTING 784 and 549

In [193]:
voters = {}
for i in range(20):
    print(f'Running {i} out of 20.')
    X, y, m, n = get_data_feature(train_x, train_y)
    test_i = test_x[:, n]
    score, model = classify_score(X, y, test_i, test_y)
    voters[i] = [model, n, score]

Running 0 out of 20.
Running 1 out of 20.
Running 2 out of 20.
Running 3 out of 20.
Running 4 out of 20.
Running 5 out of 20.
Running 6 out of 20.
Running 7 out of 20.
Running 8 out of 20.
Running 9 out of 20.
Running 10 out of 20.
Running 11 out of 20.
Running 12 out of 20.
Running 13 out of 20.
Running 14 out of 20.
Running 15 out of 20.
Running 16 out of 20.
Running 17 out of 20.
Running 18 out of 20.
Running 19 out of 20.


In [203]:
def get_most_popular(voters):
    # This is a list of all outputs by all 20 models
    voters_out = []
    # Not very clever to use for-loop but im on a tight deadline.
    for i, j in voters.items():
        # Which features to drop for each model
        feature_drop_scheme = j[1]
        # Get votes.
        voters_out.append(j[0].predict(test_x[:, feature_drop_scheme]))

    # Make a dataframe so it will be manipulated with ease.
    voters_output_df = pd.DataFrame(data = voters_out, index = np.arange(1, 21))
    out_popular = np.array(voters_output_df.mode(axis=0).loc[0])
    return out_popular

In [None]:
def get_most_popular_weighted(voters):
    # This is a list of all outputs by all 20 models
    voters_out = []
    sum_weights = np.sum([j[2] for i, j in voters.items()])
    # Not very clever to use for-loop but im on a tight deadline.
    for i, j in voters.items():
        # Which features to drop for each model
        feature_drop_scheme = j[1]
        # Get votes.
        voters_out.append(j[0].predict(test_x[:, feature_drop_scheme]))

    # Make a dataframe so it will be manipulated with ease.
    voters_output_df = pd.DataFrame(data = voters_out, index = np.arange(1, 21))
    voters_output_df.value_counts
    # out_popular = np.array(voters_output_df.mode(axis=0).loc[0])
    return out_popular

In [197]:
out_popular = get_most_popular(voters)

In [198]:
# Just for a reference
accuracy_score(test_y, voters[2][0].predict(test_x[:, voters[2][1]]))

0.8487

In [201]:
acc = accuracy_score(test_y, out_popular) * 100
print(f'Accuracy score: {acc:.2f}')

Accuracy score: 95.40


In [215]:
voters_out = []
weights = np.array([j[2] for i, j in voters.items()])
sum_weights = np.sum([j[2] for i, j in voters.items()])
# Not very clever to use for-loop but im on a tight deadline.
for i, j in voters.items():
    # Which features to drop for each model
    feature_drop_scheme = j[1]
    # Get votes.
    voters_out.append(j[0].predict(test_x[:, feature_drop_scheme]))

In [262]:
# Make a dataframe so it will be manipulated with ease.
voters_output_df = pd.DataFrame(data = voters_out, index = np.arange(1, 21))
voters_output_df['weighted_values'] = weights/sum_weights


In [313]:
voters_output_df['weights'] = np.int64((weights - weights.min())/(weights.max() - weights.min()) * 9) + 1

In [384]:
weighed_votes_pred = np.int64(np.average(voters_output_df.iloc[:, :10000], weights = voters_output_df['weights'], axis = 0))

In [385]:
accuracy_score(test_y, weighed_votes_pred)

0.667

## Part 5

In [386]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier().fit(train_x, train_y)
clf.score(test_x, test_y)

0.9699