In [1]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
# Load the dataset from 'mnist.csv' (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv('mnist.csv')
df.head()

Unnamed: 0,id,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,31953,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,34452,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,60897,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,36953,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1981,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['id', 'class', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5',
       'pixel6', 'pixel7', 'pixel8',
       ...
       'pixel775', 'pixel776', 'pixel777', 'pixel778', 'pixel779', 'pixel780',
       'pixel781', 'pixel782', 'pixel783', 'pixel784'],
      dtype='object', length=786)

In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(4000, 786)

In [5]:
# Count missing values per column.
df.isnull().sum()

id          0
class       0
pixel1      0
pixel2      0
pixel3      0
           ..
pixel780    0
pixel781    0
pixel782    0
pixel783    0
pixel784    0
Length: 786, dtype: int64

In [6]:
# Remove the 'id' column because it is not needed.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it after 'id' is already gone is a no-op
# rather than a KeyError.
df = df.drop(columns="id", errors="ignore")

In [7]:
# Re-check the column labels now that 'id' has been dropped.
df.columns

Index(['class', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6',
       'pixel7', 'pixel8', 'pixel9',
       ...
       'pixel775', 'pixel776', 'pixel777', 'pixel778', 'pixel779', 'pixel780',
       'pixel781', 'pixel782', 'pixel783', 'pixel784'],
      dtype='object', length=785)

In [8]:
# Inspect the frame's contents as a raw NumPy array.
df.values

array([[5, 0, 0, ..., 0, 0, 0],
       [8, 0, 0, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 0, 0],
       ...,
       [7, 0, 0, ..., 0, 0, 0],
       [9, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 0, 0]], shape=(4000, 785))

In [9]:
# Separate labels and pixel features by column *name* rather than by position,
# so the result does not depend on 'class' happening to be the first column
# (e.g. if the cell that drops 'id' was skipped or the CSV layout changes).
data = df.values  # raw array, kept for any later cell that still reads `data`
y = df["class"].values
# errors="ignore" lets this work whether or not 'id' has already been removed.
X = df.drop(columns=["class", "id"], errors="ignore").values
# Each sample is reshaped to 28x28 further down, so exactly 784 features are required.
assert X.shape[1] == 28 * 28, f"expected 784 pixel columns, got {X.shape[1]}"
print(X.shape, y.shape)

(4000, 784) (4000,)


In [10]:
# Shape of a single row of X, i.e. one sample's flat feature vector.
X[4].shape

(784,)

In [11]:
# Show the first ten samples as images: each flat row of X is reshaped to a
# 28x28 grid before plotting. `height=60` is passed to the renderer via show().
for idx in range(10):
    image = X[idx].reshape(28, 28)
    fig = px.imshow(image)
    fig.show(height=60)

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Report the shape of each part in the same order as they were unpacked.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(3200, 784) (800, 784) (3200,) (800,)


In [13]:
# applying random forest and gradient boosting and comparing results by plotting confusion matrix and bar graphs
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [14]:
# Baseline random forest (100 trees, fixed seed): fit on the training split,
# then predict the held-out split. fit() returns the estimator itself, so the
# construction and fit can be chained.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_y_pred = rf.predict(X_test)

In [15]:
# Baseline gradient boosting (100 stages, fixed seed): fit on the training
# split, then predict the held-out split. fit() returns the estimator itself.
gb = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
gb_y_pred = gb.predict(X_test)

In [16]:
# Compute the confusion matrix of each model on the test split and plot them
# side by side. Rows are true classes, columns are predicted classes.

# Use every class present in the dataset as the explicit label set, so both
# matrices have the same shape and the tick labels always line up with the
# matrix rows/columns (instead of assuming the classes are exactly 0..9).
class_labels = sorted(set(y))
tick_text = [str(label) for label in class_labels]

rf_cm = confusion_matrix(y_test, rf_y_pred, labels=class_labels)
gb_cm = confusion_matrix(y_test, gb_y_pred, labels=class_labels)

fig = make_subplots(
    rows=1, cols=2, subplot_titles=["Random Forest", "Gradient Boosting"]
)
for col, cm in enumerate((rf_cm, gb_cm), start=1):
    fig.add_trace(
        # A shared coloraxis gives a single colorbar and a common colour scale,
        # so the two panels are directly comparable and the colorbars do not
        # overlap each other.
        go.Heatmap(z=cm, x=tick_text, y=tick_text, coloraxis="coloraxis"),
        row=1,
        col=col,
    )
fig.update_xaxes(title_text="Predicted class")
# Heatmaps draw the first row at the bottom by default; reverse the y axis so
# the first class is at the top, as in a conventional confusion matrix.
fig.update_yaxes(title_text="True class", autorange="reversed")
fig.show()

In [17]:
# Weighted-average F1 / precision / recall on the test split, one column per
# baseline model, shown as a grouped bar chart.
metric_fns = {"F1 Score": f1_score, "Precision": precision_score, "Recall": recall_score}
model_preds = {"Random Forest": rf_y_pred, "Gradient Boosting": gb_y_pred}

results = pd.DataFrame(
    {
        model_name: [
            metric(y_test, predictions, average="weighted")
            for metric in metric_fns.values()
        ]
        for model_name, predictions in model_preds.items()
    },
    index=list(metric_fns),
)

px.bar(results, barmode="group").show()

**hyperparameter tuning**

In [18]:
from sklearn.model_selection import GridSearchCV

# Search space for the random forest: 3 values for each of 4 hyperparameters
# (81 combinations), each evaluated with 5-fold cross-validation on all cores.
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    cv=5,
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)
# Predict the held-out split with the fitted search object.
rf_grid_y_pred = rf_grid.predict(X_test)

In [19]:
# Confusion matrix of the tuned model (computed here; not plotted in this cell).
rf_grid_cm = confusion_matrix(y_test, rf_grid_y_pred)

# Weighted-average F1 / precision / recall on the test split: baseline random
# forest versus the grid-searched one, shown as a grouped bar chart.
rf_metric_fns = {"F1 Score": f1_score, "Precision": precision_score, "Recall": recall_score}
rf_variants = {"Random Forest": rf_y_pred, "Random Forest Grid": rf_grid_y_pred}

rf_grid_results = pd.DataFrame(
    {
        variant: [
            metric(y_test, predictions, average="weighted")
            for metric in rf_metric_fns.values()
        ]
        for variant, predictions in rf_variants.items()
    },
    index=list(rf_metric_fns),
)

px.bar(rf_grid_results, barmode="group").show()