In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow import keras
from tensorflow.keras import layers, models
from keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split

# Having a look at training data

In [None]:
# Load the Kaggle Digit Recognizer training CSV into a dataframe
train_df = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')

In [None]:
# Preview only the first rows instead of dumping the whole dataframe
train_df.head()

In [None]:
# Number of training images per digit label
# (matplotlib is already imported as plt in the imports cell at the top)
label_counts = train_df['label'].value_counts()

# Draw on an explicit Axes rather than the implicit pyplot state
fig, ax = plt.subplots()
ax.bar(label_counts.index, label_counts.values)

# Axis labels and title so the figure can be read on its own
ax.set_xlabel('Label')
ax.set_ylabel('Count')
ax.set_title('Distribution of Labels')

# One tick per label value
ax.set_xticks(list(label_counts.index))

plt.show()


**The dataset is fairly well balanced, with a similar number of images for each digit from 0 to 9. Digit 1 has the most images, while digit 5 has the fewest.**

In [None]:
def image_row_reshaper(df, row_number, visualize=False, test_df = False):
    '''
        Reshape one flattened image row of a dataframe into a 28 by 28 pixel array.

        Parameters:
            df --> Pandas dataframe type. When test_df is False the first column is
                   taken as the label and the remaining columns as pixel values;
                   when test_df is True every column is taken as a pixel value.
            row_number --> row index passed to df.loc (For example: 0, 1, 2, 3, etc.)
            visualize --> If True then displays the raw data as a 28 by 28 pixels image
                          and returns None.
                          If False then returns '28 by 28 pixels reshaped image data'
                          and 'corresponding label'.
            test_df --> Set to True for unlabeled data. No title is drawn and the
                        returned label is None.

        Returns:
            None when visualize is True, otherwise a tuple
            (reshaped_image_data, image_label) where image_label is None for
            unlabeled data.
    '''
    raw_image_data = df.loc[row_number] # the requested row as a Series

    if test_df:
        # Unlabeled data: every value is a pixel. The label is set explicitly so
        # the non-visual return below never hits an unbound variable.
        image_label = None
        filtered_image_data = raw_image_data
    else:
        # Use .iloc for positional access: plain [0] on a Series indexed by
        # column names is deprecated positional indexing in pandas.
        image_label = raw_image_data.iloc[0]
        filtered_image_data = raw_image_data.iloc[1:]

    reshaped_image_data = filtered_image_data.values.reshape((28, 28))

    if visualize == True:
        plt.imshow(reshaped_image_data, cmap='gray')
        if not test_df:
            plt.title(f'Label: {image_label}')
        plt.show()
    else:
        return reshaped_image_data, image_label

In [None]:
# Display a few sample training images together with their labels
for sample_row in (3, 4, 7):
    image_row_reshaper(train_df, sample_row, visualize=True)

In [None]:
# (number of rows, number of columns) of the training dataframe
train_df.shape

**To view another training image, uncomment the code below and replace '_' with a row number from 0 to 41999 (both inclusive).**

In [None]:
# image_row_reshaper(train_df, _, visualize=True)

# Transforming data

In [None]:
def dataframe_formatter(df, isTest=False):
    '''
        Convert a dataframe of flattened images into scaled model input.

        Parameters:
            df --> Pandas dataframe type. When isTest is False it must contain a
                   'label' column and the pixel columns 'pixel0' to 'pixel783';
                   when isTest is True every column is treated as a pixel value.
            isTest --> False for labeled data, True for unlabeled data.

        Returns:
            (X, y) when isTest is False, X alone when isTest is True.
            X is a numpy array of shape (number of rows, 28, 28, 1) holding the
            pixel values divided by 255; y is the 'label' column of df.
    '''
    if isTest:
        pixels = df
    else:
        # Read the pixels from the df argument (not from a notebook global) so
        # that X and y always come from the same dataframe.
        pixels = df.loc[:, 'pixel0':'pixel783']

    # Vectorised scaling (assumes raw pixel values lie in 0..255), then reshape
    # each row into a 28 x 28 single-channel image.
    X = (pixels / 255).values.reshape(-1, 28, 28, 1)

    if isTest:
        return X
    return X, df['label']

In [None]:
# Turn the labeled training dataframe into image arrays (X_train) and labels (y_train)
X_train, y_train = dataframe_formatter(train_df, isTest=False)

In [None]:
# Hold out 20% of the training data as a validation set (fixed seed for a repeatable split).
# NOTE: this cell rebinds X_train / y_train, so re-running it on its own splits the
# already-split data again; re-run the formatter cell first.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=13,
)

# Building model

In [None]:
# Small CNN: three convolution + max-pooling stages followed by a dense classifier head
model = models.Sequential(
    [
        layers.Conv2D(20, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(32, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(32, (2, 2), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(units=200, activation='relu'),
        layers.Dropout(0.2),
        # One unit per digit class. softmax (not sigmoid) because the classes are
        # mutually exclusive and the model is trained with
        # sparse_categorical_crossentropy, which expects a probability
        # distribution over the classes.
        layers.Dense(units=10, activation='softmax')
    ]
)
model.summary()

In [None]:
# Integer class labels -> sparse categorical cross-entropy; accuracy is tracked during training
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Stop once val_loss has not improved for 3 consecutive epochs, and restore the
# weights from the best epoch instead of keeping the weights of the last one.
stop_training = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [None]:
# Train for at most 20 epochs; early stopping may end training sooner.
# The returned History object is kept so the loss/accuracy curves can be inspected later.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    validation_data=(X_val, y_val),
    callbacks=[stop_training],  # Keras documents `callbacks` as a list of callbacks
)

# Importing test data

In [None]:
# Load the Kaggle Digit Recognizer test CSV into a dataframe
test_df = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

In [None]:
# Preview only the first rows instead of dumping the whole dataframe
test_df.head()

In [None]:
# (number of rows, number of columns) of the test dataframe
test_df.shape

In [None]:
# Turn the unlabeled test dataframe into image arrays (no labels are returned for test data)
X_test = dataframe_formatter(test_df, isTest=True)

In [None]:
# Sanity check: the formatter reshapes to (number of images, 28, 28, 1)
X_test.shape

# Submission

**Creating a submission file for the competition**

In [None]:
# Run the model on every row of X_test in a single call
predictions = model.predict(X_test, verbose=0)

# The predicted class for each image is the index of its largest output value
predicted_labels = predictions.argmax(axis=-1)


In [None]:
# Load the sample submission file to reuse as the submission template
submission = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')
# (number of rows, number of columns) of the template
submission.shape

In [None]:
# Overwrite the template's Label column with the model's predictions
submission['Label'] = predicted_labels.tolist()

In [None]:
# Preview only the first rows instead of dumping the whole dataframe
submission.head()

In [None]:
# Save the submission dataframe to submission.csv, without writing the dataframe index as a column
submission.to_csv('submission.csv', index=False)

# Testing Model Visually

In [None]:
def test_model(row):
    '''
        Show one test image and print the label the model predicted for it.

        Parameters:
            row --> row index into test_df and predicted_labels
    '''
    # test_df=True: the test dataframe is unlabeled, so no title is drawn
    image_row_reshaper(test_df, row, test_df=True, visualize=True)
    print(f'Predicted Label: {predicted_labels[row]}')

In [None]:
# Call the function like this:
# test_model(row), where row is any valid row index of test_df (0 to len(test_df) - 1)

test_model(0)

**You can change the integer passed to the test_model function in the cell above to view other test images!**