## 1. Data loading

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
print(train.head())
print(test.head())

   label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
0      1       0       0       0       0       0       0       0       0   
1      0       0       0       0       0       0       0       0       0   
2      1       0       0       0       0       0       0       0       0   
3      4       0       0       0       0       0       0       0       0   
4      0       0       0       0       0       0       0       0       0   

   pixel8  ...  pixel774  pixel775  pixel776  pixel777  pixel778  pixel779  \
0       0  ...         0         0         0         0         0         0   
1       0  ...         0         0         0         0         0         0   
2       0  ...         0         0         0         0         0         0   
3       0  ...         0         0         0         0         0         0   
4       0  ...         0         0         0         0         0         0   

   pixel780  pixel781  pixel782  pixel783  
0         0         0         

## 2. Data division

In [4]:
# X_train - images, y_train - number labels
X_train = train.drop('label', axis=1)
y_train = train['label']

## 3. Data normalization

In [5]:
# Image pixels usually contain numbers from 0 to 255
# It is recommended to normalize them by dividing by 255, so numbers will be between 0 and 1
X_train = X_train / 255.0
test = test / 255.0

## 4. Data modeling (LR) - 92.05%

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [7]:
# Dividing data for train and test
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [8]:
# Initializing and model teaching
model = LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='multinomial')
model.fit(X_train_split, y_train_split)



In [9]:
# Model prediction
y_pred = model.predict(X_test_split)

# Accuracy score
accuracy = accuracy_score(y_test_split, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9205


In [10]:
# Error matrix and classification review
print(confusion_matrix(y_test_split, y_pred))
print(classification_report(y_test_split, y_pred))

[[826   0   3   0   3   4  12   2   2   1]
 [  0 917   4   6   0   2   1   1   8   1]
 [  4  11 760  16   3   4   6  12  16   3]
 [  0   3  25 774   0  33   1  11  20   6]
 [  1   1   8   0 777   1   5   3   5  28]
 [  8   6   7  29   7 637  12   5  16   4]
 [  4   0   4   0   6  12 768   1   3   2]
 [  0   1   8   2   6   0   0 794   3  36]
 [  7  17  11  29   6  30   7   2 725  12]
 [  4   4   0   8  29   4   0  33   7 754]]
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       853
           1       0.96      0.98      0.97       940
           2       0.92      0.91      0.91       835
           3       0.90      0.89      0.89       873
           4       0.93      0.94      0.93       829
           5       0.88      0.87      0.87       731
           6       0.95      0.96      0.95       800
           7       0.92      0.93      0.93       850
           8       0.90      0.86      0.88       846
           9       0.89     

In [11]:
# Predicting on a test dataset
y_test_pred = model.predict(test)
y_test_pred

array([2, 0, 9, ..., 3, 9, 2], dtype=int64)

## 5. Data modeling (RF) - 92.05%

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Model teaching
rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
rf_model.fit(X_train_split, y_train_split)

In [14]:
# Predicting
y_pred_rf = rf_model.predict(X_test_split)

# Accuracy score
accuracy_rf = accuracy_score(y_test_split, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy:.4f}")

Random Forest Accuracy: 0.9205


In [15]:
# Error matrix and classification review
print(confusion_matrix(y_test_split, y_pred))
print(classification_report(y_test_split, y_pred))

[[826   0   3   0   3   4  12   2   2   1]
 [  0 917   4   6   0   2   1   1   8   1]
 [  4  11 760  16   3   4   6  12  16   3]
 [  0   3  25 774   0  33   1  11  20   6]
 [  1   1   8   0 777   1   5   3   5  28]
 [  8   6   7  29   7 637  12   5  16   4]
 [  4   0   4   0   6  12 768   1   3   2]
 [  0   1   8   2   6   0   0 794   3  36]
 [  7  17  11  29   6  30   7   2 725  12]
 [  4   4   0   8  29   4   0  33   7 754]]
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       853
           1       0.96      0.98      0.97       940
           2       0.92      0.91      0.91       835
           3       0.90      0.89      0.89       873
           4       0.93      0.94      0.93       829
           5       0.88      0.87      0.87       731
           6       0.95      0.96      0.95       800
           7       0.92      0.93      0.93       850
           8       0.90      0.86      0.88       846
           9       0.89     

In [16]:
y_test_pred_rf = rf_model.predict(test)
y_test_pred_rf

array([2, 0, 9, ..., 3, 9, 2], dtype=int64)

## 6. (!) Neural net learning - 98.29%

In [18]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [21]:
# Data pre-load and pre-edit
# Let's be sure that class labels are converted to format 'one-hot encoding'
# Because neural net requires this format for the classification
y_train_categorical = to_categorical(y_train_split, 10)
y_test_categorical = to_categorical(y_test_split, 10)

In [22]:
# Neural net model creating
model = Sequential()

# Input layer
model.add(Dense(512, activation='relu', input_shape=(784,)))
model.add(Dropout(0.3))  # Dropout to prevent retraining

# Hidden layer
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.3))

# Output layer
model.add(Dense(10, activation='softmax'))

# Model compilation
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [28]:
# Model training (20 epochs for example, batch size 128 f.e.
history = model.fit(X_train_split, y_train_categorical, epochs=50, batch_size=128, validation_data=(X_test_split, y_test_categorical))

Epoch 1/50
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9965 - loss: 0.0112 - val_accuracy: 0.9829 - val_loss: 0.1133
Epoch 2/50
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9958 - loss: 0.0138 - val_accuracy: 0.9814 - val_loss: 0.1161
Epoch 3/50
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9965 - loss: 0.0097 - val_accuracy: 0.9799 - val_loss: 0.1233
Epoch 4/50
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9962 - loss: 0.0105 - val_accuracy: 0.9801 - val_loss: 0.1303
Epoch 5/50
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9963 - loss: 0.0115 - val_accuracy: 0.9805 - val_loss: 0.1195
Epoch 6/50
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9955 - loss: 0.0156 - val_accuracy: 0.9812 - val_loss: 0.1140
Epoch 7/50
[1m263/263[0m 

In [36]:
# Model accuracy
test_loss, test_accuracy = model.evaluate(X_test_split, y_test_categorical)
print(f"Test Accuracy: {test_accuracy:.4f}")

[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 929us/step - accuracy: 0.9817 - loss: 0.1717
Test Accuracy: 0.9829


In [37]:
# Prediction on the test sample
y_test_pred_nn = model.predict(test)
y_test_pred_nn = y_test_pred_nn.argmax(axis=1)  # Converting back to class labels

[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 927us/step


## 7. Convolutional neural network (CNN) - 98.79%

In [30]:
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten

In [32]:
# Converting data to form (n_samples, 28, 28, 1) for CNN
X_train_cnn = X_train_split.values.reshape(-1, 28, 28, 1)
X_test_cnn = X_test_split.values.reshape(-1, 28, 28, 1)
test_cnn = test.values.reshape(-1, 28, 28, 1)

In [33]:
# CNN model creating
cnn_model = Sequential()

# First convolutional ('svertochniy') layer
cnn_model.add(Conv2D(32, kernel_size=(3,3), activation='relu', input_shape=(28, 28, 1)))
cnn_model.add(MaxPooling2D(pool_size=(2, 2)))

# Second convolutional ('svertochniy') layer
cnn_model.add(Conv2D(64, kernel_size=(3,3), activation='relu'))
cnn_model.add(MaxPooling2D(pool_size=(2, 2)))

# Converting vector
cnn_model.add(Flatten())

# Fully connected layer
cnn_model.add(Dense(128, activation='relu'))

# Output layer
cnn_model.add(Dense(10, activation='softmax'))

# Model compilation
cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [34]:
# Model training
history_cnn = cnn_model.fit(X_train_cnn, y_train_categorical, epochs=20, batch_size=128, validation_data=(X_test_cnn, y_test_categorical))

Epoch 1/20
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.8193 - loss: 0.6332 - val_accuracy: 0.9745 - val_loss: 0.0871
Epoch 2/20
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.9774 - loss: 0.0754 - val_accuracy: 0.9785 - val_loss: 0.0713
Epoch 3/20
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.9847 - loss: 0.0508 - val_accuracy: 0.9823 - val_loss: 0.0576
Epoch 4/20
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.9878 - loss: 0.0361 - val_accuracy: 0.9874 - val_loss: 0.0475
Epoch 5/20
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.9918 - loss: 0.0267 - val_accuracy: 0.9862 - val_loss: 0.0447
Epoch 6/20
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.9930 - loss: 0.0208 - val_accuracy: 0.9876 - val_loss: 0.0449
Epoch 7/20
[1m263/263

In [35]:
test_loss_cnn, test_accuracy_cnn = cnn_model.evaluate(X_test_cnn, y_test_categorical)
print(f"Test Accuracy CNN: {test_accuracy_cnn:.4f}")

[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9871 - loss: 0.0656
Test Accuracy CNN: 0.9879


In [39]:
# Prediction on the test sample
y_test_pred_cnn = cnn_model.predict(test_cnn)
y_test_pred_cnn = y_test_pred_cnn.argmax(axis=1)  # Converting back to class labels

[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


## 8. Final steps

In [40]:
# Will be using data from CNN, because it has the best result
submission_cnn = pd.DataFrame({"ImageId": list(range(1, len(y_test_pred_cnn)+1)),
                              "Label": y_test_pred_cnn})
submission_cnn.to_csv("kaggle_cnn_dr_submission.csv", index=False)