# Best Model and Parameter Selection for Digits Classification

## Imports and data load

In [1]:
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Load scikit-learn's bundled digits dataset, then list the attributes
# available on the returned object (shown as the cell output).
digits = load_digits()
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

## Train Test Split

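No explicit train/test split is done here: the models accept NumPy arrays directly, and `GridSearchCV` (used below) performs 5-fold cross-validation on the full dataset. Note that the reported best scores are therefore cross-validation scores, not scores on an independent hold-out set.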

## Defining models array

> The following dictionary of models and parameter grids was generated by ChatGPT, since it is good at the 'guessing' task of proposing reasonable candidate values. The grid is large and takes a lot of CPU time when run locally.

In [11]:
# Candidate estimators and the hyper-parameter grids to search for each one.
# Keys are display names; every value is a dict with:
#   'model'  - an unfitted estimator instance
#   'params' - the parameter grid for that estimator (a dict, or a list of
#              dicts when different sub-grids apply to different settings)

# Fixed seed for the estimators that use randomness, so that re-running the
# notebook yields the same best parameters and scores.
RANDOM_STATE = 42

models = {

    'svm': {
        'model': SVC(),
        # gamma is only used by the 'rbf' and 'poly' kernels here, so the
        # linear kernel gets its own sub-grid without gamma. This avoids
        # fitting identical linear models once per gamma value.
        'params': [
            {
                'kernel': ['linear'],
                'C': [0.1, 1, 10]
            },
            {
                'kernel': ['rbf', 'poly'],
                'C': [0.1, 1, 10],
                'gamma': ['scale', 0.1, 1]
            }
        ]
    },

    'LogisticRegression': {
        # max_iter is raised above the default to give lbfgs more iterations
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'C': [0.1, 1, 10],
            'penalty': ['l2'],
            'solver': ['lbfgs']
        }
    },

    'MultinomialNB': {
        'model': MultinomialNB(),
        'params': {
            'alpha': [0.1, 0.5, 1.0]
        }
    },

    'GaussianNB': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': [1e-9, 1e-8, 1e-7]
        }
    },

    'DecisionTree': {
        # Seeded: tree construction involves random tie-breaking/feature order
        'model': DecisionTreeClassifier(random_state=RANDOM_STATE),
        'params': {
            'max_depth': [None, 5, 10, 20],
            'min_samples_split': [2, 5, 10],
            'criterion': ['gini', 'entropy']
        }
    },

    'RandomForest': {
        # Seeded: bootstrapping and feature sub-sampling are random
        'model': RandomForestClassifier(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
            'max_features': ['sqrt', 'log2']
        }
    }
}

## Training Every possible model

Then we record the best cross-validation score and the best parameters found for each model.

In [13]:
# GridSearchCV tries every parameter combination for ONE estimator, so we
# still need a for loop to cover more than one model.
# Each search uses 5-fold cross-validation on the full digits data;
# n_jobs=-1 runs the individual fits in parallel on all available CPU cores.

scores = []

for model_name, model_dict in models.items():
    grid_search_obj = GridSearchCV(
        model_dict['model'],
        model_dict['params'],
        cv=5,
        n_jobs=-1,
        return_train_score=False,
    )
    grid_search_obj.fit(digits.data, digits.target)
    # Keep only the summary of the best candidate for this model
    scores.append({
        'model' : model_name,
        'best parameters' : grid_search_obj.best_params_,
        'best score' : grid_search_obj.best_score_
    })



In [14]:
# Show cell contents in full: with the default column-width limit the
# 'best parameters' dicts are cut off with '...' in the rendered table.
pd.set_option('display.max_colwidth', None)

# One row per model: name, best parameter combination, best mean CV score
scores = pd.DataFrame(scores)
scores

Unnamed: 0,model,best parameters,best score
0,svm,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",0.97385
1,LogisticRegression,"{'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}",0.918217
2,MultinomialNB,{'alpha': 0.1},0.870907
3,GaussianNB,{'var_smoothing': 1e-07},0.832518
4,DecisionTree,"{'criterion': 'entropy', 'max_depth': 20, 'min...",0.814708
5,RandomForest,"{'max_depth': 20, 'max_features': 'log2', 'min...",0.943254


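**For me, the winner is the SVM with the parameters listed above (`C=10`, `gamma='scale'`, `kernel='rbf'`), with a best cross-validation score of about 0.974.**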