In [35]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import cv2
import image_slicer
from pandas import DataFrame

In [27]:
def folder_to_numpy(folder):
    if not os.path.exists('temp'): 
        os.mkdir('temp')      
    to_ret = np.array([i for file in os.listdir(folder) for i in separate_file(os.path.join(folder, file)) ])
    for file in os.listdir('.\\temp'):
        os.unlink(os.path.join('.\\temp', file))
    os.rmdir('.\\temp')
    return to_ret.astype(float)

    
def separate_file(file_name): 
    original = cv2.imread(file_name, 0)
    mask = original > 30
    
    mask_left = mask[:, 0:mask.shape[1]//2]
    left = np.max(np.append(np.where(mask_left.all(axis=0))[0], 0))
    
    mask_right = mask[:, mask.shape[1]//2:mask.shape[1]]
    right = np.min(np.append(np.where(mask_right.all(axis=0))[0], mask.shape[1]//2)) + mask.shape[1]//2
    
    mask_top = mask[0:mask.shape[0]//2, :]
    top = np.max(np.append(np.where(mask_top.all(axis=1))[0], 0))
    
    mask_bottom = mask[mask.shape[0]//2:mask.shape[0], :]
    bottom = np.min(np.append(np.where(mask_bottom.all(axis=1))[0], mask.shape[0]//2)) + mask.shape[0]//2
    
    if abs(left - right) < original.shape[1] * .1 or abs(top - bottom) < original.shape[0] * .1:
        cv2.imwrite(os.path.join('.\\temp', 'original.jpg'), original)
    else:
        cv2.imwrite(os.path.join('.\\temp', 'original.jpg'), original[top:bottom, left:right])

    tiles = image_slicer.slice(os.path.join('.\\temp', 'original.jpg'), 81, save=False)
    image_slicer.save_tiles(tiles, directory='.\\temp', prefix='temp')
    
    for j in range(1, 10):
        for i in range(1, 10):
            file = cv2.imread(os.path.join('.\\temp', f'temp_0{j}_0{i}.png'), 0)
            mask = file < 235
            
            mask_left = mask[:, 0:mask.shape[1]//2]
            left = np.max(np.append(np.where(mask_left.all(axis=0))[0], 0))
            
            mask_right = mask[:, mask.shape[1]//2:mask.shape[1]]
            right = np.min(np.append(np.where(mask_right.all(axis=0))[0], mask.shape[1]//2)) + mask.shape[1]//2
            
            mask_top = mask[0:mask.shape[0]//2, :]
            top = np.max(np.append(np.where(mask_top.all(axis=1))[0], 0))
            
            mask_bottom = mask[mask.shape[0]//2:mask.shape[0], :]
            bottom = np.min(np.append(np.where(mask_bottom.all(axis=1))[0], mask.shape[0]//2)) + mask.shape[0]//2
            
            best = max([left, top, file.shape[0]-bottom, file.shape[1]-right])       
            file = file[best+1:file.shape[0]-best-1, best+1:file.shape[1]-best-1]
            
            mask = file < 200
            coords = np.argwhere(mask)
            if len(coords) == 0:
                yield cv2.resize(file, (28, 28)).flatten()
            else:
                x0, y0 = coords.min(axis=0)
                x1, y1 = coords.max(axis=0) + 1
                yield cv2.resize(file[x0:x1, y0:y1], (28, 28)).flatten()

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier()
)

params = {'randomforestclassifier__n_estimators': [75, 100, 150, 200],
          'randomforestclassifier__max_depth': [4, 5, 6]}

grid = GridSearchCV(pipeline, params, cv=10)

In [29]:
folder = '.\\raw'
answers = np.genfromtxt('target.csv', delimiter=',')
data = folder_to_numpy(folder)

grid.fit(data, answers)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'randomforestclassifier__n_estimators': [75, 100, 150, 200], 'randomforestclassifier__max_depth': [4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
grid.best_params_

{'randomforestclassifier__max_depth': 5,
 'randomforestclassifier__n_estimators': 200}

In [31]:
grid.best_score_

0.9987004548408057

In [57]:
test_folder = '.\\test'
test_data = folder_to_numpy(test_folder) 

predictions = grid.predict(test_data)
for i in predictions.reshape((predictions.shape[0]//81, 9, 9)):
    print(solve(i))

[[8. 9. 5. 2. 7. 1. 6. 3. 4.]
 [6. 1. 4. 5. 3. 8. 2. 7. 9.]
 [3. 7. 2. 4. 6. 9. 1. 5. 8.]
 [1. 4. 3. 7. 2. 5. 8. 9. 6.]
 [5. 6. 7. 9. 8. 4. 3. 2. 1.]
 [9. 2. 8. 6. 1. 3. 5. 4. 7.]
 [2. 8. 1. 3. 4. 7. 9. 6. 5.]
 [4. 3. 9. 1. 5. 6. 7. 8. 2.]
 [7. 5. 6. 8. 9. 2. 4. 1. 3.]]
[[7. 6. 9. 2. 4. 1. 8. 5. 3.]
 [8. 5. 3. 7. 6. 9. 2. 4. 1.]
 [2. 4. 1. 8. 5. 3. 7. 6. 9.]
 [9. 7. 6. 1. 3. 2. 4. 8. 5.]
 [4. 2. 8. 9. 7. 5. 1. 3. 6.]
 [1. 3. 5. 6. 8. 4. 9. 7. 2.]
 [5. 9. 7. 4. 1. 6. 3. 2. 8.]
 [6. 8. 2. 3. 9. 7. 5. 1. 4.]
 [3. 1. 4. 5. 2. 8. 6. 9. 7.]]


In [54]:
def possible(info):
    for ele in range(81):
        if info[ele//9, ele%9] != 0:
            yield np.array([info[ele//9, ele%9]])
        else:
            invalid = np.unique(np.concatenate([info[ele//9, :], 
                                                info[:, ele%9], 
                                                info[ele//9-ele//9 % 3: ele//9-ele//9 % 3+3, 
                                                     ele % 9-ele % 3: ele % 9-ele % 3+3
                                                    ].flatten()
                                                ]))
            yield np.array([i for i in range(1, 10) if i not in invalid])

In [56]:
def solve(puzzle):
    puzzle = DataFrame(data={'possible': [x for x in possible(puzzle)]})
    i = 0
    mapping = np.zeros((9, 9))
    while i != 81:
        # previous mistake, backing up
        possibles = puzzle.iat[i, 0]
        
        if mapping[i//9, i % 9] == np.max(possibles):
            mapping[i//9, i % 9] = 0
            i -= 1
            if i == -1:
                print("Puzzle cannot be solved")
                break
            continue
        else:
            set_possibles = set(possibles[np.where(possibles > mapping[i//9, i%9])])        
            invalid = set(np.concatenate([mapping[i//9, 0:i%9],
                                          mapping[0:i//9, i%9],
                                          mapping[i//9-i//9%3:i//9, i%9 - i%3:i%9 - i%3 + 3].flatten()
                                         ]))
    
            results = set_possibles - invalid
            if len(results) == 0:
                mapping[i//9, i%9] = 0
                i -= 1
            else:
                mapping[i//9, i%9] = min(results)
                i += 1
    
    return mapping