In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
import matplotlib
import matplotlib.pylab as plt
%matplotlib inline

from sklearn.datasets import load_digits

Load Dataset and convert data to pandas dataframe

In [2]:
d = load_digits()
df = pd.DataFrame(data = d.data)
df['digit'] = d.target

Shuffle dataset

In [3]:
df = df.sample(frac=1)

Copy dataframe for classification using binary pixel values

In [4]:
df_binary = df.copy()

Define simple function to map to dataframe values and convert df_binary features to 1 or 0

In [5]:
def convertToBinarySignal(x):
    if(x > 0):
        return 1
    else:
        return 0

In [6]:
df_binary[df_binary.columns.difference(['digit'])] = df_binary[df_binary.columns.difference(['digit'])].applymap(convertToBinarySignal)

Generate a function that will run prediction algorithm to predict each digit

In [7]:
def digit_prediction(prediction_algo, digits, digit_number):
    
    # Generate target label
    digits['y'] = digits['digit'] == digit_number
    
    # Split dataset into train/test (~50/50)
    n = int(len(digits)/2)
    df_train, df_test = digits.iloc[:n].copy(), digits.iloc[n:].copy()
    X_train, y_train = df_train[df_train.columns.drop('y')], df_train['y']
    X_test, y_test = df_test[df_test.columns.drop('y')], df_test['y']
    
    # Build Pipeline
    pipeline = Pipeline([
        ('masteralg', prediction_algo)    
    ])
    
    # Fit / Predict
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    # Generate and return F1 metric with associated digit_number
    return [sklearn.metrics.f1_score(y_test, y_pred), sklearn.metrics.precision_score(y_test, y_pred), sklearn.metrics.recall_score(y_test, y_pred), digit_number]

Create a simple dataframe to hold and display metrics, then load the metrics from KNN and Logistic Regression predictions using binary pixel values

In [8]:
metrics_df = pd.DataFrame(index=np.arange(0, 6), columns=['Technique', 'Worst Performing Digit', 'F1 Score of Digit Prediction'])

# KNN Prediction
worst_f1_result = min([digit_prediction(KNeighborsClassifier(), df_binary, digit_number=k) for k in range(10)])
metrics_df.loc[0] = ['KNN - Binary Pixel Values', worst_f1_result[3], worst_f1_result[0]]

# Logistic Regression
worst_f1_result = min([digit_prediction(LogisticRegression(), df_binary, digit_number=k) for k in range(10)])
metrics_df.loc[1] = ['Logistic Regression - Binary Pixel Values', worst_f1_result[3], worst_f1_result[0]]

metrics_df

Unnamed: 0,Technique,Worst Performing Digit,F1 Score of Digit Prediction
0,KNN - Binary Pixel Values,8.0,0.96875
1,Logistic Regression - Binary Pixel Values,3.0,0.88764
2,,,
3,,,
4,,,
5,,,


Copy dataframe for reducing dataset to 16 features for classification

In [9]:
df_featureReduction_16 = df.copy()

Define simple functions to generate summed row and column features

In [10]:
def sumPixelRow(df, pixel_row):
    pixelRowSum = 0.0
    for j in range(8):
        pixelRowSum += df[(pixel_row * 8) + j]
    df['sumRow'+str(pixel_row)] = pixelRowSum
    
def sumPixelCol(df, pixel_col):
    pixelColSum = 0.0
    for i in range(8):
        pixelColSum += df[(i * 8) + pixel_col]
    df['sumCol'+str(pixel_col)] = pixelColSum

Calculate sumed row and column features and remove original features

In [11]:
# Calculate features
for i in range(8):
    sumPixelRow(df_featureReduction_16, i)
    sumPixelCol(df_featureReduction_16, i)
    
# Trim dataset to reduced features
reducedFeatures = ['sumCol0','sumCol1','sumCol2','sumCol3','sumCol4','sumCol5','sumCol6','sumCol7','sumRow0','sumRow1','sumRow2','sumRow3','sumRow4','sumRow5','sumRow6','sumRow7','digit']
df_featureReduction_16 = df_featureReduction_16[reducedFeatures]

Load the metrics from KNN and Logistic Regression predictions using 8+8 features into results dataframe

In [12]:
# KNN Prediction
worst_f1_result = min([digit_prediction(KNeighborsClassifier(), df_featureReduction_16, digit_number=k) for k in range(10)])
metrics_df.loc[2] = ['KNN - 8+8 Feature Reduction', worst_f1_result[3], worst_f1_result[0]]

# Logistic Regression
worst_f1_result = min([digit_prediction(LogisticRegression(), df_featureReduction_16, digit_number=k) for k in range(10)])
metrics_df.loc[3] = ['Logistic Regression - 8+8 Feature Reduction', worst_f1_result[3], worst_f1_result[0]]

metrics_df

Unnamed: 0,Technique,Worst Performing Digit,F1 Score of Digit Prediction
0,KNN - Binary Pixel Values,8.0,0.96875
1,Logistic Regression - Binary Pixel Values,3.0,0.88764
2,KNN - 8+8 Feature Reduction,8.0,0.816754
3,Logistic Regression - 8+8 Feature Reduction,8.0,0.630303
4,,,
5,,,


Copy dataframe for reducing dataset to 8 selected features for classification and trim to manually selected features

In [13]:
df_featureReduction_8 = df.copy()

In [14]:
 # Selected a doughnut shaped collection of pixels near the top of the digit
df_featureReduction_8 = df_featureReduction_8[[19,20,21,27,29,35,36,37,'digit']]

Load the metrics from KNN and Logistic Regression predictions using 8 manually selected features into results dataframe

In [15]:
# KNN Prediction
worst_f1_result = min([digit_prediction(KNeighborsClassifier(), df_featureReduction_8, digit_number=k) for k in range(10)])
metrics_df.loc[4] = ['KNN - 8 Selected Features', worst_f1_result[3], worst_f1_result[0]]

# Logistic Regression
worst_f1_result = min([digit_prediction(LogisticRegression(), df_featureReduction_8, digit_number=k) for k in range(10)])
metrics_df.loc[5] = ['Logistic Regression - 8 Selected Features', worst_f1_result[3], worst_f1_result[0]]

metrics_df

Unnamed: 0,Technique,Worst Performing Digit,F1 Score of Digit Prediction
0,KNN - Binary Pixel Values,8,0.96875
1,Logistic Regression - Binary Pixel Values,3,0.88764
2,KNN - 8+8 Feature Reduction,8,0.816754
3,Logistic Regression - 8+8 Feature Reduction,8,0.630303
4,KNN - 8 Selected Features,4,0.765432
5,Logistic Regression - 8 Selected Features,4,0.616438


It would be an enormous number of combinations if we wanted to try all combinations of pixels. 

Your first pick would be out of 64 choices, your next out of 63, ..., and your last pick out of 57 choices, which gives us:
$$64 * 63 * 62 * 61 * 60 * 59 * 58 * 57$$
or the equation:
$${\frac {64!}{(64-8)!8!}}$$

and ultimately, a total number of 178,462,987,637,760 choices.

One approach that would be much more efficient and should be reasonably effective is to narrow the options down to a simple "doughnut" of pixels, for example [0,1,2,8,10,16,17,18], which could be shifted around the pixel matrix. This would provide a reasonable generalization of an area of the matrix. Furthermore, there are only __36__ possible shifts of the pixel doughnut on an 8x8 pixel matrix.

In [16]:
def determinePixelFeatures(df, predictionAlgo):
        
    best_features = []
    best_worst_f1 = []
    
    base_df = df.copy()
    
    
    # Sample pixel doughnut
    # curr_features = [0,1,2,8,10,16,17,18]
    
    curr_row = 0
    curr_col = 0
    
    for k in range(36): 
        
        # Update current features
        shift = ((curr_row * 8) + curr_col)
        curr_features = [(0 + shift), (1 + shift), (2 + shift), (8 + shift), (10 + shift), (16 + shift), (17 + shift), (18 + shift), 'digit']
        
        #print('*************************************')
        #print('Iteration ')+str(k)
        #print('curr_col: '+ str(curr_col))
        #print('curr_row: '+ str(curr_row))
        #print('currFeatures' + str(curr_features))
        
        # Update features in df
        curr_df = base_df[curr_features].copy()
        
        # Predict
        new_worst_f1 = min([digit_prediction(predictionAlgo, curr_df, digit_number=k) for k in range(10)])
        #print 'New F1: '+str(new_worst_f1) 
        if new_worst_f1 > best_worst_f1:
            #print 'F1 Improvement:'
            #print '   curr best: ' + str(best_worst_f1)
            #print '    new best: ' + str(new_worst_f1)
            best_worst_f1 = new_worst_f1
            best_features = curr_features
        
        #Update shift
        if curr_col == 5:
            curr_row += 1
            curr_col = 0
        else:
            curr_col += 1
    
    return [best_features, best_worst_f1]

Execute the feature selection algorithm using each prediction algorithm

In [17]:
best_eight_features_knn = determinePixelFeatures(df.copy(), KNeighborsClassifier())
best_eight_features_logReg = determinePixelFeatures(df.copy(), LogisticRegression())

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Print best features info, along with precision and recall for each digit

In [18]:
def printBestFeaturesMetrics(predAlg, best_features_ds):
    best_features = best_features_ds[0]
    best_features_metrics = best_features_ds[1]

    print '  ' + str(best_features)
    print '    Worst Digit: ' + str(best_features_metrics[3])
    print '    Worst Digit F1: ' + str(best_features_metrics[0])
    for k in range(10):
        results = digit_prediction(predAlg, df[best_features].copy(), digit_number=k)
        print '    Digit ' + str(results[3]) 
        print '      Precision: ' + str(results[1])
        print '      Recall: ' + str(results[2])

In [19]:
print 'KNN - Best 8 Features'
printBestFeaturesMetrics(KNeighborsClassifier(), best_eight_features_knn)
print ''
print 'Logistic Regression - Best 8 Features'
printBestFeaturesMetrics(LogisticRegression(), best_eight_features_logReg)

KNN - Best 8 Features
  [19, 20, 21, 27, 29, 35, 36, 37, 'digit']
    Worst Digit: 4
    Worst Digit F1: 0.765432098765
    Digit 0
      Precision: 0.976744186047
      Recall: 1.0
    Digit 1
      Precision: 0.887755102041
      Recall: 0.935483870968
    Digit 2
      Precision: 0.882352941176
      Recall: 0.903614457831
    Digit 3
      Precision: 0.8875
      Recall: 0.755319148936
    Digit 4
      Precision: 0.849315068493
      Recall: 0.696629213483
    Digit 5
      Precision: 0.883116883117
      Recall: 0.723404255319
    Digit 6
      Precision: 0.8125
      Recall: 0.876404494382
    Digit 7
      Precision: 0.808988764045
      Recall: 0.847058823529
    Digit 8
      Precision: 0.914634146341
      Recall: 0.773195876289
    Digit 9
      Precision: 0.923076923077
      Recall: 0.791208791209

Logistic Regression - Best 8 Features
  [19, 20, 21, 27, 29, 35, 36, 37, 'digit']
    Worst Digit: 4
    Worst Digit F1: 0.616438356164
    Digit 0
      Precision: 0.988235294