# Importing all Dependencies
### pandas - working with dataframe
### numpy - working with arrays
### matplotlib - plotting graphs

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.svm import SVC
import matplotlib.pyplot as plt
%matplotlib inline

  from numpy.core.umath_tests import inner1d


# Importing datasets

In [2]:
train = pd.read_csv('train.csv')

In [3]:
test = pd.read_csv('test.csv')

# Exploratory Data Analysis
### Looking at columns

In [3]:
train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
test.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Looking at variance of each column
### We can see that all data is in range(0,255) meaning the darkness of a pixel
### It is done for checking if a column is high varied, because such values fit not so good in model

In [13]:
X.var()

pixel0        0.000000
pixel1        0.000000
pixel2        0.000000
pixel3        0.000000
pixel4        0.000000
pixel5        0.000000
pixel6        0.000000
pixel7        0.000000
pixel8        0.000000
pixel9        0.000000
pixel10       0.000000
pixel11       0.000000
pixel12       0.193657
pixel13       1.588149
pixel14       0.666514
pixel15       0.001157
pixel16       0.000000
pixel17       0.000000
pixel18       0.000000
pixel19       0.000000
pixel20       0.000000
pixel21       0.000000
pixel22       0.000000
pixel23       0.000000
pixel24       0.000000
pixel25       0.000000
pixel26       0.000000
pixel27       0.000000
pixel28       0.000000
pixel29       0.000000
               ...    
pixel754      0.000000
pixel755      0.000000
pixel756      0.000000
pixel757      0.000000
pixel758      0.000000
pixel759      0.000000
pixel760      0.164083
pixel761      2.539329
pixel762      5.803815
pixel763     14.995365
pixel764     22.175132
pixel765     36.269817
pixel766   

### Looking at shapes of datasets

In [4]:
train.shape

(42000, 785)

In [7]:
test.shape

(28000, 784)

# Data Preprocessing
### Splitting into input and target features

In [4]:
y_train = train.label
X_train = train.drop(labels = ["label"],axis = 1)

### Concatenating X_train and test. Made to simplify preprocessing (working with 1 dataset instead of 2)

In [9]:
X =  pd.concat(objs=[X_train, test], axis=0).reset_index(drop=True)

# Creating new dataset
### Classifying each 784 column by width (e.g. 0 to 27 in 1 by adding to each other and dividing by 28 to get mean)
### Classifying each 784 column by height (e.g. 0, 28, 56, ... in 1 by adding to each other and dividing by 28 to get mean)
### It is done for fitting model with less features

### height
###   |  
### 000 001 002 003 ... 026 027 - width
### 028 029 030 031 ... 054 055
### 056 057 058 059 ... 082 083
###    |      |      |      |   ...    |      |
### 728 729 730 731 ... 754 755
### 756 757 758 759 ... 782 783 

In [36]:
width_all = []
height_all = []

In [37]:
for i in range(len(X)):
    width_row = []
    width_sum = 0
    for y in range(len(X.columns)):
        width_sum+=X.iloc[i][y]
        if(y%28==27):
            width_row.append(width_sum)
            width_sum = 0    
    width_all.append(width_row)

In [68]:
for i in range(len(X)):
    height = np.zeros(28)
    ind = 0
    height_row = []
    for y in range(len(X.columns)):
        height[ind]+=X.iloc[i][y]
        ind+=1
        if(y%28==27):
            ind = 0
    height_all.append(height)

### Making width and height dataframes and further concatenating

In [80]:
width_df = pd.DataFrame(width_all)

In [84]:
height_df = pd.DataFrame(height_all)

In [96]:
all_df =  pd.concat(objs=[width_df, height_df], axis=1)

### Changing the names of columns

In [97]:
all_df.columns

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,  0,  1,  2,  3,  4,  5,
             6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
            23, 24, 25, 26, 27],
           dtype='int64')

In [98]:
cols = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', 
        '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', 
        '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55']
all_df.columns = cols

In [99]:
all_df.shape

(70000, 56)

### Dividing each value by 28 to get the mean value of width and height.

In [102]:
for i in all_df.columns:
    all_df[i] = all_df[i].map(lambda x: x/28 if x > 0 else 0)

### Changing all values to its logarithms to deal with variance

In [104]:
for i in all_df.columns:
    all_df[i] = all_df[i].map(lambda x: np.log(x) if x > 0 else 0)

In [116]:
all_df.to_csv('new.csv', index = False)

In [5]:
all_df = pd.read_csv('new.csv')

In [6]:
inputs = all_df[:len(X_train)]
test = all_df[len(X_train):]

In [7]:
test.shape

(28000, 56)

In [8]:
inputs.shape

(42000, 56)

# Creating a Classifier
### max_depth - None, for not limitting the tree depth. It will expanded with leaf size
### max_features - [ ], I gave a GridSearch to find the best fitting features
### min_samples_split - [ ], the minimum number of samples required to split also will be found by GridSearch
### bootstrap - False, to prevent using of bootstrap features in building trees
### n_estimators - [ ], number of trees in the forest. I gave several values to search the best fitting one
### criterion - gini, 

## GridSearchCV is an exhaustive search over specified parameter values for an estimator.
### We are finding the best fitting values of each parameter by using this approach

In [111]:
rfc = RandomForestClassifier()
rf_param_grid = {"max_depth": [None],
             "max_features": [45, 50, 56],
             "min_samples_split": [45, 50, 56],
             "min_samples_leaf": [45, 50, 56],
             "bootstrap": [False],
             "n_estimators" :[100, 300, 500]}

gsRFC = GridSearchCV(rfc,param_grid = rf_param_grid, scoring="accuracy", n_jobs= 4, verbose = 1)

In [113]:
#clf2.fit(inputs, y_train)
gsRFC.fit(inputs,y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 50.3min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 220.0min
[Parallel(n_jobs=4)]: Done 243 out of 243 | elapsed: 284.9min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'max_depth': [None], 'max_features': [45, 50, 56], 'min_samples_split': [45, 50, 56], 'min_samples_leaf': [45, 50, 56], 'bootstrap': [False], 'n_estimators': [100, 300, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

### Testing the accuracy_score on train data

In [114]:
print('Training accuracy...', accuracy_score(y_train, gsRFC.predict(inputs)))

Training accuracy... 0.8732857142857143


# Making Prediction on test data and submission

In [115]:
pred_rfc = gsRFC.predict(test)

In [116]:
pred_rfc = pd.Series(pred_rfc,name="Label")

In [117]:
submission_rfc = pd.concat([pd.Series(range(1,28001),name = "ImageId"),pred_rfc],axis = 1)

In [118]:
submission_rfc.to_csv('rf.csv', index = False)

## Simple Random Forest gave 88% accuracy score.
## However after parameter tuning it went down, maybe I did it wrong)

# Next I tried XGBoostClassifier.
## Simple xgb gave me 90%, which is better than RF.
## After that I tuned parameters of XGBoost as shown below. 
## It compiled a bit more, however the result was 94%. 
## Hopefully it could be increased a bit more, with better tuning

In [16]:
xgbc = xgb.XGBClassifier(
 learning_rate =0.01,
 n_estimators=2500,
 max_depth=15,
 min_child_weight=7,
 subsample=0.8,
 colsample_bytree=0.5,
 nthread=15,
 scale_pos_weight=3,
 seed=20)

In [17]:
xgbc.fit(inputs, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=15, min_child_weight=7, missing=None, n_estimators=2500,
       n_jobs=1, nthread=15, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=3, seed=20, silent=True,
       subsample=0.8)

In [18]:
print('Training accuracy...', accuracy_score(y_train, xgbc.predict(inputs)))

Training accuracy... 1.0


  if diff:


In [19]:
pred_xgbc = xgbc.predict(test)

  if diff:


In [20]:
pred_xgbc = pd.Series(pred_xgbc,name="Label")

In [21]:
submission_xgbc = pd.concat([pd.Series(range(1,28001),name = "ImageId"),pred_xgbc],axis = 1)

In [22]:
submission_xgbc.to_csv('xgb.csv', index = False)

# Finally I tried Support Vector Machine, with simple parameters and gaussian kernel.
## Result was 90%, so maybe in better parameter tuning it could raise up to 90+.

In [128]:
svc = SVC(kernel = "rbf")

In [129]:
svc.fit(inputs, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [130]:
print('Training accuracy...', accuracy_score(y_train, svc.predict(inputs)))

Training accuracy... 0.9059285714285714


In [124]:
pred_svc = svc.predict(test)

In [125]:
pred_svc = pd.Series(pred_svc,name="Label")

In [126]:
submission_svc = pd.concat([pd.Series(range(1,28001),name = "ImageId"),pred_svc],axis = 1)

In [127]:
submission_svc.to_csv('svm.csv', index = False)