In [4]:
import PIL
from PIL import Image
import pandas as pd
import numpy as np
import cv2
import os
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn import tree
import graphviz
import pydotplus
import openpyxl
from openpyxl import load_workbook
import xlsxwriter

In [39]:
def vectorize_image(path):
    
    '''
    Read every image below ``path`` with OpenCV in grayscale mode and flatten it into one row of pixel values.
    
    ``path`` must contain one sub-directory per class, each named with the integer digit its images represent.
    Entries of ``path`` that are not directories are ignored. Files that OpenCV cannot decode are skipped
    and reported once through a warning.
    Directory and file names are processed in sorted order so the row order is reproducible.
    
    
    Arguments:-
    path =  The directory path which has the images to be converted to a corresponding dataframe 
            where each image is a vector of n*n
            where the image is of the size n x n pixels.
            All images must have the same pixel dimensions.
    
    Returns :-
    df_images = Dataframe holding pixel values of each image (one row per image, one column per pixel).
                An empty dataframe is returned when no image was found.
    labels = Series (int64) with the digit represented by each of the images, in row order.
    
    Raises :-
    ValueError if a sub-directory name is not an integer or if the images do not all have the same size.
    
    '''
    import warnings

    labels=[]          # Digit label of every image, in row order
    pixel_rows=[]      # One flattened pixel array per image
    skipped=[]         # Files OpenCV could not decode

    for clustdir in sorted(os.listdir(path)):
        class_dir=os.path.join(path,clustdir)
        if not os.path.isdir(class_dir):
            # Stray files next to the class folders are not image classes.
            continue
        lab_name=int(clustdir)

        for file_name in sorted(os.listdir(class_dir)):
            file_path=os.path.join(class_dir,file_name)
            img=cv2.imread(file_path,0)
            if img is None:
                # cv2.imread signals an unreadable file by returning None instead of raising.
                skipped.append(file_path)
                continue
            pixel_rows.append(img.flatten())
            labels.append(lab_name)

    if skipped:
        warnings.warn("Skipped %d file(s) that could not be read as images" % len(skipped))

    # Build the frame once; appending row by row copies the whole frame on every image.
    if pixel_rows:
        df_images=pd.DataFrame(np.vstack(pixel_rows))
    else:
        df_images=pd.DataFrame()

    return df_images,pd.Series(labels,dtype='int64')
    
    

In [40]:
def dir_user_input():
    '''
    Prompt for the directory that holds the per-digit image folders and load it.
    The prompt is repeated until the entered text is an existing directory.
    
    Return :-
    Whatever vectorize_image returns for the entered directory
    (dataframe of pixel values, series of labels).
    
    '''
    prompt='Enter directory path with ,do not enclose in quotes       :-  '
    dir_path=input(prompt)
    while not os.path.isdir(dir_path):
        print ("Directory not exists. Re- Enter")
        dir_path=input(prompt)
    return vectorize_image(dir_path)

In [41]:
def pca_transform(fitdata,transformdata,variance):
    '''
    Fit a new PCA object on one data set and project another data set with it.
    Pass the training data as both arguments to transform the training data itself;
    pass the training data and the test data to project the test data onto the training components.
    
    Arguments:-
    fitdata= Dataframe holding pixel values on which the PCA object is fit
    transformdata :- Dataframe to transform to PCA data.
    variance= value handed to PCA as its first argument (n_components)
    
    Returns :- Transformed data as a new dataframe
    
    ''' 
    reducer = PCA(variance).fit(fitdata)
    return pd.DataFrame(reducer.transform(transformdata))
  

In [42]:
def classify_cross_val(xdata,ydata,k_folds,clf):
    
    '''
    Cross-validate a classifier with StratifiedKFold and return the overall accuracy.
    
    For every split the classifier is fit on the training part and used to predict the test part
    (through classify_train_test). The predictions of all splits are collected at the positions of
    their test rows, so after the last split there is exactly one prediction for every row.
    The accuracy of these pooled predictions against ydata is returned.
    
    Arguments:-
    xdata :- pca transformed training data (dataframe; rows are selected with .iloc)
    ydata :- series holding label of images (rows are selected with .iloc)
    k_folds :- number of n_split for StratifiedKFold
    clf :- object of classifier; it is re-fit on every split
    
    Return :-  
    scores :-Accuracy score over the pooled out-of-fold predictions
    
    '''
    skFold=StratifiedKFold(n_splits=k_folds)
    y_true=np.asarray(ydata)
    # Same dtype as the labels, so predictions are not silently cast to float.
    y_pred_final=np.empty(y_true.shape[0],dtype=y_true.dtype)

    for train_index,test_index in skFold.split(xdata,ydata):
        
        x_train,x_test=xdata.iloc[train_index],xdata.iloc[test_index]
        y_train,y_test=ydata.iloc[train_index],ydata.iloc[test_index]
        
        y_pred,_=classify_train_test(x_train,y_train,x_test,y_test,clf)
        
        # test_index holds row positions, not index labels, so assign positionally.
        # Looking predictions up by label only works when ydata has a default 0..n-1 index.
        y_pred_final[test_index]=np.asarray(y_pred)
    
    scores=metrics.accuracy_score(y_true,y_pred_final)   
    return scores
   

In [43]:
def classify_train_test(x_train,y_train,x_test,y_test,clf):
    
    '''
    Fit the classifier on the training data, predict the test data and score the prediction.
    
    Argument :-
    x_train- train data
    y_train - y train data
    x_test - x test data
    y_test - y test data; its index is reused for the returned predictions
    clf - classifier algorithm object (fit in place)
    
    Return :- 
    ypred_final :- Predicted test data as a series indexed like y_test
    score :-accuracy score of predicted data and Y test data.
            
    '''
    fitted=clf.fit(x_train,y_train)
    # Estimators return themselves from fit; keep predicting on clf so the same object is used.
    raw_prediction=clf.predict(x_test)
    predictions=pd.Series(raw_prediction,index=y_test.index)
    
    return predictions,metrics.accuracy_score(y_test,predictions)
   

In [44]:
def classify_validation_data(best_clf,pca_var):
    
    '''
    Score the best classifier found by the grid search on a separate validation image set.
    
    The user is prompted for the training directory and then for the validation directory.
    One PCA object is fit on the training pixels and that same object transforms both the
    training and the validation pixels. The classifier is fit on the transformed training data,
    the validation data is predicted, and a classification report is printed.
    
    Argument :-
    best_clf :- object of the best classifier (it is re-fit here)
    pca_var :- PCA variance (passed to PCA as n_components)
    
    Return :-
    scores :- Accuracy score of the validation predictions

    '''
    
    #READING TRAINING DATA AND CONVERTING TO PIXEL VALUES.
    train_data,train_lst=dir_user_input()
        
    #FIT ONE PCA OBJECT ON THE TRAINING DATA AND TRANSFORM THE TRAINING DATA WITH IT.
    #Fitting only once guarantees train and validation data share the same components.
    pca=PCA(pca_var)
    pca.fit(train_data)
    train_data_transformed=pd.DataFrame(pca.transform(train_data))
        
    #READING VALIDATION DATA AND CONVERTING TO PIXEL VALUES.
    validation_data,validation_label=dir_user_input()
                
    #TRANSFORM VALIDATION DATA ON THE SAME PCA OBJECT
    test_pca_data=pd.DataFrame(pca.transform(validation_data))
    
    y_pred_class,scores=classify_train_test(train_data_transformed,train_lst,test_pca_data,validation_label,best_clf)
    
    print("Classification report for classifier %s:\n%s\n"
      % (best_clf, metrics.classification_report(validation_label, y_pred_class)))
    
    return scores
   

In [45]:
def run_nb_algorithm(xdata,ydata,kfold,pca_var):
    
    '''
    Cross-validate a Gaussian Naive Bayes classifier and report its accuracy for the given PCA variance.
    
    Argument :- 
    xdata :- pca transformed training data
    ydata :- series holding label of images
    kfold :- number of n_split for StratifiedKFold
    pca_var :- PCA variance that produced xdata; only recorded in the result
    
    Return :-
    nb_df :- Data Frame with one row holding the PCA variance and the accuracy score.
    
    '''
    nb_obj = GaussianNB()  
    nb_acc_score = classify_cross_val(xdata,ydata,kfold,nb_obj)
    # Record the pca_var argument, not a notebook-level global.
    nb_df=pd.DataFrame([[pca_var,nb_acc_score]],columns=['PCA','score'])
    
    return nb_df


In [46]:
def run_decision_tree(xdata,ydata,kfold,pca_var,max_dep,min_spl,crit_value):
    
    '''
    Cross-validate a Decision Tree for every combination of criterion, max_depth and min_samples_split.
    The score of each combination is stored as one row of the returned data frame.
    
    Arguement :-
    xdata :- pca transformed training data
    ydata :- series holding label of images
    kfold :- number of n_split for StratifiedKFold   
    pca_var  :- PCA Variance that produced xdata; only recorded in the result
    max_dep :- List holding a range of values for the max_depth parameter.
    min_spl :- List holding a range of values for the min_sample_split parameter.
    crit_value :- List holding a range of values for the criterion parameter.
    
    
    Return :-
    dt_df := Data frame with columns PCA, max_depth, min_split, score, criterion
    dt_obj :- The Decision Tree object of the LAST combination tried (not the best scoring one),
              or None when any of the lists is empty.
    
    '''
    columns=['PCA','max_depth','min_split','score','criterion']
    rows=[]
    dt_obj=None   # stays None when no combination is evaluated
    
    for crit in crit_value:
        for x in max_dep: 
            for y in min_spl:
                dt_obj=DecisionTreeClassifier(criterion=crit,max_depth=x,min_samples_split=y,random_state=42)
                dt_score=classify_cross_val(xdata,ydata,kfold,dt_obj)
                # Record the pca_var argument, not a notebook-level global.
                rows.append([pca_var,x,y,dt_score,crit])
    
    # Build the frame once instead of enlarging it row by row.
    dt_df=pd.DataFrame(rows,columns=columns)
    
    return dt_df,dt_obj
    

In [47]:
def dt_graph_display(dtobj,train_data,train_label):
    
    '''
    Render a fitted decision tree with Graphviz, save it as a pdf file and show it in the notebook.
    
    Argument :-
    dtobj       - object of decision tree.
    train_data  - train transformed data (not used by this function).
    train_label - Series with the actual digit of every image (not used by this function).
    
    Side effects :-
    Adds the Graphviz bin directory to PATH (once), writes image17_graph.pdf to the working
    directory and displays the tree as a png in the notebook output.
    
    '''
    # Imported here so that Image is the IPython display class, not PIL's Image module.
    from IPython.display import Image, display
      
    graphviz_bin = 'C:/Program Files (x86)/Graphviz2.38/bin/'
    current_path = os.environ.get("PATH", "")
    # Only extend PATH the first time; re-running the cell must not keep growing it.
    if graphviz_bin not in current_path.split(os.pathsep):
        os.environ["PATH"] = current_path + os.pathsep + graphviz_bin

    dot_data = tree.export_graphviz(dtobj, out_file=None)  # tree description in Graphviz dot format
      
    graph = pydotplus.graph_from_dot_data(dot_data)  # use the converted data to produce a graph.
    graph.write_pdf("image17_graph.pdf")             # pdf content, so use a .pdf file name
    # A bare Image(...) expression inside a function is discarded; display it explicitly.
    display(Image(graph.create_png()))

In [48]:
def run_knn_algorithm(xdata,ydata,kfold,comp,knn_lst):
    
    '''
    Cross-validate a KNeighborsClassifier (uniform weights, euclidean metric) for each value in knn_lst.
    One row per value is stored in the returned data frame for further analysis.
    
    
    Argument :- 
    xdata :- pca transformed training data
    ydata :- series holding label of images
    kfold :- number of n_split for StratifiedKFold 
    comp :- PCA variance; only recorded in the result
    knn_lst   :- Range of knn values
    
    Return :-
    knn_df :- Data frame with columns PCA, knn, score
    
    '''
    knn_df=pd.DataFrame(columns=['PCA','knn','score'])
    
    for n_neighbors in knn_lst:
        model=KNeighborsClassifier(n_neighbors=n_neighbors,weights='uniform',metric='euclidean')
        fold_accuracy=classify_cross_val(xdata,ydata,kfold,model)
        next_row=len(knn_df)
        knn_df.loc[next_row]=[comp,n_neighbors,fold_accuracy]
    
    return knn_df
    

In [49]:
def run_random_forest(xdata,ydata,kfold,comp,est_lst,max_dep,min_spl,crit_value):
    
    '''
    Cross-validate a RandomForest for every combination of criterion, n_estimators, max_depth
    and min_samples_split. The score of each combination is stored as one row of the returned data frame.
    
    Arguement :-
    xdata :- pca transformed training data
    ydata :- series holding label of images
    kfold :- number of n_split for StratifiedKFold    
    comp  :- PCA Variance; only recorded in the result
    est_lst :- List holding a range of values for the n_estimator paramter
    max_dep :- List holding a range of values for the max_depth parameter.
    min_spl :- List holding a range of values for the min_sample_split parameter.
    crit_value :- List holding a range of values for the criterion parameter.
    
    
    Return :-
    rf_df :- Data Frame with columns PCA, n_estimator, max_depth, min_split, score, criterion
   
    '''
    columns=['PCA','n_estimator','max_depth','min_split','score','criterion']
    rows=[]
    for crit in crit_value:
        for x in est_lst:  
            # Iterate the max_dep / min_spl arguments, not notebook-level lists of similar name.
            for y in max_dep:
                for z in min_spl:
                    rfc_obj=RandomForestClassifier(criterion=crit,n_estimators=x,max_depth=y,min_samples_split=z,random_state=42)
                    rfc_score=classify_cross_val(xdata,ydata,kfold,rfc_obj)
                    rows.append([comp,x,y,z,rfc_score,crit])
  
    # Build the frame once instead of enlarging it row by row.
    rf_df=pd.DataFrame(rows,columns=columns)
    return rf_df

In [50]:
def create_excel(dir_file):
    
    '''
    Create a pandas ExcelWriter (openpyxl engine) for the given excel file.
    The file is opened in append mode when it already exists, so earlier sheets are kept;
    otherwise it is opened in write mode, because append mode cannot open a missing file.
    
    Argrument :-
    dir_file :- directory path with file name
    
    Return :-
    workbook :-ExcelWriter object for the file
    '''
    
    mode='a' if os.path.exists(dir_file) else 'w'
    workbook=pd.ExcelWriter(dir_file, engine='openpyxl',mode=mode)

    return workbook

In [51]:
def write_to_excel(df,writer,sheetname,path):
    
    '''
    Write a result data frame to a sheet of the excel file and close the writer.
    
    Placement of the data :-
    - file or sheet does not exist yet : written at the top-left corner of the sheet
    - sheet exists and is 'KNN' or 'NB' : written below the existing rows (leaving a gap)
    - sheet exists and is 'DT' or 'RF'  : written to the right of the existing columns (leaving a gap)
    - sheet exists under any other name : nothing is written and the writer is not closed
    
    The data frame passed in is not modified.
    
    Argument :-
    df- Dataset to export to the excel file
    writer - ExcelWriter object of the excel file
    sheetname - Name of the sheet in the excel file.
    path - Location of the excel file
    
    Return :-
    writer :- The same ExcelWriter object that was passed in (closed after a write).
              NOTE(review): whether a closed writer can be reused for a later call depends on
              the installed pandas version - confirm, or create a fresh writer per export.
     
    '''
    lst1=['KNN','NB']   # sheets that grow downwards
    lst2=['DT','RF']    # sheets that grow to the right
    
    start_row=0
    start_col=0
    
    if os.path.exists(path):
        wb=load_workbook(path)
        if sheetname in wb.sheetnames:
            if sheetname in lst1:
                start_row=wb[sheetname].max_row+2
            elif sheetname in lst2:
                print(sheetname)
                start_col=wb[sheetname].max_column+2
            else:
                # Existing sheet with an unknown layout: leave it untouched.
                return writer
    
    df.to_excel(writer,sheet_name=sheetname,startrow=start_row,startcol=start_col,engine='openpyxl')
    writer.close()
    
    return writer
    
    

##### Based on the GridSearch classifier that gives the highest accuracy score, we now take the whole training data and the PCA variance that performed best in the grid search. The PCA object is fit on the training data and used to transform it; the test data is transformed with the same PCA object. The classifier object, defined with the tuned parameters, is fit on the PCA-transformed training data and the training labels, the test data is predicted with that classifier, and the final accuracy score is computed.

In [52]:
def get_best_model(xdata,ydata):
    
    '''
    Run a 5-fold GridSearchCV over four classifier families and return the winner.
    The search uses a one-step pipeline whose 'classifier' step is swapped between
    KNeighborsClassifier, DecisionTreeClassifier, RandomForestClassifier and GaussianNB,
    each with its own parameter ranges.
    
    Argument :-
    
    xdata :- pca transformed training data
    ydata :- series holding label of images
    
    Return 
    best_score_ :-Best model score
    best_estimator_:-Best estimator object (the pipeline holding the winning classifier)
    
    
    '''
    
    neighbor_counts = list(range(3,10,2))
    depth_values = list(range(10,14))
    split_values = list(range(10,14))
    estimator_counts = list(range(100,250,50))
    split_criteria = ['entropy','gini']

    knn_grid = {
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': neighbor_counts,
    }
    tree_grid = {
        'classifier': [DecisionTreeClassifier()],
        'classifier__max_depth': depth_values,
        'classifier__min_samples_split': split_values,
        'classifier__criterion': split_criteria,
        'classifier__random_state': [42],
    }
    forest_grid = {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': estimator_counts,
        'classifier__max_depth': depth_values,
        'classifier__min_samples_split': split_values,
        'classifier__criterion': split_criteria,
        'classifier__random_state': [42],
    }
    bayes_grid = {
        'classifier': [GaussianNB()],
    }

    # The initial classifier is only a placeholder; every grid replaces the step.
    placeholder_pipe = Pipeline([('classifier', KNeighborsClassifier())])
    search = GridSearchCV(placeholder_pipe,[knn_grid,tree_grid,forest_grid,bayes_grid],cv=5)
    search.fit(xdata,ydata)
    
    return search.best_score_,search.best_estimator_
   

In [None]:
# Prompt for the directory holding the training images and load it.
# train_pixel_data : dataframe of flattened pixel values returned by vectorize_image
# train_label_lst  : series with the digit label of every image

train_pixel_data,train_label_lst=dir_user_input()

In [None]:
#User input for PCA variance

#comp= Variance of PCA. A value containing a decimal point (e.g. 0.97) is read as a float,
#      anything else (e.g. 50) as an integer; both are handed to PCA as n_components.
#      Invalid text raises ValueError.

comp_text=input('Enter the number of PCA variance       :- ').strip()
comp=float(comp_text) if '.' in comp_text else int(comp_text)

# Always build the transformed training data, whichever kind of value was entered,
# because every later cell depends on train_pca_data.
train_pca_data=pca_transform(train_pixel_data,train_pixel_data,comp)
    

In [55]:
#kfold :- number of n_split for StratifiedKFold; passed to every run_* call in the cells below
kfold=5

In [56]:
#Provide the name and location to create an excel file
# NOTE(review): this is a hard-coded absolute Windows path; adjust it for your machine.
# create_excel opens a pandas ExcelWriter (openpyxl engine, append mode) on this path.
excel_path ='D:\\excel_folder\\Supervised.xlsx'
workbook = create_excel(excel_path)


In [None]:
# Decision Tree Algorithm
# max_dep, min_samp_lst and criterion are notebook-level names that later cells also read.
max_dep=[10,12,14]       #Range of max_depth
min_samp_lst=[2,3]       #Range of min_sample_split
criterion=['gini','entropy']

# df_dt holds one row per parameter combination; obj_dt is the tree object of the last combination tried.
df_dt,obj_dt=run_decision_tree(train_pca_data,train_label_lst,kfold,comp,max_dep,min_samp_lst,criterion)

# Input from the user whether to export data frame data to an excel.Enter in smaller case without quotes

flag=input('Enter yes to export dataframe data to an excel file else enter no       :-  ')

# Only the exact text 'yes' triggers the export.
if flag=='yes':
    sheet='DT'
    workbook=write_to_excel(df_dt,workbook,sheet,excel_path)


In [58]:
#To view Decision Tree graphically in a file format
# obj_dt comes from the Decision Tree cell above, so that cell must be run first.
dt_graph_display(obj_dt,train_pca_data,train_label_lst)

In [None]:
#Random Forest Algorithm
# NOTE: min_samp_lst is re-bound here and replaces the value set in the Decision Tree cell;
# criterion is reused from the Decision Tree cell, so that cell must be run first.

est_lst=[100,150]  #Range of estimator values
max_lst=[10,12]                       #Range of max_depth
min_samp_lst=[3,10,15,17]             #Range of min_sample_split

# Pass max_lst (the depth range defined in this cell), not the decision tree's max_dep.
df_rf=run_random_forest(train_pca_data,train_label_lst,kfold,comp,est_lst,max_lst,min_samp_lst,criterion)

# Input from the user whether to export data frame data to an excel.Enter in smaller case without quotes

flag=input('Enter yes to export dataframe data to an excel file else enter no       :-  ')

# Only the exact text 'yes' triggers the export.
if flag=='yes':

    sheet='RF'
    workbook=write_to_excel(df_rf,workbook,sheet,excel_path)


In [None]:

#Check the accuracy score for Naive Bayes with cross validation.
# df_nb is the data frame returned by run_nb_algorithm (columns PCA and score).

df_nb=run_nb_algorithm(train_pca_data,train_label_lst,kfold,comp)

# Input from the user whether to export data frame data to an excel.Enter in smaller case without quotes

flag=input('Enter yes to export dataframe data to an excel file else enter no       :-  ')

# Only the exact text 'yes' triggers the export.
if flag=='yes':
    sheet='NB'
    workbook=write_to_excel(df_nb,workbook,sheet,excel_path)


In [None]:

#Check the accuracy score for k-nearest neighbors with cross validation.

# Odd neighbor counts 3, 5, 7, 9
knn_list=list(range(3,10,2))
df_knn=run_knn_algorithm(train_pca_data,train_label_lst,kfold,comp,knn_list)


# Input from the user whether to export the data frame to an excel file. Enter in lower case without quotes.
flag=input('Enter yes to export dataframe data to an excel file else enter no       :-  ')

# Only the exact text 'yes' triggers the export.
if flag=='yes':
    sheet='KNN'
    workbook=write_to_excel(df_knn,workbook,sheet,excel_path)

In [None]:
# After individually executing the classifiers with combinations of tuning parameters and a range of PCA variances,
# now use the grid search to find the best classifier.

pca_var_dict={}        # PCA variance -> best grid-search score for that variance
best_clf_by_var={}     # PCA variance -> best estimator found for that variance
# A wider sweep would be [0.96,0.97,0.98,0.99]; a single value keeps the run time manageable.
pca_var=[0.97]
for var in pca_var:
   
    # NOTE: this re-binds train_pca_data for every variance tried.
    train_pca_data=pca_transform(train_pixel_data,train_pixel_data,var)
    score,candidate_clf=get_best_model(train_pca_data,train_label_lst)
    pca_var_dict[var]=score
    best_clf_by_var[var]=candidate_clf
    
# (best score, variance that produced it)
max_v=max(zip(pca_var_dict.values(), pca_var_dict.keys())) 

# Keep the estimator that belongs to the best-scoring variance, not simply the last one tried.
best_clf=best_clf_by_var[max_v[1]]
    

###### Now we take the complete training data and convert it to pixel values. Using a PCA object and the chosen variance value, we transform the data into a new set of component values. The same process is followed for the test images. The training data is then fit on the best classifier object, and the test data is predicted with that classifier to get the accuracy score.

In [64]:
# The grid search cell above provides best_clf and pca_var_dict. The best classifier and the
# best-scoring PCA variance are passed to classify_validation_data to get the accuracy score
# for this data set. classify_validation_data prompts for the training and then the validation directory.
# max_v is a (score, variance) tuple, so max_v[1] is the PCA variance with the highest score.
max_v=max(zip(pca_var_dict.values(), pca_var_dict.keys()))
accuracy_score=classify_validation_data(best_clf,max_v[1])
print(accuracy_score)

Enter directory path with ,do not enclose in quotes       :-  D:\Project-Hand-Written-Charcter-reco\training-images
Enter directory path with ,do not enclose in quotes       :-  D:\Project-Hand-Written-Charcter-reco\test-images
Classification report for classifier Pipeline(memory=None,
     steps=[('classifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=11,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False))]):
              precision    recall  f1-score   support

           0       0.77      0.94      0.85        50
           1       0.88      0.98      0.92        50
           2       0.95      0.67      0.78        54
           3       0.88      0.86      0.87        50

