In [2]:
import numpy as np 
import seaborn as sns
import pandas as pd 
from tqdm import tqdm 

#from sklearn.preprocessing import LabelEncoder, StandardScaler 
#Library for data preprocessing
from sklearn.metrics import accuracy_score, precision_score, f1_score 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder, StandardScaler #Library for data preprocessing
from sklearn.model_selection import RandomizedSearchCV 

from sklearn.ensemble import RandomForestClassifier

In [3]:
#Reading the csv file and renamin columns
df_Histrogram = pd.read_csv('/content/EdgeHistogram.csv') 
df_Histrogram.columns = ['Image_ID'] + ['Feature_' + str(i) for i in range(1, 81)]
#Dropping the image id column
df_Histrogram.drop('Image_ID', axis=1, inplace=True) 
print(df_Histrogram)

      Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \
0             1          1          1          2          2          2   
1             2          2          1          0          3          0   
2             5          1          6          2          6          3   
3             0          0          0          0          0          0   
4             1          6          4          2          2          0   
...         ...        ...        ...        ...        ...        ...   
9139          5          3          1          4          1          2   
9140          0          0          5          1          2          1   
9141          3          3          2          1          2          0   
9142          0          0          0          0          0          0   
9143          5          3          0          1          1          3   

      Feature_7  Feature_8  Feature_9  Feature_10  ...  Feature_71  \
0             4          4          2    

In [4]:
#reading the image labels csv file and renaming the columns
df_Images = pd.read_csv('/content/Images.csv') 
df_Images.columns = ['Image_ID','Image_Label']
#Dropping the image id columns
df_Images.drop('Image_ID', axis=1, inplace=True) 
print(df_Images)

      Image_Label
0       binocular
1           chair
2            tick
3         minaret
4           Faces
...           ...
9139  cougar_face
9140    accordion
9141        Faces
9142  grand_piano
9143   Motorbikes

[9144 rows x 1 columns]


In [5]:
#Concatinating the labels with the image features and printing the dataframe
Concate_DF = pd.concat([df_Images,df_Histrogram], axis=1) 
print(Concate_DF)

      Image_Label  Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  \
0       binocular          1          1          1          2          2   
1           chair          2          2          1          0          3   
2            tick          5          1          6          2          6   
3         minaret          0          0          0          0          0   
4           Faces          1          6          4          2          2   
...           ...        ...        ...        ...        ...        ...   
9139  cougar_face          5          3          1          4          1   
9140    accordion          0          0          5          1          2   
9141        Faces          3          3          2          1          2   
9142  grand_piano          0          0          0          0          0   
9143   Motorbikes          5          3          0          1          1   

      Feature_6  Feature_7  Feature_8  Feature_9  ...  Feature_71  Feature_72  \
0     

In [6]:
#Creating a copy of the dataframe
DF = Concate_DF.copy(deep=True) 

In [7]:
#Checking for null values in the dataset
DF.isnull().sum() 

Image_Label    0
Feature_1      0
Feature_2      0
Feature_3      0
Feature_4      0
              ..
Feature_76     0
Feature_77     0
Feature_78     0
Feature_79     0
Feature_80     0
Length: 81, dtype: int64

In [8]:
# Statistical description of the dataset
DF.describe()

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_71,Feature_72,Feature_73,Feature_74,Feature_75,Feature_76,Feature_77,Feature_78,Feature_79,Feature_80
count,9144.0,9144.0,9144.0,9144.0,9144.0,9144.0,9144.0,9144.0,9144.0,9144.0,...,9144.0,9144.0,9144.0,9144.0,9144.0,9144.0,9144.0,9144.0,9144.0,9144.0
mean,1.588583,1.612423,2.036089,1.746938,1.93318,1.487314,2.370297,2.7465,2.348097,2.602253,...,1.634077,3.059274,3.076662,2.843176,3.06671,1.505359,2.272857,2.467192,2.250437,2.533574
std,1.73209,1.756488,2.108666,2.104001,2.045051,1.630754,1.931569,2.195378,2.135593,2.037395,...,1.628375,2.025536,2.191361,2.086794,2.002768,1.571015,1.964027,2.036798,2.171386,2.08214
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,2.0,2.0,...,1.0,3.0,3.0,3.0,3.0,1.0,2.0,2.0,2.0,2.0
75%,3.0,3.0,4.0,3.0,3.0,3.0,4.0,5.0,4.0,4.0,...,3.0,5.0,5.0,5.0,5.0,3.0,4.0,4.0,4.0,4.0
max,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0


In [9]:
#A global variable to keep a check for normalization
Global_Check_Preprocess = False 

def Normalize(DF):
  #Standardize features by removing the mean and scaling to unit variance
    Standardizer = StandardScaler() 
    #Fitting the data to the standardizer & return normalized dataframe
    DF = Standardizer.fit_transform(DF) 
    return DF 

#Option to normalize data or not
def Preprocess(DF): 
    Option_Select = int(input('Do you want to perform normalization on the dataset?\n1. Yes\n2. No\n'))
    #Using the global variable
    global Global_Check_Preprocess 
    #If user wants to normalize the data
    if Option_Select == 1: 
        Global_Check_Preprocess = True
        #Calling the normalize function 
        DF = Normalize(DF) 
    else: 
        pass 
        #return the preprocessed dataframe
    return DF 

In [10]:
#Calling the preprocess function
DF.iloc[:,1:] = Preprocess(DF.iloc[:,1:]) 
#Printing the global variable to see if the data is normalized or not
print('Global_Check_Preprocess:', Global_Check_Preprocess) 

Do you want to perform normalization on the dataset?
1. Yes
2. No
1
Global_Check_Preprocess: True


In [11]:
#Creating a dataframe to store the performance of the models
Gbl_Per_DF = pd.DataFrame(columns=['Training Image','Model','Parameters','Accuracy','Precision','F1 Score']) 

RF_Parameter = {'n_estimators': [int(x) for x in np.linspace(start = 3, stop = 14, num = 10)],
                'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
                'n_jobs': [-1]}

#Random Search 
RandomizedSrchCV_Params = {'n_iter': 10,
                                'cv': 3,
                                'verbose': 0,
                                'n_jobs': -1,
                                'random_state': 42,
                                'scoring': 'f1_macro'}

#Hyperparameter optimization of MLP and SVC Classifiers
def HyperparameterTuning(name,model,X_train, y_train, X_test, y_test):
    RandomizedSearchCV_Model = RandomizedSearchCV(model, RF_Parameter, **RandomizedSrchCV_Params)
    
    # Fit the random search model
    RandomizedSearchCV_Model.fit(X_train, y_train)
    #Print the best parameters of the model
    print('Best Parameters of the model:',name,'are:', RandomizedSearchCV_Model.best_params_)
    #Return the best parameters of the model 
    return  RandomizedSearchCV_Model.best_params_ 

#create dictionary of models
def get_models(): 
   # Adding required model to the dictionary
    models = dict()
  #Using Random Forest Classifier
    models['RFC'] = RandomForestClassifier()  
    return models

#Calling the get_models function
Models = get_models() 
# Get Performance of Model
def get_performance(name,model, X_train, y_train, X_test, y_test,Gbl_Per_DF,Image_In_Training):
  #Calling the hyperparameter tuning function
    BestParams = HyperparameterTuning(name,model,X_train, y_train, X_test, y_test)
    #Creating the model with best parameters 
    Model = RandomForestClassifier(**BestParams) 
    #Fitting the model
    Model.fit(X_train, y_train)
    #Predicting the labels of the test data
    y_pred = Model.predict(X_test) 
    #Calculating the accuracy of the model
    Accuracy = accuracy_score(y_test, y_pred)*100 
    Precision = precision_score(y_test, y_pred, average='macro', zero_division=1)*100 
    #Calculating the f1 score of the model
    F1_Score = f1_score(y_test, y_pred, average='macro', zero_division=1)*100 
    #Appending the evauation metrics to the dataframe
    Gbl_Per_DF = Gbl_Per_DF.append({'Training Image':Image_In_Training,'Model':Model,'Parameters':BestParams,'Accuracy':Accuracy,'Precision':Precision,'F1 Score':F1_Score}, ignore_index=True)
    #Return dataframe which has the performances of models
    return Gbl_Per_DF 
    

#Test size kept to 0.2, because we want the test data to be 20% of the total data.

#Keep this many images of each label in the train data
for Image_In_Training in [3,5,10,15]: 
#Printing the number of images in training set
    print('Number of images in training set:',Image_In_Training) 

    #Undersampling the data to a fixed number of images per class label
    DF_Undersampled = pd.DataFrame(columns=DF.columns) 
    #Creating a new dataframe to store the undersampled data
    for Class in DF['Image_Label'].unique():
      #considering the undersampled data to the dataframe
        DF_Undersampled = DF_Undersampled.append(DF[DF['Image_Label']==Class].sample(n=Image_In_Training, random_state=42), ignore_index=True) 
   # Shuffle the dataset
    DF_Undersampled = DF_Undersampled.sample(frac=1, random_state=42).reset_index(drop=True) 

    #Splitting the data into train and test
    xtrain, xtest, ytrain, ytest = train_test_split(DF_Undersampled.iloc[:,1:], DF_Undersampled['Image_Label'], test_size=0.05, random_state=42)

    #Iterating for every model & Calling the get_performance function for every model
    for name, model in Models.items():

        Gbl_Per_DF = get_performance(name,model, xtrain, ytrain, xtest, ytest, Gbl_Per_DF,Image_In_Training) 
#Sorting the dataframe by F1 Score
Gbl_Per_DF = Gbl_Per_DF.sort_values(by=['Training Image','F1 Score'], ascending=False).reset_index(drop=True) 

Number of images in training set: 3




Best Parameters of the model: RFC are: {'n_jobs': -1, 'n_estimators': 7, 'max_depth': 70}
Number of images in training set: 5
Best Parameters of the model: RFC are: {'n_jobs': -1, 'n_estimators': 12, 'max_depth': 70}
Number of images in training set: 10
Best Parameters of the model: RFC are: {'n_jobs': -1, 'n_estimators': 12, 'max_depth': 80}
Number of images in training set: 15
Best Parameters of the model: RFC are: {'n_jobs': -1, 'n_estimators': 12, 'max_depth': 80}


In [12]:
#Saving the dataframe as csv file
#Name of the csv file
NameOfCSV = 'Performance Of Models' 
if Global_Check_Preprocess == True:
  NameOfCSV += ' With Preprocessing'
else:
  NameOfCSV += ' Without Preprocessing'
NameOfCSV += '.csv'
#Saving the dataframe as csv file
Gbl_Per_DF.to_csv(NameOfCSV) 

In [13]:
#Function to display the performance of the best model
def Display_Stats_Of_Best_Model(DF,Index): 
    print('Best Model Number for 15 images per label:', Index+1)
    print('Model Used:', DF['Model'][Index])
    print('Precision:', DF['Precision'][Index])
    print('Accuracy:', DF['Accuracy'][Index])
    print('F1 Score:', DF['F1 Score'][Index])
    print()
#Displaying the performance of the top model with respect to the number of images in training set
Display_Stats_Of_Best_Model(Gbl_Per_DF,0) 

Best Model Number for 15 images per label: 1
Model Used: RandomForestClassifier(max_depth=80, n_estimators=12, n_jobs=-1)
Precision: 39.52991452991453
Accuracy: 13.157894736842104
F1 Score: 9.82905982905983

