In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# plots
import seaborn as sns
import matplotlib.pyplot as plt

#classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

#train test split
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

#feature selection
from sklearn.feature_selection import SelectKBest, chi2 , f_classif , f_regression

%matplotlib inline

In [2]:
# Load the raw data: one CSV with the pixel features, one with the class labels.
df_features = pd.read_csv('x_train_gr_smpl.csv')
df_classes = pd.read_csv('y_train_smpl.csv')

# The label file has a single column; give it a meaningful header.
df_classes.columns = ['class']

# Put features and label side by side (rows are aligned on the index).
df_concat = pd.concat((df_features, df_classes), axis='columns')

FileNotFoundError: [Errno 2] File b'x_train_gr_smpl.csv' does not exist: b'x_train_gr_smpl.csv'

In [None]:
df_concat.head(10)

In [None]:
df_concat['class'].value_counts() #find how many pictures each class has

In [None]:
np.random.seed(0) #use the same random sample each time

In [None]:
class7_lenght = len(df_concat[df_concat['class']==7]) #take the lenght of the smallest class (class7)
indices_class7 = df_concat[df_concat['class']==7].index #take the indices of the class7 images

In [None]:
# random indices class 5
indices_class5 = df_concat[df_concat['class']==5].index #take the indices of the class5 images
random_class5_indices = np.random.choice(indices_class5,class7_lenght, replace=False) #take 240 random indices from class5

In [None]:
# random indices class 4
indices_class4 = df_concat[df_concat['class']==4].index
random_class4_indices = np.random.choice(indices_class4,class7_lenght, replace=False)

In [None]:
# random indices class 8
indices_class8 = df_concat[df_concat['class']==8].index
random_class8_indices = np.random.choice(indices_class8,class7_lenght, replace=False)

In [None]:
# random indices class 1
indices_class1 = df_concat[df_concat['class']==1].index
random_class1_indices = np.random.choice(indices_class1,class7_lenght, replace=False)

In [None]:
# random indices class 0
indices_class0 = df_concat[df_concat['class']==0].index
random_class0_indices = np.random.choice(indices_class0,class7_lenght, replace=False)

In [None]:
# random indices class 3
indices_class3 = df_concat[df_concat['class']==3].index
random_class3_indices = np.random.choice(indices_class3,class7_lenght, replace=False)

In [None]:
# random indices class 6
indices_class6 = df_concat[df_concat['class']==6].index
random_class6_indices = np.random.choice(indices_class6,class7_lenght, replace=False)

In [None]:
# random indices class 2
indices_class2 = df_concat[df_concat['class']==2].index
random_class2_indices = np.random.choice(indices_class2,class7_lenght, replace=False)

In [None]:
# random indices class 9
indices_class9 = df_concat[df_concat['class']==9].index
random_class9_indices = np.random.choice(indices_class9,class7_lenght, replace=False)

In [None]:
#concatinate random indices from all classes to one array
indices_combined = np.concatenate([random_class5_indices,random_class4_indices,random_class8_indices,random_class1_indices, random_class0_indices,random_class3_indices, random_class6_indices,random_class2_indices, random_class9_indices, indices_class7])

In [None]:
len(indices_combined) #random indices array length (10*240)

In [None]:
#find all these random indices from the df_concat dataframe and put them in a new dataframe df_balanced
#now we have a balanced dataframe with 240 images of each class
df_balanced = df_concat.loc[indices_combined] 

In [None]:
df_balanced.reset_index(inplace = True) #reset the indices of the new dataframe

In [None]:
dataset_big = df_balanced.drop(['index'],axis=1) #drop the index column from the new dataframe

In [None]:
dataset_big.head(10)

In [None]:
dataset_big.values # with .values we are taking the dataframe as an array

In [None]:
X = dataset_big.iloc[:,:-1].values #we drop the last value which is the class to create a features dataframe
y = dataset_big.iloc[:,2304].values #we keep only the last value to create a class dataframe

In [None]:
min_max = MinMaxScaler() #initialize min_max as a MinMaxScaler()
X_scaled = min_max.fit_transform(X) #we perform a min-max scale to features in order to have values from 0 to 1

In [None]:
X_scaled # the values after the transformation

In [None]:
#we split the sample to train and test with an 80-20 analogy
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y) 

In [None]:
#ploting histograms to see if the data follows normal distibution
plt.hist(X_scaled[50])
plt.show()

In [None]:
#we also tried the GaussianNB to see its results even though we knew that the MultinominalNB fits our model best
classifier_ga = GaussianNB() #initialize classifier_ga as a GaussianNB()
classifier_ml = MultinomialNB() #initialize classifier_ml as a MultinomialNB()
#train the model with classifier_ga
classifier_ga.fit(X_train,y_train)
#test the model with classifier_ga
y_pred_ga = classifier_ga.predict(X_test)
cm = confusion_matrix(y_test,y_pred_ga)

In [None]:
cm #confusion matrix 

In [None]:
#we transform the confusion matrix to a heatmap
fig, ax = plt.subplots(figsize=(5,5))
ax = sns.heatmap(cm,annot=True,cmap="RdPu",fmt=".0f",cbar=False)
plt.show()

In [None]:
#classidication report
print(classification_report(y_test,y_pred_ga))

In [None]:
# ten fold cross validation
# scoring='precision' , scoring='recall' , scoring='f1'
accuracy = cross_val_score(estimator = classifier_ga, X = X_scaled, y = y, cv =10)

In [None]:
#accuracy.mean after running the model for 10 times 
accuracy.mean()

In [None]:
#train the model with classifier_ml
classifier_ml.fit(X_train,y_train)
#test the model with classifier_ml
y_pred_ml = classifier_ml.predict(X_test)
cm_ml = confusion_matrix(y_test,y_pred_ml)

#we transform the confusion matrix to a heatmap
fig, ax = plt.subplots(figsize=(5,5))
ax = sns.heatmap(cm_ml,annot=True,cmap="RdPu",fmt=".0f",cbar=False)
plt.show()

In [None]:
#classidication report
print(classification_report(y_test,y_pred_ml))

In [None]:
# ten fold cross validation
# scoring='precision' , scoring='recall' , scoring='f1'
accuracy = cross_val_score(estimator = classifier_ml, X = X_scaled, y = y, cv =10)

In [None]:
#accuracy.mean after running the model for 10 times 
accuracy.mean()

In [None]:
# For every class, build a balanced one-vs-rest dataset and record the k best
# pixels for every k in `best`.
# `columns` ends up as a flat list of lists in class-major order:
# [class0/k=2, class0/k=5, class0/k=10, class1/k=2, ...]
best = [2, 5 ,10]
columns = []

# In each binary label file the images of the class of interest are labelled 0
# and the images of all other classes are labelled 1.
for x in range (0,10):
    file_name = "y_train_smpl_" + str(x) + ".csv" # binary label file of this class
    df_class = pd.read_csv(file_name)
    df_class.columns = ['class'] # rename the single header to 'class'
    df_concatBinary = pd.concat([df_features,df_class],axis=1) # features + binary label

    indices_class = df_concatBinary[df_concatBinary['class']==0].index # images of this class
    index_class_non = df_concatBinary[df_concatBinary['class']==1].index # images of all other classes

    # Sample as many "other" images as there are images of THIS class. The size
    # is taken from this class's binary labels; taking it from the multi-class
    # frame (class 0) would use the same size for every class. It is capped at
    # the number of available "other" images so that sampling without
    # replacement cannot fail.
    class_ = min(len(indices_class), len(index_class_non))
    random_class_non_indices = np.random.choice(index_class_non, class_, replace=False)
    indices_combined_class = np.concatenate([indices_class, random_class_non_indices])

    # Keep only the chosen rows, shuffle them and renumber the rows from 0.
    dataset_class = (
        df_concatBinary.loc[indices_combined_class]
        .sample(frac=1)
        .reset_index(drop=True)
    )

    # Loop-local names are used on purpose so the multi-class X / y created
    # earlier in the notebook are not overwritten.
    X_binary = dataset_class.drop(columns='class') # pixel features
    y_binary = dataset_class['class'] # binary label

    for value in best:
        selector = SelectKBest(f_regression, k=value) # univariate F-test between each pixel and the label
        selector.fit(X_binary, y_binary)
        # Read the selected headers from the frame itself instead of assuming
        # that a column's name equals its position.
        selected = X_binary.columns[selector.get_support()]
        columns.append([str(name) for name in selected]) # one list of headers per (class, k)


In [None]:
# `columns` holds one list of headers per (class, k) pair, class by class, with
# the k values in the order of `best`. Every len(best)-th entry therefore
# belongs to the same k; slicing with that stride separates them without
# hard-coding the total number of entries.
n_k = len(best)
top2 = columns[0::n_k]  # 2 best pixels of every class
top5 = columns[1::n_k]  # 5 best pixels of every class
top10 = columns[2::n_k] # 10 best pixels of every class



In [None]:


A = dataset_big #take the original FULL dataframe
top2X = pd.DataFrame() #create an empty dataframe for storing the top 2 correlated pixels
top5X = pd.DataFrame() #create an empty dataframe for storing the top 5 correlated pixels
top10X = pd.DataFrame() #create an empty dataframe for storing the top 10 correlated pixels

for val in top2:   #take array for each class containing two pixel labels
    for x in val:       #take the 2 ints from each val array
        column = A[str(x)]    # find the column of values corresponding to that pixel
        top2X[str(x)] = column    # add that column to the new dataframe with the same label
top2X['class'] = A.iloc[:,2304].values    #adding the class colum to the completed dataframe

for val in top5:   #take array for each class containing two pixel labels
    for x in val:       #take the 2 ints from each val array
        column = A[str(x)]    # find the column of values corresponding to that pixel
        top5X[str(x)] = column    # add that column to the new dataframe with the same label
top5X['class'] = A.iloc[:,2304].values    #adding the class colum to the completed dataframe

for val in top10:   #take array for each class containing two pixel labels
    for x in val:       #take the 2 ints from each val array
        column = A[str(x)]    # find the column of values corresponding to that pixel
        top10X[str(x)] = column    # add that column to the new dataframe with the same label
top10X['class'] = A.iloc[:,2304].values    #adding the class colum to the completed dataframe

X2 = top2X.iloc[:,:-1].values     #we drop the last value which is the class to create a features dataframe
Y2 = top2X.iloc[:,20].values    #we keep only the last value to create a class dataframe
X_train, X_test, y_train, y_test = train_test_split(X2, Y2, test_size=0.2, stratify=Y2) #creating testing and training sets

#train the model with classifier_ml
classifier_ml.fit(X_train,y_train)
#test the model with classifier_ml
y_pred_ml = classifier_ml.predict(X_test)
cm_ml = confusion_matrix(y_test,y_pred_ml)

#we transform the confusion matrix to a heatmap
fig, ax = plt.subplots(figsize=(5,5))
ax = sns.heatmap(cm_ml,annot=True,cmap="RdPu",fmt=".0f",cbar=False)
plt.show()


In [None]:
print(classification_report(y_test,y_pred_ml))

In [None]:
X5 = top5X.iloc[:,:-1].values     #we drop the last value which is the class to create a features dataframe
Y5 = top5X.iloc[:,50].values    #we keep only the last value to create a class dataframe
X_train, X_test, y_train, y_test = train_test_split(X5, Y5, test_size=0.2, stratify=Y2) #creating testing and training sets

#train the model with classifier_ml
classifier_ml.fit(X_train,y_train)
#test the model with classifier_ml
y_pred_ml = classifier_ml.predict(X_test)
cm_ml = confusion_matrix(y_test,y_pred_ml)

#we transform the confusion matrix to a heatmap
fig, ax = plt.subplots(figsize=(5,5))
ax = sns.heatmap(cm_ml,annot=True,cmap="RdPu",fmt=".0f",cbar=False)
plt.show()

In [None]:
print(classification_report(y_test,y_pred_ml))


In [None]:
X10 = top10X.iloc[:,:-1].values     #we drop the last value which is the class to create a features dataframe
Y10 = top10X.iloc[:,95].values    #we keep only the last value to create a class dataframe
X_train, X_test, y_train, y_test = train_test_split(X10, Y10, test_size=0.2, stratify=Y2) #creating testing and training sets

#train the model with classifier_ml
classifier_ml.fit(X_train,y_train)
#test the model with classifier_ml
y_pred_ml = classifier_ml.predict(X_test)
cm_ml = confusion_matrix(y_test,y_pred_ml)

#we transform the confusion matrix to a heatmap
fig, ax = plt.subplots(figsize=(5,5))
ax = sns.heatmap(cm_ml,annot=True,cmap="RdPu",fmt=".0f",cbar=False)
plt.show()

In [None]:
print(classification_report(y_test,y_pred_ml))