# Image Classification

In [1]:
# Image Classification
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, VotingClassifier
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
# Reading traning data, labels and test data.
train_data = pd.read_csv('train.dat', ' ', header=None, float_precision='high')
test_data= pd.read_csv('test.dat', ' ', header=None, float_precision='high')
train_labels = pd.read_csv('train.labels', header=None, names=['labels'])

In [3]:
test_data.shape

(5296, 887)

In [4]:
train_data.shape

(21186, 887)

# Feature Selection

In [5]:
# Feature selection using SelectKBest with score function as f_classify
featureSelector = SelectKBest(f_classif, k=48)
X_train = featureSelector.fit_transform(train_data,train_labels)
print(X_train.shape)
# Transforming test data
X_test = featureSelector.transform(test_data)
print(X_test.shape)

(21186, 48)
(5296, 48)


# Handling Unbalanced data

In [7]:
#For unbalanced data using smote technique
spl = SMOTE(random_state=42,k_neighbors=1, n_jobs=1)
X_smote = np.array(X_train)
y_smote = np.array(train_labels)
X, y = spl.fit_resample(X_smote, y_smote)

In [8]:
X.shape

(103750, 48)

# Splitting the data 

In [9]:
#Spliting Input data intot Training data and validation data in the ratio of 70:30.
trainset_X,valid_X,trainset_y,valid_y = train_test_split(X,y, test_size=0.3)

# DecisionTree Classifier

In [14]:
#Decision Tree Classifier
alg_DF = DecisionTreeClassifier(random_state = 0)
alg_DF.fit(trainset_X, trainset_y)
pred_DF = alg_DF.predict(valid_X)
score_DF = metrics.f1_score(pred_DF,valid_y, average='macro')

print('F1 score for Decision Tree Classifier:', score_DF)

F1 score for Decision Tree Classifier: 0.9050744032097476


# Random Forest Classifier

In [37]:
#Random Forest Classifier
alg_RF = RandomForestClassifier(n_estimators=120, random_state = 0)
alg_RF.fit(trainset_X, trainset_y)
pred_RF = alg_RF.predict(valid_X)
score_RF = metrics.f1_score(pred_RF,valid_y, average='macro')

print('F1 score for Random Forest Classifier:', score_RF)

F1 score for Random Forest Classifier: 0.9518800815972736


# KNN Classifier

In [64]:
#KNN classifier with neighbor count of 3
from sklearn.neighbors import KNeighborsClassifier
alg_KNN = KNeighborsClassifier(n_neighbors=3)
alg_KNN.fit(trainset_X, trainset_y)
pred_KNN = alg_KNN.predict(valid_X)
score_KNN = f1_score(pred_KNN,valid_y, average='macro')
print("F1 score for K-Nearest Neighbour classifier :",score_KNN)

F1 score for K-Nearest Neighbour classifier : 0.9432464135213554


# Extra Tree Classifier

In [11]:
# Extra Tree Classifier
alg_ext = ExtraTreesClassifier(n_estimators=650)
alg_ext.fit(trainset_X, trainset_y)
pred_Ext = alg_ext.predict(valid_X)
score_Ext = metrics.f1_score(pred_Ext,valid_y, average='macro')
print('F1 score for Extra Tree Classifier :', score_Ext )


F1 score for Extra Tree Classifier : 0.9611535069753913


# AdaBoost Classifier

In [20]:
# Abadoost classifier with base estimator as extra tree classifier
alg_AB = AdaBoostClassifier(base_estimator=alg_ext, random_state=0)
alg_AB.fit(trainset_X, trainset_y)
pred_AB = alg_AB.predict(valid_X)
score_AB = metrics.f1_score(pred_AB,valid_y, average='macro')
print('ADA BOOST Classifier Testing F1 score :', score_AB )


ADA BOOST Classifier Testing F1 score : 0.9605631285359773


training the entire dataset using best working algorithm

In [17]:
# Fiting the algorithm for entire data
final_alg = ExtraTreesClassifier(n_estimators=650)
final_alg.fit(X, y)


AdaBoostClassifier(base_estimator=ExtraTreesClassifier(n_estimators=650),
                   n_estimators=650, random_state=0)

# Predicting labels for test data

In [18]:
# Predicting for test data
pred_final = final_alg.predict(X_test)

In [19]:
# insert prediction to Output file 
pred_final.tofile('output_ext2.dat', sep="\n", format="%d")
