In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read the input data
trainData = pd.read_csv('/content/drive/My Drive/Colab/255/Data/train.dat', ' ', header=None, float_precision='high')
testData= pd.read_csv('/content/drive/My Drive/Colab/255/Data/test.dat', ' ', header=None, float_precision='high')
labels = pd.read_csv('/content/drive/My Drive/Colab/255/Data/train.labels', header=None, names=['labels'])

In [4]:
#Normalizing the data.
scaler = StandardScaler()
train_x = StandardScaler().fit_transform(trainData)
test_x = StandardScaler().fit_transform(testData)

In [8]:
#Dimensionality Reduction

# pca = PCA(n_components=0.90) 
# pca_train_x = pca.fit_transform(train_x)
# pca_test_x = pca.transform(test_x)

#reduce dimensionality of train data 
featureSelector = SelectKBest(f_classif, k=48)
Xtrain = featureSelector.fit_transform(trainData,labels)
print(Xtrain.shape)
#reduce dimensionality of test data 
Xtest = featureSelector.transform(testData)
print(Xtest.shape)

(18000, 48)
(3000, 48)


In [9]:
#Handling unbalance data
spl = SMOTE(random_state=42,k_neighbors=1, n_jobs=1)
trainX = np.array(Xtrain)
trainY = np.array(labels)
x_result, y_result = spl.fit_sample(trainX, trainY)

In [10]:
x_result.shape

(88410, 48)

In [11]:
#Spliting Input data intot Training data and validation data.
training_data, validation_data,training_labels, validation_labels = train_test_split(x_result, y_result, test_size=0.2)

In [36]:
#Random Forest Classifier
# changes from 100 to 200 remove the random-state = 0
rfClf = RandomForestClassifier(n_estimators=100, random_state = 0, n_jobs=-1)
rfClf.fit(training_data, training_labels)
predictionsRF = rfClf.predict(validation_data)
f1ScoreRf = metrics.f1_score(predictionsRF,validation_labels, average='macro')
print(classification_report(validation_labels,predictionsRF))
print('Random Forest Classifier Testing F1 score =', f1ScoreRf)

              precision    recall  f1-score   support

           1       0.81      0.80      0.81      1803
           2       0.80      0.77      0.79      1801
           3       0.95      0.98      0.96      1802
           4       0.99      1.00      1.00      1722
           5       1.00      1.00      1.00      1725
           6       1.00      1.00      1.00      1751
           7       1.00      1.00      1.00      1786
           8       0.98      0.99      0.98      1781
          10       1.00      1.00      1.00      1784
          11       1.00      1.00      1.00      1727

    accuracy                           0.95     17682
   macro avg       0.95      0.95      0.95     17682
weighted avg       0.95      0.95      0.95     17682

Random Forest Classifier Testing F1 score = 0.9541933635487496


In [37]:
#KNN classifier
from sklearn.neighbors import KNeighborsClassifier
knnClf = KNeighborsClassifier(n_neighbors=3)
knnClf.fit(training_data,training_labels)
predictionsKnn = knnClf.predict(validation_data)
f1ScoreKnn = metrics.f1_score(predictionsKnn,validation_labels, average='macro')
print(classification_report(validation_labels,predictionsKnn))
print('K Nearest Neigbhor Testing F1 score =', f1ScoreKnn )

              precision    recall  f1-score   support

           1       0.84      0.68      0.75      1803
           2       0.78      0.72      0.75      1801
           3       0.88      0.99      0.93      1802
           4       0.97      1.00      0.99      1722
           5       0.99      1.00      0.99      1725
           6       1.00      1.00      1.00      1751
           7       1.00      1.00      1.00      1786
           8       0.91      1.00      0.95      1781
          10       1.00      1.00      1.00      1784
          11       1.00      1.00      1.00      1727

    accuracy                           0.94     17682
   macro avg       0.94      0.94      0.94     17682
weighted avg       0.94      0.94      0.93     17682

K Nearest Neigbhor Testing F1 score = 0.9358065214273672


In [38]:
#Extra Tree Classifier
extClf = ExtraTreesClassifier(n_estimators=500)
extClf.fit(training_data,training_labels)
predictionsExt = extClf.predict(validation_data)
f1ScoreExt = metrics.f1_score(predictionsExt,validation_labels, average='macro')
print(classification_report(validation_labels,predictionsExt))
print('Extra Tree Classifier Testing F1 score =', f1ScoreExt )

              precision    recall  f1-score   support

           1       0.83      0.82      0.83      1803
           2       0.83      0.81      0.82      1801
           3       0.97      0.99      0.98      1802
           4       1.00      1.00      1.00      1722
           5       1.00      1.00      1.00      1725
           6       1.00      1.00      1.00      1751
           7       1.00      1.00      1.00      1786
           8       0.98      1.00      0.99      1781
          10       1.00      1.00      1.00      1784
          11       1.00      1.00      1.00      1727

    accuracy                           0.96     17682
   macro avg       0.96      0.96      0.96     17682
weighted avg       0.96      0.96      0.96     17682

Extra Tree Classifier Testing F1 score = 0.9611614270044064


In [None]:
# #Extra Tree Classifier
# from sklearn import svm
# svmClf = svm.SVC()
# svmClf.fit(training_data,training_labels)
# predictionsExt = svmClf.predict(validation_data)
# f1ScoreExt = metrics.f1_score(predictionsExt,validation_labels, average='macro')
# print(classification_report(validation_labels,predictionsExt))
# print('Extra Tree Classifier Testing F1 score =', f1ScoreExt )

              precision    recall  f1-score   support

           1       0.35      0.40      0.37      1762
           2       0.33      0.20      0.25      1774
           3       0.30      0.38      0.34      1768
           4       0.45      0.30      0.36      1767
           5       0.60      0.50      0.55      1805
           6       0.97      1.00      0.98      1791
           7       0.78      0.83      0.80      1752
           8       0.34      0.45      0.39      1724
          10       0.99      1.00      1.00      1800
          11       0.82      0.87      0.84      1739

    accuracy                           0.59     17682
   macro avg       0.59      0.59      0.59     17682
weighted avg       0.59      0.59      0.59     17682

Extra Tree Classifier Testing F1 score = 0.5873791877426872


In [39]:
#voting Classifier.
votingClf = VotingClassifier(estimators=[('rf', rfClf), ('knn', knnClf), ('ext', extClf)], voting='hard')
votingClf.fit(training_data,training_labels)
predictionsVoting = votingClf.predict(validation_data)
f1score = metrics.f1_score(predictionsVoting,validation_labels, average='macro')
print(classification_report(validation_labels,predictionsVoting))
print(' Voting F1 score =', f1score )

              precision    recall  f1-score   support

           1       0.83      0.82      0.82      1803
           2       0.83      0.79      0.81      1801
           3       0.96      0.99      0.97      1802
           4       1.00      1.00      1.00      1722
           5       1.00      1.00      1.00      1725
           6       1.00      1.00      1.00      1751
           7       1.00      1.00      1.00      1786
           8       0.98      1.00      0.99      1781
          10       1.00      1.00      1.00      1784
          11       1.00      1.00      1.00      1727

    accuracy                           0.96     17682
   macro avg       0.96      0.96      0.96     17682
weighted avg       0.96      0.96      0.96     17682

 Voting F1 score = 0.9587869142684035


In [42]:
#Applying voting Classifier for whole training dataset.
votingClfFinal = VotingClassifier(estimators=[('rf', rfClf), ('knn', knnClf), ('ext', extClf)], voting='hard')
votingClfFinal.fit(x_result,y_result)

VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
        

In [43]:
#predicting for the test data set.
predictions = votingClfFinal.predict(Xtest)

In [44]:
#printing the submission file.
ItemID=[]
for i in range(1,testData.shape[0]+1):
    ItemID.append(i);
with open('output.csv', 'w') as fw:
    print("{0}{1}{2}".format('ImageID',',','Class'),file=fw)
    for i,v in zip(ItemID,predictions):
        print("{0}{1}{2}".format(i,',',v), file=fw)