In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)

In [2]:
#Training Sets
X = pd.read_csv('train_data.csv')
y = pd.read_csv('train_labels.csv')
#Testing Sets
ActualX = pd.read_csv('test_data.csv')

#Convert train_labels from string -> numbers
vals_to_replace = {'ARIAL':0, 'TIMES':1, 'SERIF':2, 'CAMBRIA': 3, 'CALIBRI': 4,'TAHOMA': 5 }
y['Font'] = y['Font'].map(vals_to_replace)

#Split training data into Training, Testing subsets; Introduce randomness in them with shuffle and random_state
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True,random_state=100)

#To preserve orginal y_test shape for accuracy_score during Test
aftery_test = y_test 

In [3]:
X_train_rc = X_train.iloc[:, 7:len(X_train)]/255
X_train_cat = X_train.iloc[:, 1:3]
X_train_num = X_train.iloc[:, np.r_[0,3:7]]

X_test_rc = X_test.iloc[:, 7:len(X_test)]/255
X_test_cat = X_test.iloc[:, 1:3]
X_test_num = X_test.iloc[:, np.r_[0,3:7]]

ActualX_rc = ActualX.iloc[:, 7:len(ActualX)]/255
ActualX_cat = ActualX.iloc[:, 1:3]
ActualX_num = ActualX.iloc[:, np.r_[0,3:7]]

In [4]:
#changing shape of y
y = y.values.ravel()

#split training data into 2
from sklearn.model_selection import train_test_split
# Split data into train and test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True,random_state=100)

#segregate features ; Normalized
X_train_rc = X_train.iloc[:, 7:len(X_train)]/255
X_train_core = X_train.iloc[:, 0:7]

X_test_rc = X_test.iloc[:, 7:len(X_test)]/255
X_test_core = X_test.iloc[:, 0:7]

In [5]:
import skimage
import skimage.feature

#creating empty arrays
ed_X_train_rc = np.zeros((len(X_train_rc),20,20))
ed_X_test_rc = np.zeros((len(X_test_rc),20,20))
ed_ActualX_rc = np.zeros((len(ActualX_rc),20,20))

def imgprep(df, newarr):
    for i in range(len(df)):
        img= np.array(df.iloc[i]).reshape(20,20)
        ed = skimage.feature.canny(image= img,sigma = 0.15)
        newarr[i] = ed    

#training set, testing set, actual test set
imgprep(X_train_rc, ed_X_train_rc)
imgprep(X_test_rc, ed_X_test_rc)
imgprep(ActualX_rc, ed_ActualX_rc)

# Pre-processing Categorical Data

In [6]:
# Change 2) Categorical Data into [0,1] - i.e. only 'strength' column
str_bool_replace = {0.4:0, 0.7:1}
X_train_cat["strength"] = X_train_cat["strength"].map(str_bool_replace)
X_test_cat["strength"] = X_test_cat["strength"].map(str_bool_replace)
ActualX_cat["strength"] = ActualX_cat["strength"].map(str_bool_replace)

# Change 3) Numerical Data into the range [0,1]
from sklearn.preprocessing import MinMaxScaler
traincs = MinMaxScaler(feature_range =(0, 1))
X_train_num = traincs.fit_transform(X_train_num)
X_test_num = traincs.transform(X_test_num)  
ActualX_num = traincs.transform(ActualX_num)  

#Concatenating 2) Categorical and 3) Numerical Tgt
X_train_core = np.hstack([X_train_num, X_train_cat])
X_test_core = np.hstack([X_test_num, X_test_cat])
ActualX_core = np.hstack([ActualX_num, ActualX_cat])

## Ensemble Base Models for Categorical Inputs

In [68]:
newabc_core = AdaBoostClassifier(DecisionTreeClassifier(random_state=30),n_estimators=300, learning_rate=0.7)
# Train Adaboost Classifer
newadamodel_core = newabc_core.fit(X_train_core, y_train)
newadamodel_core.score(X_test_core, y_test)

0.8668717948717949

In [11]:
rf_core = RandomForestClassifier(n_estimators=800)
rfmodel_core = rf_core.fit(X_train_core, y_train)
rfmodel_core.score(X_test_core, y_test)

0.8597948717948718

In [12]:
et_core = ExtraTreesClassifier(n_estimators=800)
etmodel_core = rf_core.fit(X_train_core, y_train)
etmodel_core.score(X_test_core, y_test)

0.8602564102564103

In [80]:
from xgboost import XGBClassifier
xgbc_core = XGBClassifier(n_estimators=4000,learning_rate = 0.7)
xgbcmodel_core = xgbc_core.fit(X_train_core, y_train)
xgbcmodel_core.score(X_test_core, y_test)

0.8826666666666667

# Voting Classifier

In [14]:
from sklearn.ensemble import VotingClassifier 

In [97]:
eclf3 = VotingClassifier(estimators=[
       ('lr', newadamodel_core), ('rf', rfmodel_core), ('gnb', etmodel_core), ('xgbc', xgbcmodel_core)],
        voting='soft', 
        weights=[4,2,2.5,4],
       flatten_transform=True)

eclf3 = eclf3.fit(X_train_core, y_train)

In [98]:
y_pred = eclf3.predict(X_test_core) 
from sklearn.metrics import accuracy_score  
# using accuracy_score 
score = accuracy_score(y_test, y_pred) 

In [99]:
y_pred

array([4, 0, 0, ..., 0, 5, 0])

In [100]:
score

0.884051282051282

# Submission 

In [101]:
#actualresults = eclf3.predict(ActualX_core)
actualresults = xgbcmodel_core.predict(ActualX_core)

In [102]:
#Convert labels from 'int' back to 'str'
stractualresults = []
for i in range(len(actualresults)):
    if actualresults[i] == 0:
        stractualresults.append('ARIAL')
    elif actualresults[i] == 1:
        stractualresults.append('TIMES')
    elif actualresults[i] == 2:
        stractualresults.append('SERIF')
    elif actualresults[i] == 3:
        stractualresults.append('CAMBRIA')
    elif actualresults[i] == 4: 
        stractualresults.append('CALIBRI')
    elif actualresults[i] == 5:
        stractualresults.append('TAHOMA')

In [103]:
pdresults = pd.Series(stractualresults,name="Font")
newsubmission = pd.concat([pd.Series(range(1,29222),name = "ID"),pdresults],axis = 1)
newsubmission.to_csv("12_10_88.csv",index=False)
newsubmission

Unnamed: 0,ID,Font
0,1,ARIAL
1,2,SERIF
2,3,TAHOMA
3,4,TAHOMA
4,5,TAHOMA
...,...,...
29216,29217,TAHOMA
29217,29218,TIMES
29218,29219,ARIAL
29219,29220,TAHOMA
