In [3]:
import pandas as pd
import numpy as np
# Load the training set and preview its first rows.
raw_data = pd.read_csv('data/train.csv')
raw_data.head()

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0,0.354512,0.350839,0.465761,0.781142,clear,Ghoul
1,1,0.57556,0.425868,0.531401,0.439899,green,Goblin
2,2,0.467875,0.35433,0.811616,0.791225,black,Ghoul
3,4,0.776652,0.508723,0.636766,0.884464,black,Ghoul
4,5,0.566117,0.875862,0.418594,0.636438,green,Ghost


In [4]:
# Summarize column dtypes, non-null counts and memory usage.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371 entries, 0 to 370
Data columns (total 7 columns):
id               371 non-null int64
bone_length      371 non-null float64
rotting_flesh    371 non-null float64
hair_length      371 non-null float64
has_soul         371 non-null float64
color            371 non-null object
type             371 non-null object
dtypes: float64(4), int64(1), object(2)
memory usage: 20.4+ KB


In [37]:
try:
    from sklearn.model_selection import GridSearchCV  # scikit-learn >= 0.18
except ImportError:  # older releases only ship the legacy module
    from sklearn.grid_search import GridSearchCV
def clean_data(df):
    """Return a new frame in which 'color' is replaced by indicator columns.

    The indicator columns are named ``color_<value>`` and are appended after
    the remaining original columns. The frame passed in is not modified.
    """
    color_dummies = pd.get_dummies(df['color'], prefix='color')
    without_color = df.drop('color', axis=1)
    return pd.concat([without_color, color_dummies], axis=1)


def grid_search(estimator, params, X, y, v=3, cv=5):
    """Run a cross-validated grid search and return the best parameters.

    Parameters
    ----------
    estimator : estimator object handed to ``GridSearchCV``.
    params : dict mapping parameter names to lists of candidate values.
    X, y : training features and target labels.
    v : int, verbosity level forwarded as ``verbose``.
    cv : int, number of cross-validation folds (5 by default, the value that
        used to be hard-coded).

    Returns
    -------
    dict
        The ``best_params_`` of the fitted search.

    The best cross-validated score is printed as a side effect.
    """
    # Named ``search`` so the local no longer shadows this function's own name.
    search = GridSearchCV(estimator, params, cv=cv, verbose=v, n_jobs=-1)
    search.fit(X, y)
    # The former ``sorted(search.grid_scores_, ...)`` statement was dropped: its
    # result was discarded, and ``grid_scores_`` does not exist in
    # scikit-learn >= 0.20, so the call raised AttributeError there.
    print("Best score equals to " + str(search.best_score_))
    return search.best_params_


# One-hot encode 'color', then separate the target from the feature columns.
cdf = clean_data(raw_data)
# Encode the monster type as an integer class label.
y = cdf['type'].map({'Ghoul':0, 'Goblin':1, 'Ghost':2}).astype(int).values
cdf = cdf.drop('type', axis=1)
print(cdf.head())
# Feature matrix; column 0 still holds 'id'.
X = cdf.values

   id  bone_length  rotting_flesh  hair_length  has_soul  color_black  \
0   0     0.354512       0.350839     0.465761  0.781142          0.0   
1   1     0.575560       0.425868     0.531401  0.439899          0.0   
2   2     0.467875       0.354330     0.811616  0.791225          1.0   
3   4     0.776652       0.508723     0.636766  0.884464          1.0   
4   5     0.566117       0.875862     0.418594  0.636438          0.0   

   color_blood  color_blue  color_clear  color_green  color_white  
0          0.0         0.0          1.0          0.0          0.0  
1          0.0         0.0          0.0          1.0          0.0  
2          0.0         0.0          0.0          0.0          0.0  
3          0.0         0.0          0.0          0.0          0.0  
4          0.0         0.0          0.0          1.0          0.0  


In [38]:
from sklearn.ensemble import RandomForestClassifier
# Hyper-parameter grid for the random forest.
rfc_params = {
    'max_features': [0.5, 1.],
    # Depth must be an integer (or None for unlimited); recent scikit-learn
    # releases validate parameter types and reject a float such as 5.0.
    'max_depth': [5, None],
    'n_estimators': [75, 100, 125]
}
rfc = RandomForestClassifier()
# Column 0 of X is 'id', so it is left out of the features.
best_rfc_params = grid_search(rfc, rfc_params, X[:, 1:], y, 1)
# Refit on the full training set with the best parameters found
# (the dict keys are exactly the three tuned constructor arguments).
rfc = RandomForestClassifier(**best_rfc_params)
rfc.fit(X[:, 1:], y)
# Let's see which features are the most important
importances = rfc.feature_importances_
df_header = cdf.drop('id', axis=1).columns.values
print('Features importances:')
for index, importance in enumerate(importances):
    print("%d. %s = %.2f%%" % (index + 1, df_header[index], importance * 100))

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.2s finished


Best score equals to 0.7196765498652291
Features importances:
1. bone_length = 14.74%
2. rotting_flesh = 15.21%
3. hair_length = 37.43%
4. has_soul = 30.38%
5. color_black = 0.57%
6. color_blood = 0.11%
7. color_blue = 0.31%
8. color_clear = 0.58%
9. color_green = 0.26%
10. color_white = 0.42%


In [None]:
# From the previous cell we see that color is not a significant feature for classification.
# So, let's try to perform classification without the color features.

In [39]:
# Repeat the random-forest search on columns 1-4 only (the four numeric
# features), leaving out 'id' and the one-hot color columns.
rfc = RandomForestClassifier()
best_rfc_params = grid_search(rfc, rfc_params, X[:, 1:5], y, 1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.2s finished


Best score equals to 0.7250673854447439


In [47]:
# There is a small improvement in accuracy.
# Now let's try to perform classification using LogisticRegression with and without color features.
from sklearn.linear_model import LogisticRegression
# Candidate hyper-parameters for the logistic-regression grid search.
lrc_params = dict(
    C=[1, 10, 100, 1000],
    class_weight=['balanced', None],
    solver=['newton-cg', 'sag', 'lbfgs'],
    max_iter=[500],
    multi_class=['ovr', 'multinomial'],
)
lrc = LogisticRegression()
# Search over every feature except 'id' (column 0), color indicators included.
best_lrc_params = grid_search(estimator=lrc, params=lrc_params, X=X[:, 1:], y=y, v=1)


Fitting 5 folds for each of 48 candidates, totalling 240 fits


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backen

Best score equals to 0.7358490566037735


[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:    1.2s finished


In [48]:
# Same logistic-regression search, restricted to columns 1-4 (no color indicators).
best_lrc_params = grid_search(estimator=lrc, params=lrc_params, X=X[:, 1:5], y=y, v=1)


Fitting 5 folds for each of 48 candidates, totalling 240 fits


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backen

Best score equals to 0.7547169811320755


[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:    0.7s finished


In [49]:
# LogisticRegression scored higher than the random forest in cross-validation,
# so we use it to make a submission.
# Every tuned parameter is passed on (including 'multi_class', which used to be
# left out) so the final model matches the cross-validated best configuration.
model = LogisticRegression(**best_lrc_params)
model.fit(X[:, 1:5], y)
test_data = pd.read_csv('data/test.csv')
ctdf = clean_data(test_data)
# Columns 1-4 are the same four numeric features the model was trained on.
predicted = model.predict(ctdf.values[:, 1:5])
# Two rows: the test ids (selected by name rather than by position) and the
# predicted class codes.
r = np.array([test_data['id'].values.astype(int), predicted.astype(int)])

df_result = pd.DataFrame(r.T, columns=['id', 'type'])
# Translate the integer class codes back to the monster type names.
df_result['type'] = df_result['type'].map({0:'Ghoul', 1:'Goblin', 2:'Ghost'})

   id  bone_length  rotting_flesh  hair_length  has_soul  color
0   3     0.471774       0.387937     0.706087  0.698537  black
1   6     0.427332       0.645024     0.565558  0.451462  white
2   9     0.549602       0.491931     0.660387  0.449809  black
3  10     0.638095       0.682867     0.471409  0.356924  white
4  13     0.361762       0.583997     0.377256  0.276364  black


In [50]:
# Write the submission file without the DataFrame index column.
df_result.to_csv('results/monsters_1-0.csv', index=False)