In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

train_data = pd.read_csv('Dataset/train.csv', index_col='pet_id')
test_data = pd.read_csv('Dataset/test.csv', index_col='pet_id')

In [92]:
sum(train_data.condition.isnull())

1477

In [106]:
from sklearn.preprocessing import LabelEncoder


np.random.seed(42)
encoder = LabelEncoder()

train_data['color_type_encoded'] = encoder.fit(train_data.color_type)
def preprocess(data):

    data['color_type_encoded'] = encoder.transform(data.color_type)
    
    data_condition=np.random.choice(np.arange(0, 3), p=[6281/17357,6819/17357,4257/17357], size=sum(data.condition.isnull()))
    data['condition_filled'] = 0
    data.loc[data.condition.isnull(), 'condition_filled'] = 1
    data.loc[data.condition.isnull(), 'condition'] = data_condition

    data['issue_date'] = pd.to_datetime(data.issue_date)
    data['listing_date'] = pd.to_datetime(data.listing_date)

    data['issue_year'] = data.issue_date.dt.year
    data['issue_month'] = data.issue_date.dt.month
    data['issue_day'] = data.issue_date.dt.day

    data['listing_year'] = data.listing_date.dt.year
    data['listing_month'] = data.listing_date.dt.month
    data['listing_day'] = data.listing_date.dt.day

    data['total_days'] = np.array(data.listing_date-data.issue_date).astype('timedelta64[D]').astype('int64')
    
    data['length(cm)'] = data['length(m)']*100
    data['rec_area'] = data['length(cm)'] * data['height(cm)']
    
    
    return data  #sort_values(by='issue_date')

In [107]:
train_data = preprocess(train_data)
test_data = preprocess(test_data)

In [96]:
condition_features = ['length(cm)', 'height(cm)','rec_area', 'X1', 'X2', 'color_type_encoded',
           'issue_year', 'issue_month', 'issue_day', 'listing_year', 'listing_month', 'listing_day', 'total_days']
y_condition = train_data.loc[train_data.condition.isnull()==False,'condition'].values
X_condition = train_data.loc[train_data.condition.isnull()==False, condition_features].values

In [35]:
from sklearn.ensemble import GradientBoostingClassifier

condition_clf = GradientBoostingClassifier(random_state=13)
condition_clf.fit(X_condition, y_condition)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=13, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [63]:
train_condition_predicted = condition_clf.predict(train_data.loc[train_data.condition.isnull(), condition_features].values)

In [64]:
test_condition_predicted = condition_clf.predict(test_data.loc[test_data.condition.isnull(), condition_features].values)

In [82]:
train_data.loc[train_data.condition.isnull(),'condition'] = train_condition_predicted
test_data.loc[test_data.condition.isnull(),'condition'] = test_condition_predicted

In [122]:
features = ['condition', 'length(cm)', 'height(cm)','rec_area', 'X1', 'X2', 'color_type_encoded',
           'issue_year', 'issue_month', 'issue_day', 'listing_year', 'listing_month', 'listing_day', 'total_days']


X = train_data[features].values
y_pet = train_data['pet_category'].values
y_breed = train_data['breed_category']


X_train = X[:int(np.round(X.shape[0]*0.5+1))]
X_valid = X[int(np.round(X.shape[0]*0.5+1)):]

y_pet_train = y_pet[:int(np.round(y_pet.shape[0]*0.5+1))]
y_pet_valid = y_pet[int(np.round(y_pet.shape[0]*0.5+1)):]

y_breed_train = y_breed[:int(np.round(y_breed.shape[0]*0.5+1))]
y_breed_valid = y_breed[int(np.round(y_breed.shape[0]*0.5+1)):]

assert ((X_train.shape[0]+X_valid.shape[0])==X.shape[0])&\
((y_pet_train.shape[0]+y_pet_valid.shape[0])==y_pet.shape[0])&\
((y_breed_train.shape[0]+y_breed_valid.shape[0])==y_breed.shape[0]),\
"Error in train validation split"

In [123]:
pet_gbt = GradientBoostingClassifier(random_state=13)
pet_gbt.fit(X_train, y_pet_train).score(X_valid, y_pet_valid)

#0.8690918746680828

0.8642735768903993

In [124]:
breed_gbt = GradientBoostingClassifier(random_state=13)
breed_gbt.fit(X_train, y_breed_train).score(X_valid, y_breed_valid)

#0.834306956983537

0.8297578589634664

In [111]:
from sklearn.ensemble import BaggingClassifier

pet_bag = BaggingClassifier(random_state=13)
breed_bag = BaggingClassifier(random_state=13)

In [125]:
pet_bag.fit(X_train, y_pet_train).score(X_valid, y_pet_valid)

#0.8300584174190122

0.8053313508920985

In [126]:
breed_bag.fit(X_train, y_breed_train).score(X_valid, y_breed_valid)

#0.8228890069038768

0.824660152931181

In [115]:
from sklearn.ensemble import ExtraTreesClassifier

pet_extra = ExtraTreesClassifier(random_state=13)
breed_extra = ExtraTreesClassifier(random_state=13)

In [127]:
pet_extra.fit(X_train, y_pet_train).score(X_valid, y_pet_valid)

#0.7198619224641529

0.6983857264231096

In [128]:
breed_extra.fit(X_train, y_breed_train).score(X_valid, y_breed_valid)

#0.7944768985661179

0.8046941376380629

In [119]:
from sklearn.ensemble import RandomForestClassifier

pet_rf = RandomForestClassifier(random_state=13,n_jobs=4)
breed_rf = RandomForestClassifier(random_state=13, n_jobs=4)

In [129]:
pet_rf.fit(X_train, y_pet_train).score(X_valid, y_pet_valid)

#0.8204992033988316

0.8196686491079015

In [130]:
breed_rf.fit(X_train, y_breed_train).score(X_valid, y_breed_valid)

#0.8263409453000531

0.830607476635514

In [133]:
pet_final = np.column_stack([pet_gbt.predict(X_valid),
                           pet_bag.predict(X_valid),
                           pet_extra.predict(X_valid),
                           pet_rf.predict(X_valid)])

In [136]:
from sklearn.svm import SVC

pet_final_model = SVC()
pet_final_model.fit(pet_final, y_pet_valid)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [142]:
pet_preds = np.column_stack([pet_gbt.predict(test_data[features].values),
                           pet_bag.predict(test_data[features].values),
                           pet_extra.predict(test_data[features].values),
                           pet_rf.predict(test_data[features].values)])
y_pet = final_model.predict(pet_preds)

In [154]:
breed_final = np.column_stack([breed_gbt.predict(X_valid),
                           breed_bag.predict(X_valid),
                           breed_extra.predict(X_valid),
                           breed_rf.predict(X_valid)])

In [155]:
breed_final_model = SVC()
breed_final_model.fit(breed_final, y_breed_valid)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [156]:
breed_preds = np.column_stack([breed_gbt.predict(test_data[features].values),
                           breed_bag.predict(test_data[features].values),
                           breed_extra.predict(test_data[features].values),
                           breed_rf.predict(test_data[features].values)])
y_breed = final_model.predict(pet_preds)

In [157]:
breed_preds

array([[2., 0., 0., 0.],
       [1., 1., 1., 1.],
       [0., 0., 0., 0.],
       ...,
       [0., 1., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [158]:
y_breed

array([1, 1, 1, ..., 2, 2, 2], dtype=int64)

In [159]:
stacked = pd.DataFrame({'pet_id':test_data.index, 'breed_category':y_breed, 'pet_category':y_pet})

In [160]:
stacked = stacked.set_index('pet_id', drop=True)

In [161]:
stacked.to_csv("stacked.csv")

In [162]:
s = pd.read_csv('stacked.csv')

In [163]:
s

Unnamed: 0,pet_id,breed_category,pet_category
0,ANSL_61962,1,1
1,ANSL_50526,1,1
2,ANSL_63393,1,1
3,ANSL_71049,2,2
4,ANSL_51808,2,2
...,...,...,...
8067,ANSL_76504,2,2
8068,ANSL_76510,2,2
8069,ANSL_76498,2,2
8070,ANSL_76497,2,2
