In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

train_data = pd.read_csv('Dataset/train.csv', index_col='pet_id')
test_data = pd.read_csv('Dataset/test.csv', index_col='pet_id')



from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

train_data['color_type_encoded'] = encoder.fit_transform(train_data.color_type)
test_data['color_type_encoded'] = encoder.transform(test_data.color_type)


np.random.seed(42)

train_condition=np.random.choice(np.arange(0, 3), p=[6281/17357,6819/17357,4257/17357], size=1477)
test_condition = np.random.choice(np.arange(0,3), p=[2685/7453,2928/7453,1840/7453], size=619)

train_data['condition_filled'] = 0
test_data['condition_filled'] = 0

train_data.loc[train_data.condition.isnull(), 'condition_filled'] = 1
test_data.loc[test_data.condition.isnull(), 'condition_filled'] = 1

train_data.loc[train_data.condition.isnull(), 'condition'] = train_condition
test_data.loc[test_data.condition.isnull(), 'condition'] = test_condition

train_data['issue_date'] = pd.to_datetime(train_data.issue_date)
train_data['listing_date'] = pd.to_datetime(train_data.listing_date)
                                            
test_data['issue_date'] = pd.to_datetime(test_data.issue_date)
test_data['listing_date'] = pd.to_datetime(test_data.listing_date)

train_data['issue_year'] = train_data.issue_date.dt.year
train_data['issue_month'] = train_data.issue_date.dt.month
train_data['issue_day'] = train_data.issue_date.dt.day

train_data['listing_year'] = train_data.listing_date.dt.year
train_data['listing_month'] = train_data.listing_date.dt.month
train_data['listing_day'] = train_data.listing_date.dt.day

test_data['issue_year'] = test_data.issue_date.dt.year
test_data['issue_month'] = test_data.issue_date.dt.month
test_data['issue_day'] = test_data.issue_date.dt.day

test_data['listing_year'] = test_data.listing_date.dt.year
test_data['listing_month'] = test_data.listing_date.dt.month
test_data['listing_day'] = test_data.listing_date.dt.day

train_data['total_days'] = np.array(train_data.listing_date-train_data.issue_date).astype('timedelta64[D]').astype('int64')
test_data['total_days'] = np.array(test_data.listing_date-test_data.issue_date).astype('timedelta64[D]').astype('int64')

train_data['length(cm)'] = train_data['length(m)']*100
test_data['length(cm)'] = test_data['length(m)']*100

train_data['rec_area'] = train_data['length(cm)'] * train_data['height(cm)']
test_data['rec_area'] = test_data['length(cm)'] * test_data['height(cm)']

In [2]:
train_data.head()

Unnamed: 0_level_0,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,...,condition_filled,issue_year,issue_month,issue_day,listing_year,listing_month,listing_day,total_days,length(cm),rec_area
pet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ANSL_69903,2016-07-10,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1,...,0,2016,7,10,2016,9,21,73,80.0,622.4
ANSL_66892,2013-11-21,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2,...,0,2013,11,21,2018,12,27,1862,72.0,1021.68
ANSL_69750,2014-09-28,2016-10-19 08:24:00,1.0,Brown,0.15,40.9,15,4,2.0,4,...,1,2014,9,28,2016,10,19,752,15.0,613.5
ANSL_71623,2016-12-31,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2,...,0,2016,12,31,2019,1,25,755,62.0,1104.84
ANSL_57969,2017-09-28,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1,...,0,2017,9,28,2017,11,19,52,50.0,553.0


In [3]:
train_data['X1X2'] = train_data.X1*train_data.X2

In [4]:
train_data['X1mX2'] = train_data.X1-train_data.X2

In [5]:
train_data.head()

Unnamed: 0_level_0,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,...,issue_month,issue_day,listing_year,listing_month,listing_day,total_days,length(cm),rec_area,X1X2,X1mX2
pet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ANSL_69903,2016-07-10,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1,...,7,10,2016,9,21,73,80.0,622.4,117,4
ANSL_66892,2013-11-21,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2,...,11,21,2018,12,27,1862,72.0,1021.68,117,4
ANSL_69750,2014-09-28,2016-10-19 08:24:00,1.0,Brown,0.15,40.9,15,4,2.0,4,...,9,28,2016,10,19,752,15.0,613.5,60,11
ANSL_71623,2016-12-31,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2,...,12,31,2019,1,25,755,62.0,1104.84,0,-1
ANSL_57969,2017-09-28,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1,...,9,28,2017,11,19,52,50.0,553.0,72,14


In [9]:
train_data.X2.unique()

array([9, 4, 1, 7, 6, 2, 8, 3, 5, 0], dtype=int64)

In [10]:
train_data['X1C'] = train_data.X1*train_data.condition
train_data['X2C'] = train_data.X2*train_data.condition

In [11]:
train_data.head()

Unnamed: 0_level_0,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,...,listing_year,listing_month,listing_day,total_days,length(cm),rec_area,X1X2,X1mX2,X1C,X2C
pet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ANSL_69903,2016-07-10,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1,...,2016,9,21,73,80.0,622.4,117,4,26.0,18.0
ANSL_66892,2013-11-21,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2,...,2018,12,27,1862,72.0,1021.68,117,4,13.0,9.0
ANSL_69750,2014-09-28,2016-10-19 08:24:00,1.0,Brown,0.15,40.9,15,4,2.0,4,...,2016,10,19,752,15.0,613.5,60,11,15.0,4.0
ANSL_71623,2016-12-31,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2,...,2019,1,25,755,62.0,1104.84,0,-1,0.0,1.0
ANSL_57969,2017-09-28,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1,...,2017,11,19,52,50.0,553.0,72,14,36.0,8.0


In [12]:
train_data['X1mX2C'] = train_data.X1mX2*train_data.condition

In [13]:
train_data.head()

Unnamed: 0_level_0,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,...,listing_month,listing_day,total_days,length(cm),rec_area,X1X2,X1mX2,X1C,X2C,X1mX2C
pet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ANSL_69903,2016-07-10,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1,...,9,21,73,80.0,622.4,117,4,26.0,18.0,8.0
ANSL_66892,2013-11-21,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2,...,12,27,1862,72.0,1021.68,117,4,13.0,9.0,4.0
ANSL_69750,2014-09-28,2016-10-19 08:24:00,1.0,Brown,0.15,40.9,15,4,2.0,4,...,10,19,752,15.0,613.5,60,11,15.0,4.0,11.0
ANSL_71623,2016-12-31,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2,...,1,25,755,62.0,1104.84,0,-1,0.0,1.0,-1.0
ANSL_57969,2017-09-28,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1,...,11,19,52,50.0,553.0,72,14,36.0,8.0,28.0


In [14]:
train_data['CRec'] = train_data.condition*train_data.rec_area

In [15]:
features = ['condition', 'length(cm)', 'height(cm)','rec_area', 'X1', 'X2', 'color_type_encoded',
           'issue_year', 'issue_month', 'issue_day', 'listing_year', 'listing_month', 'listing_day', 'total_days', 'CRec',
           'X1mX2C', 'X2C', 'X1C', 'X1mX2', 'X1X2']
X = train_data[features].values
y_pet = np.array(train_data['pet_category'])
y_breed = np.array(train_data['breed_category'])

In [16]:
from sklearn.model_selection import train_test_split

X_pet_train, X_pet_valid, y_pet_train, y_pet_valid = train_test_split(X, y_pet, test_size=0.20)

In [17]:
from  sklearn.ensemble import GradientBoostingClassifier

pet_gbt = GradientBoostingClassifier(random_state=13)

In [18]:
pet_gbt.fit(X_pet_train, y_pet_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=13, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [19]:
pet_gbt.score(X_pet_valid, y_pet_valid)

0.8813379346960446