In [1]:
#some imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

# Load the training and test splits from disk; rows in both frames are
# indexed by the `pet_id` column.
train_path = 'Dataset/train.csv'
test_path = 'Dataset/test.csv'
train_data = pd.read_csv(train_path, index_col='pet_id')
test_data = pd.read_csv(test_path, index_col='pet_id')

In [2]:
from sklearn.preprocessing import LabelEncoder
# Fit the label encoder on the training colours only, then apply that same
# fitted mapping to both splits so the integer codes agree between them.
encoder = LabelEncoder()
encoder.fit(train_data['color_type'])

train_data['color_type_encoded'] = encoder.transform(train_data['color_type'])
test_data['color_type_encoded'] = encoder.transform(test_data['color_type'])

# Draw random condition labels (0, 1, 2) with hard-coded class proportions and
# hard-coded sample counts for each split.
# NOTE(review): neither array is written into the frames in this cell -- the
# missing conditions below are replaced by a sentinel instead. Presumably the
# counts/proportions were read off the observed data; confirm before reuse.
condition_labels = np.arange(0, 3)
train_condition = np.random.choice(condition_labels, size=1477,
                                   p=[6281/17357, 6819/17357, 4257/17357])
test_condition = np.random.choice(condition_labels, size=619,
                                  p=[2685/7453, 2928/7453, 1840/7453])

for frame in (train_data, test_data):
    # Capture the missing-value mask before `condition` is overwritten.
    condition_missing = frame['condition'].isnull()

    # Indicator column: 1 where `condition` was originally NaN, 0 otherwise.
    frame['condition_filled'] = 0
    frame.loc[condition_missing, 'condition_filled'] = 1

    # Replace the missing conditions with the sentinel value -99.
    frame.loc[condition_missing, 'condition'] = -99

# Date feature engineering, applied identically to the train and test frames.
for frame in (train_data, test_data):
    # Parse both date columns into pandas datetimes.
    for date_col in ('issue_date', 'listing_date'):
        frame[date_col] = pd.to_datetime(frame[date_col])

    # Break each date into separate year / month / day columns.
    # Each entry is (new column, source date column, datetime component).
    date_parts = (
        ('issue_year', 'issue_date', 'year'),
        ('issue_month', 'issue_date', 'month'),
        ('issue_day', 'issue_date', 'day'),
        ('listing_year', 'listing_date', 'year'),
        ('listing_month', 'listing_date', 'month'),
        ('listing_day', 'listing_date', 'day'),
    )
    for new_col, source_col, part in date_parts:
        frame[new_col] = getattr(frame[source_col].dt, part)

    # Gap between listing and issue dates, converted to day resolution and
    # stored as a plain int64 count.
    elapsed = frame['listing_date'] - frame['issue_date']
    frame['total_days'] = np.array(elapsed).astype('timedelta64[D]').astype('int64')

# Size features, applied identically to the train and test frames.
for frame in (train_data, test_data):
    # `length(m)` is in metres; multiply by 100 to express it in centimetres,
    # the same unit as `height(cm)`.
    frame['length(cm)'] = frame['length(m)'] * 100

    # Area of the rectangle spanned by the length and the height (both in cm).
    frame['rec_area'] = frame['length(cm)'] * frame['height(cm)']

In [3]:
# Columns fed to the models, in the order they appear in the feature matrix.
features = [
    # condition, size and anonymised columns, plus the encoded colour
    'condition', 'length(cm)', 'height(cm)', 'rec_area', 'X1', 'X2',
    'color_type_encoded',
    # calendar parts of the issue and listing dates, and the gap between them
    'issue_year', 'issue_month', 'issue_day',
    'listing_year', 'listing_month', 'listing_day',
    'total_days',
]

# Feature matrix and the pet-category target as NumPy arrays.
X = train_data.loc[:, features].values
y_pet = train_data.loc[:, 'pet_category'].values
# The breed-category target is kept as a pandas Series (no `.values`).
y_breed = train_data.loc[:, 'breed_category']



In [27]:
def _validation_start(n_rows):
    """Return the position where the validation slice starts: round(0.8 * n_rows + 1)."""
    return int(np.round(n_rows * 0.8 + 1))


# Ordered (unshuffled) hold-out split: the leading rows train, the rest validate.
x_cut = _validation_start(X.shape[0])
X_train, X_valid = X[:x_cut], X[x_cut:]

pet_cut = _validation_start(y_pet.shape[0])
y_pet_train, y_pet_valid = y_pet[:pet_cut], y_pet[pet_cut:]

breed_cut = _validation_start(y_breed.shape[0])
y_breed_train, y_breed_valid = y_breed[:breed_cut], y_breed[breed_cut:]

# Sanity check: each train/validation pair must add back up to its source.
assert all(
    head.shape[0] + tail.shape[0] == whole.shape[0]
    for head, tail, whole in (
        (X_train, X_valid, X),
        (y_pet_train, y_pet_valid, y_pet),
        (y_breed_train, y_breed_valid, y_breed),
    )
), "Error in train validation split"

In [24]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, \
GradientBoostingClassifier, RandomForestClassifier, StackingClassifier

# One unfitted ensemble of each type for the pet-category task, all created
# with the same random_state (13).
pet_bagg, pet_extra, pet_gbr, pet_rfr = [
    ensemble_cls(random_state=13)
    for ensemble_cls in (BaggingClassifier, ExtraTreesClassifier,
                         GradientBoostingClassifier, RandomForestClassifier)
]

In [25]:
# (label, estimator) pairs passed to StackingClassifier as `estimators`.
# Note that the random forest is registered under the label 'pet'.
estimators = list(zip(
    ('bagg', 'extra', 'gbr', 'pet'),
    (pet_bagg, pet_extra, pet_gbr, pet_rfr),
))

In [30]:
# Stack the estimators listed in `estimators` and train the stack on the
# pet-category target.
# NOTE(review): despite the `breed_` prefix, this model is fitted and scored
# on `y_pet_*`; the name is kept because other cells refer to `breed_stack`.
breed_stack = StackingClassifier(
    estimators=estimators,
    n_jobs=4,
    verbose=1,
)
breed_stack.fit(X_train, y_pet_train)

# Score on the validation slice (shown as the cell output).
breed_stack.score(X_valid, y_pet_valid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8993627190653213

In [31]:
# Fresh, unfitted base learners for the breed-category task, all created with
# random_state=13.
breed_bagg = BaggingClassifier(random_state=13)
breed_extra = ExtraTreesClassifier(random_state=13)
breed_gbr = GradientBoostingClassifier(random_state=13)
breed_rfr = RandomForestClassifier(random_state=13)

# (label, estimator) pairs for the stack; this rebinds `estimators`.
# The random forest keeps the label 'pet' used by the earlier estimators list.
estimators = [
    ('bagg', breed_bagg),
    ('extra', breed_extra),
    ('gbr', breed_gbr),
    ('pet', breed_rfr)
]

# Bind the new stack to `breed_stack` (not to a differently spelled name) so
# that the fit below trains the breed estimators defined in this cell rather
# than re-fitting a stack object created in an earlier cell.
breed_stack = StackingClassifier(estimators=estimators,
                           n_jobs=4,
                           verbose=1)

# Train on the breed-category target and report the validation score.
breed_stack.fit(X_train, y_breed_train).score(X_valid, y_breed_valid)

0.9004248539564524

In [38]:
# Boost the stacked model: AdaBoost uses `breed_stack` as its base learner.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` -- confirm against the installed version before changing it.
breed_ada = AdaBoostClassifier(
    base_estimator=breed_stack,
    random_state=13,
)

In [39]:
# Train the boosted model on the breed-category target ...
breed_ada.fit(X_train, y_breed_train)
# ... and report its score on the validation slice (shown as the cell output).
breed_ada.score(X_valid, y_breed_valid)

0.817578332448221

In [8]:
# Fit each pet-category base learner on its own (outside any stack).
for pet_model in (pet_bagg, pet_extra, pet_gbr):
    pet_model.fit(X_train, y_pet_train)

# The random forest is fitted last as a bare expression so that its repr is
# still what the cell displays.
# NOTE(review): the saved output under this cell shows a RandomForestRegressor
# repr, which does not match the RandomForestClassifier assigned to `pet_rfr`;
# the output looks stale -- re-run to confirm.
pet_rfr.fit(X_train, y_pet_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=13, verbose=0, warm_start=False)