In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

train_data = pd.read_csv('Dataset/train.csv', index_col='pet_id')
test_data = pd.read_csv('Dataset/test.csv', index_col='pet_id')

from sklearn.preprocessing import LabelEncoder

# Integer-encode color_type. The encoder is fitted on the training frame only,
# so transform() raises on a test label that was not seen during fit.
encoder = LabelEncoder()
train_data['color_type_encoded'] = encoder.fit_transform(train_data.color_type)
test_data['color_type_encoded'] = encoder.transform(test_data.color_type)

# --- Impute missing `condition` by random sampling -------------------------
np.random.seed(42)

# Sampling weights for condition values 0, 1, 2.
# NOTE(review): presumably the observed frequencies of the non-missing
# condition values in each file -- confirm against the data.
TRAIN_CONDITION_P = [6281/17357, 6819/17357, 4257/17357]
TEST_CONDITION_P = [2685/7453, 2928/7453, 1840/7453]

train_missing = train_data.condition.isnull()
test_missing = test_data.condition.isnull()

# The sample size is derived from the null mask instead of being hard-coded,
# so the assignment below cannot fail on a length mismatch.
# Draw order (train first, then test) is kept so the seeded values are stable.
train_condition = np.random.choice(np.arange(0, 3), p=TRAIN_CONDITION_P, size=int(train_missing.sum()))
test_condition = np.random.choice(np.arange(0, 3), p=TEST_CONDITION_P, size=int(test_missing.sum()))

# Flag imputed rows before overwriting the missing values.
train_data['condition_filled'] = 0
test_data['condition_filled'] = 0
train_data.loc[train_missing, 'condition_filled'] = 1
test_data.loc[test_missing, 'condition_filled'] = 1

train_data.loc[train_missing, 'condition'] = train_condition
test_data.loc[test_missing, 'condition'] = test_condition


# --- Derived date and size features ----------------------------------------
def _add_derived_features(frame):
    """Add date-part, total_days, length(cm) and rec_area columns to `frame` in place."""
    frame['issue_date'] = pd.to_datetime(frame.issue_date)
    frame['listing_date'] = pd.to_datetime(frame.listing_date)

    # Year / month / day parts, issue_* columns first, then listing_*.
    for prefix in ('issue', 'listing'):
        stamps = frame[prefix + '_date'].dt
        frame[prefix + '_year'] = stamps.year
        frame[prefix + '_month'] = stamps.month
        frame[prefix + '_day'] = stamps.day

    # listing_date - issue_date converted to day units and stored as int64.
    frame['total_days'] = np.array(frame.listing_date - frame.issue_date).astype('timedelta64[D]').astype('int64')

    # Length converted from metres to centimetres, then multiplied by height.
    frame['length(cm)'] = frame['length(m)'] * 100
    frame['rec_area'] = frame['length(cm)'] * frame['height(cm)']


_add_derived_features(train_data)
_add_derived_features(test_data)

In [18]:
# Base feature columns shared by the pet_category and breed_category models.
features = [
    'condition', 'length(cm)', 'height(cm)', 'rec_area', 'X1', 'X2', 'color_type_encoded',
    'issue_year', 'issue_month', 'issue_day',
    'listing_year', 'listing_month', 'listing_day',
    'total_days',
]

# Feature matrix and the two target vectors as plain numpy arrays.
X = np.array(train_data[features])
y_pet = np.array(train_data.pet_category)
y_breed = np.array(train_data.breed_category)

In [19]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split for the pet_category target. No random_state is passed,
# so the shuffle draws from numpy's global random state.
X_pet_train, X_pet_valid, y_pet_train, y_pet_valid = train_test_split(
    X,
    y_pet,
    test_size=0.20,
)

In [20]:
from  sklearn.ensemble import GradientBoostingClassifier

pet_gbt = GradientBoostingClassifier(random_state=13)

In [21]:
pet_gbt.fit(X_pet_train, y_pet_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=13, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [22]:
pet_gbt.score(X_pet_valid, y_pet_valid)

0.8800106185293337

In [23]:
pet_gbt.fit(X, y_pet)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=13, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [24]:
X_breed_train, X_breed_valid, y_breed_train, y_breed_valid = train_test_split(np.array(train_data[features]), y_breed, test_size=0.20)

In [25]:
breed_gbt = GradientBoostingClassifier(random_state=13)
breed_gbt.fit(X_breed_train, y_breed_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=13, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [26]:
breed_gbt.score(X_breed_valid, y_breed_valid)

0.8778869126625962

In [27]:
breed_gbt.fit(X, y_breed)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=13, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [28]:
X_test = np.array(test_data[features])

In [29]:
pet_gbt_pred = pet_gbt.predict(X_test)
breed_gbt_pred = breed_gbt.predict(X_test)

In [None]:
gbt_gbt_preds = pd.DataFrame({'pet_id':test_data.index, 'breed_category':breed_gbt_pred, 'pet_category':pet_gbt_pred})

In [None]:
gbt_gbt_preds = gbt_gbt_preds.set_index('pet_id', drop=True)

In [None]:
gbt_gbt_preds.to_csv("gbt_gbt_preds.csv")

In [None]:
# Base features extended with the other task's label as an extra input column.
breed_features = [*features, 'pet_category']
pet_features = [*features, 'breed_category']

# One untrained estimator per direction of the pet <-> breed dependency.
breed_from_pet_gbt = GradientBoostingClassifier(random_state=13)
pet_from_breed = GradientBoostingClassifier(random_state=13)

In [None]:
features

In [None]:
breed_features

In [None]:
from  sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

scores = []
def fit_models():
    """Hold-out score, then refit on all rows, each stage of the pet/breed chain.

    Four stages are run in order, each with its own fresh 80/20 split of
    ``train_data`` and its own ``GradientBoostingClassifier(random_state=13)``:

    1. ``pet_gbt``      -- base features                   -> pet_category
    2. ``breed_gbt``    -- base features + pet_category    -> breed_category
    3. ``pet_gbt_2``    -- base features + breed_category  -> pet_category
    4. ``breed_gbt_2``  -- base features + pet_category    -> breed_category

    For every stage the validation score is appended to the module-level
    ``scores`` list, after which the stage's model is refitted on every row.

    Returns None; the fitted models are local to the call.
    """
    features = ['condition', 'length(cm)', 'height(cm)','rec_area', 'X1', 'X2', 'color_type_encoded',
                'issue_year', 'issue_month', 'issue_day', 'listing_year', 'listing_month', 'listing_day', 'total_days']
    y_pet = np.array(train_data['pet_category'])
    y_breed = np.array(train_data['breed_category'])

    # All models are fed plain ndarrays so fit/score inputs have a consistent type.
    X_base = np.array(train_data[features])
    X_with_pet = np.array(train_data[features+['pet_category']])
    X_with_breed = np.array(train_data[features+['breed_category']])

    # Stage 1: pet_category from the base features.
    X_pet_train, X_pet_valid, y_pet_train, y_pet_valid = train_test_split(X_base, y_pet, test_size=0.20)
    pet_gbt = GradientBoostingClassifier(random_state=13)
    pet_gbt.fit(X_pet_train, y_pet_train)
    scores.append(pet_gbt.score(X_pet_valid, y_pet_valid))
    pet_gbt.fit(X_base, y_pet)

    # Stage 2: breed_category from the base features plus the true pet_category.
    X_breed_train, X_breed_valid, y_breed_train, y_breed_valid = train_test_split(X_with_pet, y_breed, test_size=0.20)
    breed_gbt = GradientBoostingClassifier(random_state=13)
    breed_gbt.fit(X_breed_train, y_breed_train)
    scores.append(breed_gbt.score(X_breed_valid, y_breed_valid))
    breed_gbt.fit(X_with_pet, y_breed)

    # Stage 3: pet_category from the base features plus the true breed_category.
    X_pet_train, X_pet_valid, y_pet_train, y_pet_valid = train_test_split(X_with_breed, y_pet, test_size=0.20)
    pet_gbt_2 = GradientBoostingClassifier(random_state=13)
    pet_gbt_2.fit(X_pet_train, y_pet_train)
    scores.append(pet_gbt_2.score(X_pet_valid, y_pet_valid))
    pet_gbt_2.fit(X_with_breed, y_pet)

    # Stage 4: breed_category again, on a new split.
    X_breed_train, X_breed_valid, y_breed_train, y_breed_valid = train_test_split(X_with_pet, y_breed, test_size=0.20)
    breed_gbt_2 = GradientBoostingClassifier(random_state=13)
    breed_gbt_2.fit(X_breed_train, y_breed_train)
    scores.append(breed_gbt_2.score(X_breed_valid, y_breed_valid))
    # Refit the stage-4 model itself (previously breed_gbt was refitted here by mistake).
    breed_gbt_2.fit(X_with_pet, y_breed)

In [None]:
fit_models()

In [37]:
from  sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split


features = ['condition', 'length(cm)', 'height(cm)','rec_area', 'X1', 'X2', 'color_type_encoded',
            'issue_year', 'issue_month', 'issue_day', 'listing_year', 'listing_month', 'listing_day', 'total_days']
y_pet = np.array(train_data['pet_category'])
y_breed = np.array(train_data['breed_category'])
class models:
    """Chain of four gradient-boosted classifiers for pet_category and breed_category.

    Stage order (see ``_stages``):

    1. ``pet_gbt``      -- base features                   -> pet_category
    2. ``breed_gbt``    -- base features + pet_category    -> breed_category
    3. ``pet_gbt_2``    -- base features + breed_category  -> pet_category
    4. ``breed_gbt_2``  -- base features + pet_category    -> breed_category

    The class reads the module-level ``train_data``, ``test_data``,
    ``features``, ``y_pet`` and ``y_breed``. ``predict`` writes the
    ``pet_category`` and ``breed_category`` columns into ``test_data``.
    """

    def __init__(self):
        self.pet_gbt = GradientBoostingClassifier(random_state=13)
        self.breed_gbt = GradientBoostingClassifier(random_state=13)
        self.pet_gbt_2 = GradientBoostingClassifier(random_state=13)
        self.breed_gbt_2 = GradientBoostingClassifier(random_state=13)

    def _stages(self):
        """Return ``(estimator, input_columns, target_column)`` for each stage, in order."""
        return (
            (self.pet_gbt, features, 'pet_category'),
            (self.breed_gbt, features+['pet_category'], 'breed_category'),
            (self.pet_gbt_2, features+['breed_category'], 'pet_category'),
            (self.breed_gbt_2, features+['pet_category'], 'breed_category'),
        )

    def evaluate_combination(self):
        """Fit every stage on a fresh 80/20 split of ``train_data`` and score it.

        Each stage draws its own split. The label used as an extra input column
        is the true label from ``train_data``, not an upstream prediction.

        Returns:
            list: four validation scores, in stage order.
        """
        targets = {'pet_category': y_pet, 'breed_category': y_breed}
        scores = []
        for estimator, columns, target_column in self._stages():
            X_train, X_valid, y_train, y_valid = train_test_split(
                np.array(train_data[columns]), targets[target_column], test_size=0.20)
            estimator.fit(X_train, y_train)
            scores.append(estimator.score(X_valid, y_valid))
        return scores

    def fit_model(self):
        """Fit every stage on all rows of ``train_data``."""
        targets = {'pet_category': y_pet, 'breed_category': y_breed}
        for estimator, columns, target_column in self._stages():
            estimator.fit(np.array(train_data[columns]), targets[target_column])

    def predict(self, n_iterations=5):
        """Run the stage chain over ``test_data`` ``n_iterations`` times.

        Each stage predicts its target from ``test_data``, stores the prediction
        in the matching ``test_data`` column, and is then refitted on the test
        rows using that prediction as the label.

        NOTE(review): the refit replaces whatever the estimator learned before,
        so from the second iteration on each stage only reproduces labels it
        produced itself. Confirm this self-training step is intended.

        Args:
            n_iterations: number of passes over the four stages (default 5).
        """
        for i in range(n_iterations):
            for estimator, columns, target_column in self._stages():
                # Same ndarray for predict and fit: the stage's input columns
                # never include its own target, and feeding arrays throughout
                # avoids mixing named and unnamed feature inputs.
                X_stage = np.array(test_data[columns])
                predicted = estimator.predict(X_stage)
                test_data[target_column] = predicted
                estimator.fit(X_stage, predicted)

            print("Iteration:", i)

    def prepare_submission(self, n):
        """Write the current ``test_data`` predictions to ``submission<n>.csv``.

        The file is indexed by ``pet_id`` and has the columns
        ``breed_category`` and ``pet_category``; ``predict`` must have
        populated those columns in ``test_data`` beforehand.

        Args:
            n: suffix inserted into the output file name.
        """
        breeds = test_data.breed_category.values
        pets = test_data.pet_category.values
        submission_dataframe = pd.DataFrame({'pet_id':test_data.index, 'breed_category':breeds, 'pet_category':pets})
        submission_dataframe = submission_dataframe.set_index('pet_id', drop=True)
        submission_dataframe.to_csv("submission"+str(n)+".csv")
 

In [38]:
model = models()

In [30]:
model.evaluate_combination()

[0.8938147066631271,
 0.8829307140960977,
 0.8999203610299974,
 0.8853198831961774]

In [39]:
model.fit_model()

In [40]:
model.predict()

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4


In [41]:
model.prepare_submission(10)

In [None]:
np.array(train_data[features+['breed_category']])

In [None]:
train_data[features+['breed_category']]

In [42]:
tmp = pd.read_csv("submission10.csv")

In [43]:
tmp.head()

Unnamed: 0,pet_id,breed_category,pet_category
0,ANSL_75005,1.0,2
1,ANSL_76663,0.0,1
2,ANSL_58259,0.0,2
3,ANSL_67171,0.0,2
4,ANSL_72871,0.0,2


In [None]:
test_data.head()

In [None]:
test_data.index

In [44]:
train_data['prod']=train_data.pet_category*train_data.breed_category

In [45]:
# Planned two-stage scheme built around the 'prod' column
# (pet_category * breed_category):
#   1. predict pet1
#   2. predict breed1
#   3. calculate pet1 * breed1
#   4. use pet1 * breed1 and breed1 to predict pet2
#   5. use pet2 * breed1 to predict breed2
train_data.head()


Unnamed: 0_level_0,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,...,issue_year,issue_month,issue_day,listing_year,listing_month,listing_day,total_days,length(cm),rec_area,prod
pet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ANSL_69903,2016-07-10,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1,...,2016,7,10,2016,9,21,73,80.0,622.4,0.0
ANSL_66892,2013-11-21,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2,...,2013,11,21,2018,12,27,1862,72.0,1021.68,0.0
ANSL_69750,2014-09-28,2016-10-19 08:24:00,1.0,Brown,0.15,40.9,15,4,2.0,4,...,2014,9,28,2016,10,19,752,15.0,613.5,8.0
ANSL_71623,2016-12-31,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2,...,2016,12,31,2019,1,25,755,62.0,1104.84,0.0
ANSL_57969,2017-09-28,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1,...,2017,9,28,2017,11,19,52,50.0,553.0,0.0


In [47]:
# Five independent, identically configured estimators, one per planned stage.
pet1, breed1, prod, pet2, breed2 = (
    GradientBoostingClassifier(random_state=13) for _ in range(5)
)

In [51]:
# Base feature columns (the doubled `features = features =` assignment was a typo).
features = ['condition', 'length(cm)', 'height(cm)','rec_area', 'X1', 'X2', 'color_type_encoded',
            'issue_year', 'issue_month', 'issue_day', 'listing_year', 'listing_month', 'listing_day', 'total_days']

# Target vectors as numpy arrays.
y_pet = train_data['pet_category'].values
y_breed = train_data['breed_category'].values

In [52]:
X_p1_train, X_p1_valid, y_p1_train, y_p1_valid = train_test_split(train_data[features].values, 
                                                                              y_pet, test_size=0.20)

In [None]:
# Validation scores collected in the order the models are evaluated.
scores = []

# pet1: fitted and scored on the pet_category split prepared in the previous cell.
pet1.fit(X_p1_train, y_p1_train)
scores.append(pet1.score(X_p1_valid, y_p1_valid))

# breed1: its own 80/20 split over the same base features.
X_b1_train, X_b1_valid, y_b1_train, y_b1_valid = train_test_split(
    train_data[features].values,
    y_breed,
    test_size=0.20,
)
breed1.fit(X_b1_train, y_b1_train)
scores.append(breed1.score(X_b1_valid, y_b1_valid))

In [56]:
y_prod = train_data['prod'].values

In [57]:
# prod: predicts the pet_category * breed_category product from the base features.
X_prod_train, X_prod_valid, y_prod_train, y_prod_valid = train_test_split(
    train_data[features].values,
    y_prod,
    test_size=0.2,
)
prod.fit(X_prod_train, y_prod_train)
scores.append(prod.score(X_prod_valid, y_prod_valid))

In [58]:
scores

[0.8823997876294133, 0.8733740376957791, 0.8378019644279268]