In [2181]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')


import category_encoders as ce

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [2108]:
train = pd.read_csv('../data/dbs/train.csv')
test = pd.read_csv('../data/dbs/test.csv')

In [2109]:
train.head()

Unnamed: 0,pet_sale_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_36614,2011-01-12 00:00:00,2018-02-18 15:56:00,0.0,Torbie,0.11,9.51,13,9,1,1
1,ANSL_22109,2015-07-24 00:00:00,2015-10-14 12:06:00,2.0,Brown Tabby,0.42,48.74,0,2,0,1
2,ANSL_27582,2016-12-02 00:00:00,2017-02-11 14:24:00,2.0,Black,0.9,34.45,13,9,1,1
3,ANSL_637,2015-07-05 00:00:00,2015-10-13 17:40:00,1.0,Flame Point,0.94,48.91,7,1,0,1
4,ANSL_27541,2016-01-16 00:00:00,2017-08-31 18:06:00,1.0,Yellow Brindle,0.78,17.07,0,7,0,2


## Basic Exploration

In [2110]:
print(f"The number of records in train: {train.shape}")
print(f"The number of records in test: {test.shape}")

The number of records in train: (13991, 11)
The number of records in test: (5997, 9)


In [2111]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13991 entries, 0 to 13990
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   pet_sale_id     13991 non-null  object 
 1   issue_date      13991 non-null  object 
 2   listing_date    13991 non-null  object 
 3   condition       12935 non-null  float64
 4   color_type      13991 non-null  object 
 5   length(m)       13991 non-null  float64
 6   height(cm)      13991 non-null  float64
 7   X1              13991 non-null  int64  
 8   X2              13991 non-null  int64  
 9   breed_category  13991 non-null  int64  
 10  pet_category    13991 non-null  int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 1.2+ MB


### Missing value

In [2112]:
train.isnull().sum()

pet_sale_id          0
issue_date           0
listing_date         0
condition         1056
color_type           0
length(m)            0
height(cm)           0
X1                   0
X2                   0
breed_category       0
pet_category         0
dtype: int64

There are some missing value in the 'condition' column. Need to impute with some value 

### Lets explore each column one by one

In [2113]:
train.columns

Index(['pet_sale_id', 'issue_date', 'listing_date', 'condition', 'color_type',
       'length(m)', 'height(cm)', 'X1', 'X2', 'breed_category',
       'pet_category'],
      dtype='object')

In [2114]:
train.head()

Unnamed: 0,pet_sale_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_36614,2011-01-12 00:00:00,2018-02-18 15:56:00,0.0,Torbie,0.11,9.51,13,9,1,1
1,ANSL_22109,2015-07-24 00:00:00,2015-10-14 12:06:00,2.0,Brown Tabby,0.42,48.74,0,2,0,1
2,ANSL_27582,2016-12-02 00:00:00,2017-02-11 14:24:00,2.0,Black,0.9,34.45,13,9,1,1
3,ANSL_637,2015-07-05 00:00:00,2015-10-13 17:40:00,1.0,Flame Point,0.94,48.91,7,1,0,1
4,ANSL_27541,2016-01-16 00:00:00,2017-08-31 18:06:00,1.0,Yellow Brindle,0.78,17.07,0,7,0,2


1. PET SALE ID

In [2115]:
train['pet_sale_id'].nunique()

13991

In [2116]:
train[train['pet_sale_id'] == 'ANSL_27541']

Unnamed: 0,pet_sale_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
4,ANSL_27541,2016-01-16 00:00:00,2017-08-31 18:06:00,1.0,Yellow Brindle,0.78,17.07,0,7,0,2


In [2117]:
# Lets keep this column for now as this is going to be our input column for our final prediction. 
# Note: There is a little confusion here on how just the sale id going to predict

2. Condition

In [2118]:
train['condition'].unique()

array([ 0.,  2.,  1., nan])

In [2119]:
train['condition'].value_counts()

1.0    4997
0.0    4611
2.0    3327
Name: condition, dtype: int64

In [2120]:
train['condition'].mode()

0    1.0
Name: condition, dtype: float64

Lets impute the null values with mode values of this columns

In [2121]:
value = train['condition'].mode()[0]

print(value)

train['condition'].fillna(value, inplace=True)

1.0


In [2122]:
train['condition'].value_counts()

1.0    6053
0.0    4611
2.0    3327
Name: condition, dtype: int64

In [2123]:
train.isnull().sum()

pet_sale_id       0
issue_date        0
listing_date      0
condition         0
color_type        0
length(m)         0
height(cm)        0
X1                0
X2                0
breed_category    0
pet_category      0
dtype: int64

3. Color Type

In [2124]:
train['color_type'].value_counts()

Black              3364
White              1703
Brown              1427
Brown Tabby        1180
Tan                 930
Blue                624
Orange Tabby        585
Red                 417
Brown Brindle       398
Tricolor            390
Blue Tabby          328
Tortie              281
Calico              260
Chocolate           231
Gray                225
Torbie              178
Sable               143
Yellow              134
Cream Tabby         131
Cream               116
Buff                 98
Fawn                 94
Lynx Point           89
Seal Point           74
Blue Merle           66
Black Brindle        50
Gold                 42
Black Tabby          39
Flame Point          37
Blue Tick            32
Gray Tabby           31
Black Smoke          28
Silver               26
Red Merle            25
Red Tick             25
Brown Merle          24
Orange               19
Yellow Brindle       17
Lilac Point          16
Apricot              16
Silver Tabby         14
Tortie Point    

4. length

In [2125]:
train['length(m)'].describe()

count    13991.000000
mean         0.499457
std          0.289461
min          0.000000
25%          0.250000
50%          0.500000
75%          0.750000
max          1.000000
Name: length(m), dtype: float64

5. height(cm)

In [2126]:
train['height(cm)'].describe()

count    13991.000000
mean        27.291664
std         13.048867
min          5.000000
25%         16.025000
50%         27.140000
75%         38.535000
max         50.000000
Name: height(cm), dtype: float64

6. X1

In [2127]:
sorted(train['X1'].unique())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19]

In [2128]:
train['X1'].describe()

count    13991.000000
mean         5.803016
std          6.690626
min          0.000000
25%          0.000000
50%          0.000000
75%         13.000000
max         19.000000
Name: X1, dtype: float64

7. X2

In [2129]:
sorted(train['X2'].unique())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [2130]:
train['X2'].describe()

count    13991.000000
mean         4.770281
std          3.515977
min          0.000000
25%          1.000000
50%          4.000000
75%          9.000000
max          9.000000
Name: X2, dtype: float64

## Target columns 

In [2131]:
train['breed_category'].unique()

array([1, 0, 2])

In [2132]:
train['pet_category'].unique()

array([1, 2, 4, 0, 3])

## Data Transformation for Model

In [2133]:
train.head()

Unnamed: 0,pet_sale_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_36614,2011-01-12 00:00:00,2018-02-18 15:56:00,0.0,Torbie,0.11,9.51,13,9,1,1
1,ANSL_22109,2015-07-24 00:00:00,2015-10-14 12:06:00,2.0,Brown Tabby,0.42,48.74,0,2,0,1
2,ANSL_27582,2016-12-02 00:00:00,2017-02-11 14:24:00,2.0,Black,0.9,34.45,13,9,1,1
3,ANSL_637,2015-07-05 00:00:00,2015-10-13 17:40:00,1.0,Flame Point,0.94,48.91,7,1,0,1
4,ANSL_27541,2016-01-16 00:00:00,2017-08-31 18:06:00,1.0,Yellow Brindle,0.78,17.07,0,7,0,2


In [2134]:
train.columns

Index(['pet_sale_id', 'issue_date', 'listing_date', 'condition', 'color_type',
       'length(m)', 'height(cm)', 'X1', 'X2', 'breed_category',
       'pet_category'],
      dtype='object')

In [2135]:
train_breed_category = train[['pet_sale_id', 'issue_date', 'listing_date', 'condition', 'color_type',
       'length(m)', 'height(cm)', 'X1', 'X2', 'breed_category']]

train_pet_category = train[['pet_sale_id', 'issue_date', 'listing_date', 'condition', 'color_type',
       'length(m)', 'height(cm)', 'X1', 'X2', 'pet_category']]

test_breed_category = test.copy()
test_pet_category = test.copy()

In [2136]:
test.columns

Index(['pet_sale_id', 'issue_date', 'listing_date', 'condition', 'color_type',
       'length(m)', 'height(cm)', 'X1', 'X2'],
      dtype='object')

In [2137]:
test_breed_category

Unnamed: 0,pet_sale_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2
0,ANSL_1354,2015-06-19 00:00:00,2015-07-24 13:20:00,2.0,Black,0.27,22.75,13,9
1,ANSL_22102,2015-08-28 00:00:00,2015-12-05 14:13:00,1.0,Black,0.57,21.17,0,1
2,ANSL_12994,2015-06-21 00:00:00,2015-10-30 18:08:00,1.0,Orange Tabby,0.00,34.34,0,1
3,ANSL_35930,2010-04-14 00:00:00,2018-05-10 14:55:00,0.0,Tan,0.46,44.66,13,9
4,ANSL_41868,2016-08-16 00:00:00,2016-09-16 19:06:00,,Black,0.89,14.72,13,9
...,...,...,...,...,...,...,...,...,...
5992,ANSL_9471,2015-10-06 00:00:00,2015-11-04 16:50:00,2.0,Orange Tabby,0.60,32.68,13,9
5993,ANSL_49040,2017-04-13 00:00:00,2017-07-08 19:00:00,1.0,Chocolate,0.40,48.06,0,1
5994,ANSL_16530,2015-11-21 00:00:00,2016-03-12 12:37:00,1.0,Black,0.53,10.97,7,1
5995,ANSL_42416,2016-09-07 00:00:00,2017-12-04 12:22:00,1.0,Red,0.13,19.53,0,1


In [2138]:
def data_transformation():

    # Dropping the issue date and listing date from the data
    train_breed_category.drop(columns = ['issue_date','listing_date'], inplace=True)
    train_pet_category.drop(columns = ['issue_date','listing_date'], inplace=True)

    test_breed_category.drop(columns = ['issue_date','listing_date'], inplace=True)
    test_pet_category.drop(columns = ['issue_date','listing_date'], inplace=True)


    #Imputing the null values in condition columns
    breed_condition_value = train_breed_category['condition'].mode()[0]
    pet_condition_value = train_pet_category['condition'].mode()[0]

    test_breed_condition_value = test_breed_category['condition'].mode()[0]
    test_pet_condition_value = test_pet_category['condition'].mode()[0]
    
    train_breed_category['condition'].fillna(breed_condition_value, inplace=True)
    train_pet_category['condition'].fillna(pet_condition_value, inplace=True)

    test_breed_category['condition'].fillna(test_breed_condition_value, inplace=True)
    test_pet_category['condition'].fillna(test_pet_condition_value, inplace=True)


    traget_encoding_cols = ['pet_sale_id', 'color_type']

    for items in traget_encoding_cols:

        # Initialize the TargetEncoder for pet_sale_id
        encoder_breed = ce.TargetEncoder(cols=[items])
        encoder_pet = ce.TargetEncoder(cols=[items])

        # Fit the encoder on the data (pet_sale_id) and target (breed_category)
        encoder_breed.fit(train_breed_category[items], train_breed_category['breed_category'])
        encoder_pet.fit(train_pet_category[items], train_pet_category['pet_category'])
        

        # Transform the 'pet_sale_id' column to target-encoded values
        train_breed_category[items] = encoder_breed.transform(train_breed_category[items])
        test_breed_category[items] = encoder_breed.transform(test_breed_category[items])

        train_pet_category[items] = encoder_pet.transform(train_pet_category[items])
        test_pet_category[items] = encoder_pet.transform(test_pet_category[items])



    # Perform one-hot encoding
    train_breed = pd.get_dummies(train_breed_category, columns=['condition'], prefix=['condition'])
    test_breed = pd.get_dummies(test_breed_category, columns=['condition'], prefix=['condition'])
    train_pet = pd.get_dummies(train_pet_category, columns=['condition'], prefix=['condition'])
    test_pet = pd.get_dummies(test_pet_category, columns=['condition'], prefix=['condition'])
    


    return train_breed,test_breed,train_pet,test_pet


train_breed,test_breed,train_pet,test_pet = data_transformation()



In [2149]:
train_breed.columns

Index(['pet_sale_id', 'color_type', 'length(m)', 'height(cm)', 'X1', 'X2',
       'breed_category', 'condition_0.0', 'condition_1.0', 'condition_2.0'],
      dtype='object')

In [2179]:
def get_variable_name(variable):
    for name in globals():
        if id(globals()[name]) == id(variable):
            return name
    for name in locals():
        if id(locals()[name]) == id(variable):
            return name
    return None

## Model building (Breed Category/Pet Category)

In [2162]:
X_breed = train_breed[['pet_sale_id', 'color_type', 'length(m)', 'height(cm)', 'X1', 'X2','condition_0.0', 'condition_1.0', 'condition_2.0']]
y_breed = train_breed['breed_category']

X_pet = train_pet[['pet_sale_id', 'color_type', 'length(m)', 'height(cm)', 'X1', 'X2','condition_0.0', 'condition_1.0', 'condition_2.0']]
y_pet = train_pet['pet_category']


In [2182]:
X = [X_breed, X_pet]
y = [y_breed, y_pet]

def model_building(X,y):


    for input, target in zip(X,y):

        print(get_variable_name(input))
        
        X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=0.33, random_state=42)


        print('Shape of the Training and Test Data')
        print(X_train.shape, X_test.shape)
        print(y_train.shape,y_test.shape)
        print("---------------------------")
        

        scaler = StandardScaler()

        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)


        print('Shape of the SCALED Training and Test Data')
        print(X_train_scaled.shape, X_test_scaled.shape)
        print(y_train.shape,y_test.shape)
        print("---------------------------\n")


        # Model 
        # Initialize and train the Random Forest classifier
        clf = RandomForestClassifier(n_estimators=100, random_state=42)
        clf.fit(X_train_scaled, y_train)

        # Make predictions on the test data
        y_pred = clf.predict(X_test_scaled)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        print("Accuracy:", accuracy)

        # Print classification report

        report = classification_report(y_test, y_pred)
        print("Classification Report:\n", report)


model_building(X,y)

X_breed
Shape of the Training and Test Data
(9373, 9) (4618, 9)
(9373,) (4618,)
---------------------------
Shape of the SCALED Training and Test Data
(9373, 9) (4618, 9)
(9373,) (4618,)
---------------------------

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2212
           1       1.00      1.00      1.00      2086
           2       1.00      1.00      1.00       320

    accuracy                           1.00      4618
   macro avg       1.00      1.00      1.00      4618
weighted avg       1.00      1.00      1.00      4618

X_pet
Shape of the Training and Test Data
(9373, 9) (4618, 9)
(9373,) (4618,)
---------------------------
Shape of the SCALED Training and Test Data
(9373, 9) (4618, 9)
(9373,) (4618,)
---------------------------

Accuracy: 0.9997834560415765
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00