In [182]:
# Importing Required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 

import gc
import cv2

In [183]:
# Converting Training Data into Pandas DataFrame
Train = pd.read_csv("/content/train.csv")


In [184]:
#Accessing the 1st 5 rows of the dataset
Train.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1


In [185]:
# Finding out the total days between listing_date and issue_date and storing them in a new column "total_days"
Train["issue_date"] = pd.to_datetime(Train["issue_date"],format="%Y-%m-%d")
Train["listing_date"] = pd.to_datetime(Train["listing_date"],format="%Y-%m-%d")
Train["days"] = Train["listing_date"] - Train["issue_date"] 
Train["total_days"] = Train["days"].dt.days 

In [186]:
Train.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,days,total_days
0,ANSL_69903,2016-07-10,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1,73 days 16:25:00,73
1,ANSL_66892,2013-11-21,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2,1862 days 17:47:00,1862
2,ANSL_69750,2014-09-28,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2.0,4,752 days 08:24:00,752
3,ANSL_71623,2016-12-31,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2,755 days 18:30:00,755
4,ANSL_57969,2017-09-28,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1,52 days 09:38:00,52


In [187]:
# Null Values exist in the "condition" column which is filled by -1
Train['condition'] = Train['condition'].fillna(-1)


In [188]:
# Dropping columns "pet_id","issue_date","listing_date","days"
Train = Train.drop(["pet_id","issue_date","listing_date","days",],axis=1)

In [189]:
Train.head()

Unnamed: 0,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,total_days
0,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1,73
1,1.0,White,0.72,14.19,13,9,0.0,2,1862
2,-1.0,Brown,0.15,40.9,15,4,2.0,4,752
3,1.0,White,0.62,17.82,0,1,0.0,2,755
4,2.0,Black,0.5,11.06,18,4,0.0,1,52


In [194]:
# Assigning Train dataframe to buddy
buddy = Train

In [190]:
# Selecting the target variables
y_1 = Train["breed_category"]
y_2 = Train["pet_category"]
y = pd.concat([y_1,y_2],axis =1)

In [191]:
# Accessing the first 5 rows of target variable
y.head()

Unnamed: 0,breed_category,pet_category
0,0.0,1
1,0.0,2
2,2.0,4
3,0.0,2
4,0.0,1


In [195]:
# Checking the data distribution between different classes
buddy.breed_category.astype('category').value_counts()

0.0    9000
1.0    8357
2.0    1477
Name: breed_category, dtype: int64

In [None]:
# Checking the data distribution between different classes
buddy.pet_category.astype('category').value_counts()

2    10621
1     7184
4      941
0       88
Name: pet_category, dtype: int64

In [196]:
# Checking the dimensions
buddy.shape

(18834, 9)

In [197]:
buddy.head()

Unnamed: 0,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,total_days
0,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1,73
1,1.0,White,0.72,14.19,13,9,0.0,2,1862
2,-1.0,Brown,0.15,40.9,15,4,2.0,4,752
3,1.0,White,0.62,17.82,0,1,0.0,2,755
4,2.0,Black,0.5,11.06,18,4,0.0,1,52


In [198]:
# Dropping the target variables from buddy
buddy=buddy.drop(["breed_category","pet_category"],axis =1)


In [199]:
# Label Encoding the categorical features in column "color_type"
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
buddy.loc[:,["color_type"]]= buddy.loc[:,["color_type"]].apply(enc.fit_transform)

In [None]:
buddy.shape

(18834, 59)

In [201]:
# Rescaling the features 
from sklearn.preprocessing import scale
buddy= scale(buddy)
buddy = pd.DataFrame(buddy)

In [202]:
buddy.head()

Unnamed: 0,0,1,2,3,4,5,6
0,1.410592,-0.337623,1.030021,-1.510729,1.161013,1.257279,-0.713363
1,0.29489,1.463576,0.752914,-1.018388,1.161013,1.257279,0.917975
2,-1.936513,-0.492012,-1.221472,1.033161,1.465326,-0.164116,-0.094203
3,0.29489,1.463576,0.406531,-0.739574,-0.817018,-1.016954,-0.091467
4,1.410592,-1.161029,-0.009129,-1.258797,1.921795,-0.164116,-0.732512


In [203]:
buddy.shape

(18834, 7)

In [204]:
# Dimensions 
y.shape

(18834, 2)

In [205]:
# train test split with train_size=80% and test size=20%
x_train, x_test, y_train, y_test = train_test_split(buddy, y, train_size=0.8, random_state=101)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(15067, 7)
(3767, 7)
(15067, 2)
(3767, 2)


In [206]:
x_train.head()

Unnamed: 0,0,1,2,3,4,5,6
18650,-0.820811,1.463576,0.025509,-1.519177,-0.817018,-1.016954,-0.406063
10036,-0.820811,0.794559,0.094786,-1.219625,-0.817018,-1.016954,-0.025812
16223,0.29489,-0.080309,1.237851,1.459446,-0.817018,-1.016954,3.235952
17147,0.29489,-1.161029,-1.221472,0.288889,-0.817018,-1.016954,-0.701509
13465,0.29489,1.206262,-1.152195,-1.278768,-0.817018,-1.016954,-0.673241


In [207]:
y_train.head()

Unnamed: 0,breed_category,pet_category
18650,1.0,2
10036,1.0,2
16223,0.0,2
17147,0.0,1
13465,0.0,2


### Model Building

In [208]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(x_test)
clf.fit(x_train,y_train)
# Prediction
y_pred=clf.predict(x_test)

In [209]:
# Converting predictions into dataframe
predictions = pd.DataFrame(y_pred)
predictions.head()

Unnamed: 0,0,1
0,1.0,1.0
1,1.0,2.0
2,1.0,2.0
3,0.0,1.0
4,0.0,2.0


In [210]:
pred_breed = predictions[0]
pred_pet = predictions[1]
pred_pet.head()

0    1.0
1    2.0
2    2.0
3    1.0
4    2.0
Name: 1, dtype: float64

In [211]:
y_test.head()

Unnamed: 0,breed_category,pet_category
18327,1.0,1
16483,1.0,2
5868,1.0,2
7261,1.0,1
17244,0.0,2


In [212]:
y_test_breed = y_test['breed_category']
y_test_pet = y_test['pet_category']
y_test_pet.head()

18327    1
16483    2
5868     2
7261     1
17244    2
Name: pet_category, dtype: int64

### Finding accuracy, recall, precision on 20% of data 

In [213]:
# For breed_category prediction
# accuracy
from sklearn import metrics
from sklearn.metrics import accuracy_score
print(metrics.accuracy_score(y_true=y_test_breed, y_pred=pred_breed))
# class-wise accuracy
class_wise = metrics.classification_report(y_true=y_test_breed, y_pred=pred_breed)
print(class_wise)

0.9065569418635518
              precision    recall  f1-score   support

         0.0       0.89      0.92      0.90      1787
         1.0       0.91      0.88      0.89      1673
         2.0       1.00      1.00      1.00       307

    accuracy                           0.91      3767
   macro avg       0.93      0.93      0.93      3767
weighted avg       0.91      0.91      0.91      3767



In [214]:
#For pet_category prediction
# accuracy 
print(metrics.accuracy_score(y_true=y_test_pet, y_pred=pred_pet))
# class-wise accuracy
class_wise = metrics.classification_report(y_true=y_test_pet, y_pred=pred_pet)
print(class_wise)

0.8680647730289355
              precision    recall  f1-score   support

           0       0.50      0.14      0.22        14
           1       0.87      0.81      0.84      1426
           2       0.87      0.92      0.89      2124
           4       0.89      0.81      0.85       203

    accuracy                           0.87      3767
   macro avg       0.78      0.67      0.70      3767
weighted avg       0.87      0.87      0.87      3767



In [215]:
# Importing test data and converting it into a pandas dataframe "buddy_pc"
buddy_pc = pd.read_csv("/content/test.csv")

In [216]:
# Finding out total days between listing_date and issue_date and storing it to a column "total_days"
# Droping the "pet_id","issue_date","listing_date","days" columns
buddy_pc["issue_date"] = pd.to_datetime(buddy_pc["issue_date"],format="%Y-%m-%d")
buddy_pc["listing_date"] = pd.to_datetime(buddy_pc["listing_date"],format="%Y-%m-%d")
buddy_pc["days"] = buddy_pc["listing_date"] - buddy_pc["issue_date"] 
buddy_pc["total_days"] = buddy_pc["days"].dt.days 
buddy_pc = buddy_pc.drop(["pet_id","issue_date","listing_date","days",],axis=1)


In [217]:
buddy_pc.head()

Unnamed: 0,condition,color_type,length(m),height(cm),X1,X2,total_days
0,0.0,Black,0.87,42.73,0,7,4404
1,1.0,Orange Tabby,0.06,6.71,0,1,174
2,1.0,Black,0.24,41.21,0,7,1999
3,1.0,Black,0.29,8.46,7,1,1148
4,1.0,Brown,0.71,30.92,0,7,463


In [218]:
# Replacing the null values present in "condition" column as -1
buddy_pc['condition'] = buddy_pc['condition'].fillna(-1)


In [219]:
buddy_pc.head()

Unnamed: 0,condition,color_type,length(m),height(cm),X1,X2,total_days
0,0.0,Black,0.87,42.73,0,7,4404
1,1.0,Orange Tabby,0.06,6.71,0,1,174
2,1.0,Black,0.24,41.21,0,7,1999
3,1.0,Black,0.29,8.46,7,1,1148
4,1.0,Brown,0.71,30.92,0,7,463


In [223]:
# Label Encoding the "color_type" column  due to presence of categorical values
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
buddy_pc.loc[:,["color_type"]]= buddy_pc.loc[:,["color_type"]].apply(enc.fit_transform)

In [224]:
buddy_pc.head()

Unnamed: 0,condition,color_type,length(m),height(cm),X1,X2,total_days
0,0.0,2,0.87,42.73,0,7,4404
1,1.0,36,0.06,6.71,0,1,174
2,1.0,2,0.24,41.21,0,7,1999
3,1.0,2,0.29,8.46,7,1,1148
4,1.0,14,0.71,30.92,0,7,463


In [225]:
buddy_pc.shape

(8072, 7)

In [226]:
# Rescaling the features
from sklearn.preprocessing import scale
buddy_pc = scale(buddy_pc)
buddy_pc = pd.DataFrame(buddy_pc)
buddy_pc.head()

Unnamed: 0,0,1,2,3,4,5,6
0,-0.829145,-1.141113,1.252553,1.182838,-0.807684,0.70804,3.214818
1,0.28838,0.686693,-1.544439,-1.605713,-0.807684,-0.994885,-0.618018
2,0.28838,-1.141113,-0.922885,1.065164,-0.807684,0.70804,1.035629
3,0.28838,-1.141113,-0.750231,-1.470234,0.268339,-0.994885,0.264531
4,0.28838,-0.496005,0.700061,0.268546,-0.807684,0.70804,-0.356153


### Prediction on the given test data by using the model built

In [227]:
# Predictions
prediction  = clf.predict(buddy_pc)
prediction =pd.DataFrame(prediction)
prediction

Unnamed: 0,0,1
0,1.0,2.0
1,0.0,1.0
2,0.0,2.0
3,0.0,2.0
4,0.0,2.0
...,...,...
8067,0.0,2.0
8068,1.0,2.0
8069,1.0,2.0
8070,2.0,4.0


In [228]:
test = pd.read_csv("/content/test.csv")

In [229]:
# Accessing the 'pet_id' column from test data
pet_id = test['pet_id']

In [230]:
# Concatanating the pet_id and prediction file
Prediction_file = pd.concat([pet_id,prediction],axis =1)
Prediction_file.head()

Unnamed: 0,pet_id,0,1
0,ANSL_75005,1.0,2.0
1,ANSL_76663,0.0,1.0
2,ANSL_58259,0.0,2.0
3,ANSL_67171,0.0,2.0
4,ANSL_72871,0.0,2.0


In [231]:
# Exporting from google colab to local drive as 'predictionfile14.csv'
from google.colab import files
Prediction_file.to_csv('predictionfile14.csv') 
files.download('predictionfile14.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>