In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

# sklearn :: models
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.naive_bayes import GaussianNB

# sklearn :: evaluation metrics
from sklearn.metrics import cohen_kappa_score

sns.set_style('whitegrid')

# Problem definition

Predict when a pet will be adopted

# Load the data

In [2]:
df_train = pd.read_csv('Data/train.csv')
df_test = pd.read_csv('Data/test.csv')
print(df_train.shape, df_test.shape)

(10000, 24) (4993, 23)


# Feature Engineering

In [3]:
print(df_train.columns)
df_train.head()

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed'],
      dtype='object')


Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,1,â¥â¥â¥ Lily â¥â¥â¥,36,307,0,2,2,7,0,2,...,1,1,0,41326,337914b09c2fa5460e195197e994ef98,0,Adorable 3 year old Lily looking for a forever...,3f8824a3b,1.0,4
1,2,Cookie,3,266,0,1,6,7,0,2,...,1,1,0,41327,4bb1ebb92158078ad54a6bb23c10dffc,0,i rescue this stary kitten from market near my...,9238eb7fc,1.0,2
2,2,Favour Speedy Abundance And Courage,7,250,252,1,1,2,0,2,...,1,4,0,41327,99ba8ce53b4d8515e417e7921563d923,0,The mother was a Burmese cross and had since p...,f0a1f2b90,2.0,4
3,1,,3,307,0,1,2,0,0,3,...,1,1,0,41327,3f3ef74c486beba3bc87f6dbaee772bf,0,This puppy is: 1. Male 2. 3 months old 3. Brow...,7d028bdea,4.0,2
4,2,Abandoned Kitty,1,266,0,1,1,6,7,1,...,1,1,0,41401,844f03ab8054007d4be6686f3a9702b9,0,Mother cat gave birth to a litter of 3 and too...,8377bfe97,0.0,2


In [4]:
df_train.dtypes

Type               int64
Name              object
Age                int64
Breed1             int64
Breed2             int64
Gender             int64
Color1             int64
Color2             int64
Color3             int64
MaturitySize       int64
FurLength          int64
Vaccinated         int64
Dewormed           int64
Sterilized         int64
Health             int64
Quantity           int64
Fee                int64
State              int64
RescuerID         object
VideoAmt           int64
Description       object
PetID             object
PhotoAmt         float64
AdoptionSpeed      int64
dtype: object

In [5]:
# Check for missing values
df_train.isnull().sum(axis = 0)

Type               0
Name             842
Age                0
Breed1             0
Breed2             0
Gender             0
Color1             0
Color2             0
Color3             0
MaturitySize       0
FurLength          0
Vaccinated         0
Dewormed           0
Sterilized         0
Health             0
Quantity           0
Fee                0
State              0
RescuerID          0
VideoAmt           0
Description        8
PetID              0
PhotoAmt           0
AdoptionSpeed      0
dtype: int64

In [6]:
# # Correlation
# df_temp = df_train.filter(['log_price','accommodates', 'bathrooms', 'bedrooms', 'beds', 'Real Bed', 'Shared room', 
#                            'Entire home/apt', 'Private room', 'cleaning_fee', 'review_scores_rating', 
#                            'host_since_year', 'Boston', 'Chicago', 'DC', 'LA', 'NYC', 'SF', 'c_distance', 'Condominium', 'pets', 'event',
#                           'kitchen', 'heating', 'gym', 'elevator', 'pool', '#amenities'], axis=1)
df_train.corr()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,VideoAmt,PhotoAmt,AdoptionSpeed
Type,1.0,-0.150024,0.055922,-0.048857,0.06258,0.093889,0.25073,0.199668,-0.172627,-0.009966,0.11075,0.034795,0.00671,-0.003571,0.046327,-0.045208,0.127153,-0.002877,0.050386,-0.096588
Age,-0.150024,1.0,-0.317141,-0.042328,-0.128883,0.089449,-0.043082,-0.051003,0.094532,0.156118,-0.13899,-0.056361,-0.194933,0.089627,-0.112622,0.099678,0.024376,-0.021741,-0.081627,0.105835
Breed1,0.055922,-0.317141,1.0,-0.159807,0.070733,-0.030621,-0.01334,0.007075,-0.00828,-0.119483,0.042104,0.010842,0.062735,-0.036438,0.089211,-0.200489,-0.025743,0.00946,0.034484,0.111364
Breed2,-0.048857,-0.042328,-0.159807,1.0,0.071215,-0.020326,0.004285,0.038584,0.05462,0.103603,0.010243,-0.00184,-0.006333,-0.021425,0.042083,0.008631,-0.044799,0.004433,0.054457,-0.021178
Gender,0.06258,-0.128883,0.070733,0.071215,1.0,-0.114045,0.028391,0.256542,-0.093383,-0.032118,0.071064,0.091829,0.030318,-0.051646,0.496795,-0.049542,0.003222,0.020802,0.091525,0.057663
Color1,0.093889,0.089449,-0.030621,-0.020326,-0.114045,1.0,-0.114525,-0.281832,-0.025359,0.075806,-0.017658,-0.020026,-0.035473,0.032545,-0.110077,0.055972,0.030674,0.000661,-0.041995,-0.045613
Color2,0.25073,-0.043082,-0.01334,0.004285,0.028391,-0.114525,1.0,0.084761,-0.079657,-0.011525,0.02713,0.008896,0.004854,-0.002449,0.022109,-0.021506,0.034442,0.028056,0.059475,-0.041183
Color3,0.199668,-0.051003,0.007075,0.038584,0.256542,-0.281832,0.084761,1.0,-0.051689,0.011624,0.054279,0.049366,0.029254,-0.031123,0.269975,-0.017541,0.003037,0.027171,0.103766,-0.003366
MaturitySize,-0.172627,0.094532,-0.00828,0.05462,-0.093383,-0.025359,-0.079657,-0.051689,1.0,0.099199,-0.074013,-0.05892,-0.057781,-0.015047,-0.043203,0.045344,-0.060916,0.023799,0.011683,0.04777
FurLength,-0.009966,0.156118,-0.119483,0.103603,-0.032118,0.075806,-0.011525,0.011624,0.099199,1.0,-0.012485,0.013764,0.028363,0.035957,-0.039007,0.159591,-0.025696,-0.016959,-0.025926,-0.083989


## Feature: Type

In [7]:
print(df_train['Type'].value_counts())

# apply dummies on the training set
col = 'Type'
df_dummies = pd.get_dummies(df_train[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_train = pd.concat([df_train, df_dummies], axis=1)

# apply the same dummies on the test set
col = 'Type'
df_dummies = pd.get_dummies(df_test[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_test = pd.concat([df_test, df_dummies], axis=1)

1    5442
2    4558
Name: Type, dtype: int64


## Feature: Name

In [8]:
top_name= df_train['Name'].value_counts().head(10)
top_name

Lucky      45
Baby       44
Brownie    43
No Name    39
Mimi       35
Puppy      32
Kitty      28
Max        27
Kittens    25
Snowy      25
Name: Name, dtype: int64

In [9]:
df_train['Name'].str.len()

0       24.0
1        6.0
2       35.0
3        NaN
4       15.0
5        4.0
6        4.0
7       21.0
8        NaN
9        7.0
10      24.0
11       5.0
12       6.0
13       4.0
14       5.0
15      25.0
16       NaN
17       9.0
18       NaN
19       5.0
20       4.0
21       NaN
22       6.0
23       5.0
24       5.0
25      20.0
26       6.0
27       4.0
28       5.0
29       5.0
        ... 
9970    10.0
9971     7.0
9972     4.0
9973    25.0
9974    35.0
9975    26.0
9976     4.0
9977     5.0
9978     7.0
9979    18.0
9980     5.0
9981    17.0
9982     3.0
9983     7.0
9984     4.0
9985     8.0
9986    35.0
9987     6.0
9988     6.0
9989     6.0
9990     4.0
9991     6.0
9992     8.0
9993    27.0
9994     4.0
9995     4.0
9996     NaN
9997     8.0
9998     6.0
9999     NaN
Name: Name, Length: 10000, dtype: float64

In [10]:
# Create bew feature with Name or No-Name
df_train['Name Status'] = np.where((df_train['Name'].isnull()) | (df_train['Name'].str.len()<=2), 0, 1)

# apply the same dummies on the test set
df_test['Name Status'] = np.where((df_test['Name'].isnull())  | (df_test['Name'].str.len()<=2), 0, 1)

In [11]:
# top_names = ['Lucky', 'Baby', 'Brownie', 'No Name', 'Mimi', 'Puppy', 'Kitty', 'Max', 'Kittens', 'Snowy']

# df_train['Lucky'] = np.where(df_train['Name'].str.contains("Lucky")==True, '1', '0')
# df_train['Lucky'] = df_train['Lucky'].astype(int)

# df_test['Lucky'] = np.where(df_test['Name'].str.contains("Lucky")==True, '1', '0')
# df_test['Lucky'] = df_test['Lucky'].astype(int)

# df_train['Baby'] = np.where(df_train['Name'].str.contains("Baby")==True, '1', '0')
# df_train['Baby'] = df_train['Baby'].astype(int)

# df_test['Baby'] = np.where(df_test['Name'].str.contains("Baby")==True, '1', '0')
# df_test['Baby'] = df_test['Baby'].astype(int)

# df_train['Brownie'] = np.where(df_train['Name'].str.contains("Brownie")==True, '1', '0')
# df_train['Brownie'] = df_train['Brownie'].astype(int)

# df_test['Brownie'] = np.where(df_test['Name'].str.contains("Brownie")==True, '1', '0')
# df_test['Brownie'] = df_test['Brownie'].astype(int)

# df_train['Mimi'] = np.where(df_train['Name'].str.contains("Mimi")==True, '1', '0')
# df_train['Mimi'] = df_train['Mimi'].astype(int)

# df_test['Mimi'] = np.where(df_test['Name'].str.contains("Mimi")==True, '1', '0')
# df_test['Mimi'] = df_test['Mimi'].astype(int)



## Feature: Age

In [12]:
df_train['Age'] = df_train['Age'].astype(int)

print(df_train['Age'].value_counts())

# print('mean', np.mean(df_train['Age']))
# print('std', np.std(df_train['Age']))
# plt.hist(df_train['Age'], bins=30)
# plt.show()

# convert Age into groups
def convert_age_group(val):
    if val ==1:
        return 'M1'
    elif val ==2:
        return 'M2'
    elif val ==3:
        return 'M3'
    elif val ==4:
        return 'M4'
    elif val ==5:
        return 'M5'
    elif val ==6:
        return 'M6'
    elif val >=7 and val <= 12:
        return 'M7-M12'
    elif val >=13 and val <= 24:
        return 'M13-M24'
    elif val >= 25 and val <= 72:
        return 'M25-M72'
    elif val >= 73 and val <= 120:
        return 'M73-M120'
    elif val >= 121 and val <= 168:
        return 'M212-M168'
    else:
        return 'M168+'
        
df_train['Age'] = df_train['Age'].apply(convert_age_group)
print ('Unique Values in the Column:', np.unique(df_train['Age']))
# print(df_train['review_scores_rating'].value_counts())

# Create new features from review_scores_rating
convert_age_group = pd.get_dummies(df_train['Age']).astype(int)
df_train = pd.concat([df_train, convert_age_group], axis=1)

# # Delete column from df_train
# del df_train['Age']

# apply the same dummies on the test set
df_test['Age'] = df_test['Age'].astype(int)

# convert Age into groups
def convert_age_group(val):
    if val ==1:
        return 'M1'
    elif val ==2:
        return 'M2'
    elif val ==3:
        return 'M3'
    elif val ==4:
        return 'M4'
    elif val ==5:
        return 'M5'
    elif val ==6:
        return 'M6'
    elif val >=7 and val <= 12:
        return 'M7-M12'
    elif val >=13 and val <= 24:
        return 'M13-M24'
    elif val >= 25 and val <= 72:
        return 'M25-M72'
    elif val >= 73 and val <= 120:
        return 'M73-M120'
    elif val >= 121 and val <= 168:
        return 'M212-M168'
    else:
        return 'M168+'
            
df_test['Age'] = df_test['Age'].apply(convert_age_group)

# Create new features from review_scores_rating
convert_age_group = pd.get_dummies(df_test['Age']).astype(int)
df_test = pd.concat([df_test, convert_age_group], axis=1)

# # Delete column from df_train
# del df_test['Age']



2      2329
1      1539
3      1310
4       726
12      643
24      444
6       390
5       373
36      290
8       203
7       177
48      158
60      141
0       121
10      119
9       119
18      115
72       77
11       65
84       64
14       57
15       47
17       40
30       38
16       30
13       29
96       27
120      20
20       20
29       20
       ... 
52        2
92        2
56        2
180       2
212       2
77        2
64        2
168       1
144       1
112       1
88        1
255       1
81        1
74        1
47        1
238       1
102       1
117       1
69        1
45        1
156       1
68        1
44        1
135       1
123       1
75        1
43        1
122       1
82        1
147       1
Name: Age, Length: 99, dtype: int64
Unique Values in the Column: ['M1' 'M13-M24' 'M168+' 'M2' 'M212-M168' 'M25-M72' 'M3' 'M4' 'M5' 'M6'
 'M7-M12' 'M73-M120']


## Feature: Breed1

In [13]:
print(df_train['Breed1'].value_counts().head(20))

# # apply dummies on the training set
# col = 'Breed1'
# df_dummies = pd.get_dummies(df_train[col])
# df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
# df_train = pd.concat([df_train, df_dummies], axis=1)

# # apply the same dummies on the test set
# col = 'Breed1'
# df_dummies = pd.get_dummies(df_test[col])
# df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
# df_test = pd.concat([df_test, df_dummies], axis=1)

307    3979
266    2423
265     840
299     230
264     188
292     162
285     152
141     137
205     124
179     117
109      97
218      96
243      70
103      70
254      64
189      61
213      60
20       56
283      51
78       46
Name: Breed1, dtype: int64


In [14]:
# convert Breed1 into groups
def convert_Breed1(val):
    if val == 307:
        return 'T1'
    elif val == 266:
        return 'T2'
    elif val == 265:
        return 'T3'
    elif val == 299:
        return 'T4'
    elif val == 264:
        return 'T5'
    elif val == 292:
        return 'T6'
    elif val == 285:
        return 'T7'
    elif val == 141:
        return 'T8'
    elif val == 205:
        return 'T9'
    elif val == 179:
        return 'T10'
    elif val == 109:
        return 'T11'
    elif val == 218:
        return 'T12'
    elif val == 243:
        return 'T13'
    elif val == 103:
        return 'T14'
    elif val == 254:
        return 'T15'
    elif val == 189:
        return 'T16'
    elif val == 213:
        return 'T17'
    elif val == 20:
        return 'T18'
    elif val == 283:
        return 'T19'
    elif val == 78:
        return 'T20'
    else:
        return 'TX'
        
df_train['Breed1'] = df_train['Breed1'].apply(convert_Breed1)
print ('Unique Values in the Column:', np.unique(df_train['Breed1']))
# print(df_train['review_scores_rating'].value_counts())

# Create new features from review_scores_rating
convert_Breed1 = pd.get_dummies(df_train['Breed1']).astype(int)
df_train = pd.concat([df_train, convert_Breed1], axis=1)

# apply the same dummies on the test set

# convert Breed1 into groups
def convert_Breed1(val):
    if val == 307:
        return 'T1'
    elif val == 266:
        return 'T2'
    elif val == 265:
        return 'T3'
    elif val == 299:
        return 'T4'
    elif val == 264:
        return 'T5'
    elif val == 292:
        return 'T6'
    elif val == 285:
        return 'T7'
    elif val == 141:
        return 'T8'
    elif val == 205:
        return 'T9'
    elif val == 179:
        return 'T10'
    elif val == 109:
        return 'T11'
    elif val == 218:
        return 'T12'
    elif val == 243:
        return 'T13'
    elif val == 103:
        return 'T14'
    elif val == 254:
        return 'T15'
    elif val == 189:
        return 'T16'
    elif val == 213:
        return 'T17'
    elif val == 20:
        return 'T18'
    elif val == 283:
        return 'T19'
    elif val == 78:
        return 'T20'
    else:
        return 'TX'
            
df_test['Breed1'] = df_test['Breed1'].apply(convert_Breed1)

# Create new features from review_scores_rating
convert_Breed1 = pd.get_dummies(df_test['Breed1']).astype(int)
df_test = pd.concat([df_test, convert_Breed1], axis=1)


Unique Values in the Column: ['T1' 'T10' 'T11' 'T12' 'T13' 'T14' 'T15' 'T16' 'T17' 'T18' 'T19' 'T2'
 'T20' 'T3' 'T4' 'T5' 'T6' 'T7' 'T8' 'T9' 'TX']


## Feature: Breed2

In [15]:
print(df_train['Breed2'].value_counts().head(10))

0      7212
307    1144
266     379
265     212
264      87
299      85
292      79
218      65
141      57
285      54
Name: Breed2, dtype: int64


In [16]:
# # convert Breed1 into groups
# def convert_Breed2(val):
#     if val == 0:
#         return 'B2-1'
#     elif val == 307:
#         return 'B2-2'
#     elif val == 266:
#         return 'B2-3'
#     elif val == 265:
#         return 'B2-4'
#     elif val == 264:
#         return 'B2-5'
#     else:
#         return 'B2-X'
        
# df_train['Breed2'] = df_train['Breed2'].apply(convert_Breed2)
# print ('Unique Values in the Column:', np.unique(df_train['Breed2']))
# # print(df_train['review_scores_rating'].value_counts())

# # Create new features from review_scores_rating
# convert_Breed2 = pd.get_dummies(df_train['Breed2']).astype(int)
# df_train = pd.concat([df_train, convert_Breed2], axis=1)

# # apply the same dummies on the test set

# # convert Breed1 into groups
# def convert_Breed2(val):
#     if val == 0:
#         return 'B2-1'
#     elif val == 307:
#         return 'B2-2'
#     elif val == 266:
#         return 'B2-3'
#     elif val == 265:
#         return 'B2-4'
#     elif val == 264:
#         return 'B2-5'
#     else:
#         return 'B2-X'
            
# df_test['Breed2'] = df_test['Breed2'].apply(convert_Breed2)

# # Create new features from review_scores_rating
# convert_Breed2 = pd.get_dummies(df_test['Breed2']).astype(int)
# df_test = pd.concat([df_test, convert_Breed2], axis=1)


## Feature: Gender

In [17]:
print(df_train['Gender'].value_counts())

# apply dummies on the training set
col = 'Gender'
df_dummies = pd.get_dummies(df_train[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_train = pd.concat([df_train, df_dummies], axis=1)

# apply the same dummies on the test set
col = 'Gender'
df_dummies = pd.get_dummies(df_test[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_test = pd.concat([df_test, df_dummies], axis=1)

2    4866
1    3685
3    1449
Name: Gender, dtype: int64


## Feature: Color1

In [18]:
# df_train['Color1'] = df_train['Color1'].astype(str)
# df_train['Color2'] = df_train['Color2'].astype(str)
# df_train['Color3'] = df_train['Color3'].astype(str)

# df_train['Color'] = df_train['Color1'] + df_train['Color2'] + df_train['Color3']

# print(df_train['Color'].value_counts())

# df_test['Color1'] = df_test['Color1'].astype(str)
# df_test['Color2'] = df_test['Color2'].astype(str)
# df_test['Color3'] = df_test['Color3'].astype(str)

# df_test['Color'] = df_test['Color1'] + df_test['Color2'] + df_test['Color3']


# # apply dummies on the training set
# col = 'Color'
# df_dummies = pd.get_dummies(df_train[col])
# df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
# df_train = pd.concat([df_train, df_dummies], axis=1)

# # apply the same dummies on the test set
# col = 'Color'
# df_dummies = pd.get_dummies(df_test[col])
# df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
# df_test = pd.concat([df_test, df_dummies], axis=1)

In [19]:
# print(df_train['Color1'].value_counts())

# # apply dummies on the training set
# col = 'Color1'
# df_dummies = pd.get_dummies(df_train[col])
# df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
# df_train = pd.concat([df_train, df_dummies], axis=1)

# # apply the same dummies on the test set
# col = 'Color1'
# df_dummies = pd.get_dummies(df_test[col])
# df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
# df_test = pd.concat([df_test, df_dummies], axis=1)

## Feature: Color 2

In [20]:
# print(df_train['Color2'].value_counts())

# # apply dummies on the training set
# col = 'Color2'
# df_dummies = pd.get_dummies(df_train[col])
# df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
# df_train = pd.concat([df_train, df_dummies], axis=1)

# # apply the same dummies on the test set
# col = 'Color2'
# df_dummies = pd.get_dummies(df_test[col])
# df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
# df_test = pd.concat([df_test, df_dummies], axis=1)

## Feature: Color3

In [21]:
# print(df_train['Color3'].value_counts())

# # apply dummies on the training set
# col = 'Color3'
# df_dummies = pd.get_dummies(df_train[col])
# df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
# df_train = pd.concat([df_train, df_dummies], axis=1)

# # apply the same dummies on the test set
# col = 'Color3'
# df_dummies = pd.get_dummies(df_test[col])
# df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
# df_test = pd.concat([df_test, df_dummies], axis=1)

## Feature: MaturitySize

In [22]:
print(df_train['MaturitySize'].value_counts())

# apply dummies on the training set
col = 'MaturitySize'
df_dummies = pd.get_dummies(df_train[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_train = pd.concat([df_train, df_dummies], axis=1)

# apply the same dummies on the test set
col = 'MaturitySize'
df_dummies = pd.get_dummies(df_test[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_test = pd.concat([df_test, df_dummies], axis=1)

2    6879
1    2261
3     842
4      18
Name: MaturitySize, dtype: int64


## Feature: FurLength

In [23]:
print(df_train['FurLength'].value_counts())

# apply dummies on the training set
col = 'FurLength'
df_dummies = pd.get_dummies(df_train[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_train = pd.concat([df_train, df_dummies], axis=1)

# apply the same dummies on the test set
col = 'FurLength'
df_dummies = pd.get_dummies(df_test[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_test = pd.concat([df_test, df_dummies], axis=1)

1    5865
2    3591
3     544
Name: FurLength, dtype: int64


## Feature: Vaccinated

In [24]:
# print(df_train['Vaccinated'].value_counts())

# # apply dummies on the training set
# col = 'Vaccinated'
# df_dummies = pd.get_dummies(df_train[col])
# df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
# df_train = pd.concat([df_train, df_dummies], axis=1)

# # apply the same dummies on the test set
# col = 'Vaccinated'
# df_dummies = pd.get_dummies(df_test[col])
# df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
# df_test = pd.concat([df_test, df_dummies], axis=1)

## Feature: Dewormed

In [25]:
print(df_train['Dewormed'].value_counts())

# apply dummies on the training set
col = 'Dewormed'
df_dummies = pd.get_dummies(df_train[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_train = pd.concat([df_train, df_dummies], axis=1)

# apply the same dummies on the test set
col = 'Dewormed'
df_dummies = pd.get_dummies(df_test[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_test = pd.concat([df_test, df_dummies], axis=1)

1    5625
2    3200
3    1175
Name: Dewormed, dtype: int64


## Feature: Sterilized

In [26]:
print(df_train['Sterilized'].value_counts())

# apply dummies on the training set
col = 'Sterilized'
df_dummies = pd.get_dummies(df_train[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_train = pd.concat([df_train, df_dummies], axis=1)

# apply the same dummies on the test set
col = 'Sterilized'
df_dummies = pd.get_dummies(df_test[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_test = pd.concat([df_test, df_dummies], axis=1)

2    6737
1    2080
3    1183
Name: Sterilized, dtype: int64


## Feature: Health

In [27]:
# apply dummies on the training set
col = 'Health'
df_dummies = pd.get_dummies(df_train[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_train = pd.concat([df_train, df_dummies], axis=1)

# apply the same dummies on the test set
col = 'Health'
df_dummies = pd.get_dummies(df_test[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_test = pd.concat([df_test, df_dummies], axis=1)

## Feature: State

In [28]:
print(df_train['State'].value_counts())

# apply dummies on the training set
col = 'State'
df_dummies = pd.get_dummies(df_train[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_train = pd.concat([df_train, df_dummies], axis=1)

# apply the same dummies on the test set
col = 'State'
df_dummies = pd.get_dummies(df_test[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_test = pd.concat([df_test, df_dummies], axis=1)

41326    5790
41401    2567
41327     574
41336     342
41330     275
41332     175
41324      96
41325      72
41335      54
41345      16
41361      14
41367      12
41342      11
41415       2
Name: State, dtype: int64


# Feature: RescuerID

In [29]:
print(df_train['RescuerID'].value_counts().head(15))

fa90fa5b1ee11c86938398b60abc32cb    307
aa66486163b6cbc25ea62a34b11c9b91    205
c00756f2bdd8fa88fc9f07a8309f7d5d    167
b53c34474d9e24574bcec6a3d3306a0d    152
ee2747ce26468ec44c7194e7d1d9dad9    113
95481e953f8aed9ec3d16fc4509537e8     95
a042471e0f43f2cf707104a1a138a7df     70
b770bac0ca797cf1433c48a35d30c4cb     70
fd970cc91d06d82eebf046340137b272     62
438a9bdce8ef4d5948fc40e422d34d0d     57
7ed6d84e2e6879245e55447aee39c328     57
e62135526c27156b8479420aad166317     52
8b6c5cd067ada5f54ca5ffc7f7b5d896     48
cccb18b8f8b81862f9a1ebc65d651d22     46
530f57b53cb3199e1d5e67733ddc0876     44
Name: RescuerID, dtype: int64


In [30]:
# df_train['fa90fa5b1ee11c86938398b60abc32cb'] = np.where(df_train['RescuerID'].str.contains("fa90fa5b1ee11c86938398b60abc32cb")==True, '1', '0')
# df_train['fa90fa5b1ee11c86938398b60abc32cb'] = df_train['fa90fa5b1ee11c86938398b60abc32cb'].astype(int)

# df_test['fa90fa5b1ee11c86938398b60abc32cb'] = np.where(df_test['RescuerID'].str.contains("fa90fa5b1ee11c86938398b60abc32cb")==True, '1', '0')
# df_test['fa90fa5b1ee11c86938398b60abc32cb'] = df_test['fa90fa5b1ee11c86938398b60abc32cb'].astype(int)

# df_test['aa66486163b6cbc25ea62a34b11c9b91'] = np.where(df_test['RescuerID'].str.contains("aa66486163b6cbc25ea62a34b11c9b91")==True, '1', '0')
# df_test['aa66486163b6cbc25ea62a34b11c9b91'] = df_test['aa66486163b6cbc25ea62a34b11c9b91'].astype(int)

# df_train['aa66486163b6cbc25ea62a34b11c9b91'] = np.where(df_train['RescuerID'].str.contains("aa66486163b6cbc25ea62a34b11c9b91")==True, '1', '0')
# df_train['aa66486163b6cbc25ea62a34b11c9b91'] = df_train['aa66486163b6cbc25ea62a34b11c9b91'].astype(int)

# df_test['c00756f2bdd8fa88fc9f07a8309f7d5d'] = np.where(df_test['RescuerID'].str.contains("c00756f2bdd8fa88fc9f07a8309f7d5d")==True, '1', '0')
# df_test['c00756f2bdd8fa88fc9f07a8309f7d5d'] = df_test['c00756f2bdd8fa88fc9f07a8309f7d5d'].astype(int)

# df_train['c00756f2bdd8fa88fc9f07a8309f7d5d'] = np.where(df_train['RescuerID'].str.contains("c00756f2bdd8fa88fc9f07a8309f7d5d")==True, '1', '0')
# df_train['c00756f2bdd8fa88fc9f07a8309f7d5d'] = df_train['c00756f2bdd8fa88fc9f07a8309f7d5d'].astype(int)

# df_test['b53c34474d9e24574bcec6a3d3306a0d'] = np.where(df_test['RescuerID'].str.contains("b53c34474d9e24574bcec6a3d3306a0d")==True, '1', '0')
# df_test['b53c34474d9e24574bcec6a3d3306a0d'] = df_test['b53c34474d9e24574bcec6a3d3306a0d'].astype(int)

# df_train['b53c34474d9e24574bcec6a3d3306a0d'] = np.where(df_train['RescuerID'].str.contains("b53c34474d9e24574bcec6a3d3306a0d")==True, '1', '0')
# df_train['b53c34474d9e24574bcec6a3d3306a0d'] = df_train['b53c34474d9e24574bcec6a3d3306a0d'].astype(int)


## Feature: Description

In [31]:
# # Start with one review:
# text = df_train.Description[0]


# text = ' '.join(df_train['Description'])
# print ("There are {} words in the combination of all review.".format(len(text)))

# # Create stopword list:
# stopwords = set(STOPWORDS)
# stopwords.update(["drink", "now", "wine", "flavor", "flavors"])


# # Create and generate a word cloud image:
# wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)

# # Display the generated image:
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis("off")
# plt.show()

In [32]:
df_train['loving'] = np.where(df_train['Description'].str.contains("loving")==True, '1', '0')
df_train['loving'] = df_train['loving'].astype(int)

df_test['loving'] = np.where(df_test['Description'].str.contains("loving")==True, '1', '0')
df_test['loving'] = df_test['loving'].astype(int)

df_train['adopt'] = np.where(df_train['Description'].str.contains("adopt")==True, '1', '0')
df_train['adopt'] = df_train['adopt'].astype(int)

df_test['adopt'] = np.where(df_test['Description'].str.contains("adopt")==True, '1', '0')
df_test['adopt'] = df_test['adopt'].astype(int)

df_train['adopted'] = np.where(df_train['Description'].str.contains("adopted")==True, '1', '0')
df_train['adopted'] = df_train['adopted'].astype(int)

df_test['adopted'] = np.where(df_test['Description'].str.contains("adopted")==True, '1', '0')
df_test['adopted'] = df_test['adopted'].astype(int)


In [33]:
print(df_train.columns)

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed',
       'Type_1', 'Type_2', 'Name Status', 'M1', 'M13-M24', 'M168+', 'M2',
       'M212-M168', 'M25-M72', 'M3', 'M4', 'M5', 'M6', 'M7-M12', 'M73-M120',
       'T1', 'T10', 'T11', 'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18',
       'T19', 'T2', 'T20', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'TX',
       'Gender_1', 'Gender_2', 'Gender_3', 'MaturitySize_1', 'MaturitySize_2',
       'MaturitySize_3', 'MaturitySize_4', 'FurLength_1', 'FurLength_2',
       'FurLength_3', 'Dewormed_1', 'Dewormed_2', 'Dewormed_3', 'Sterilized_1',
       'Sterilized_2', 'Sterilized_3', 'Health_1', 'Health_2', 'Health_3',
       'State_41324', 'State_41325', 'State_41326', 'State_41327',
       'State_41330', 'State

# Define Train & Test Data for Data Modeling

In [34]:
feature = list(df_train)

# items to be removed 
unwanted_feature = {'Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', '',
       'Sterilized', 'Health', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'AdoptionSpeed', 'Color'}


list1 = [ele for ele in feature if ele not in unwanted_feature]

# list1

In [35]:
# select the columns
X_columns = list1
y_column = ['AdoptionSpeed']

In [36]:
# split the data using sklearn

threshold = 0.66
X = df_train[X_columns]
y = df_train[y_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (6600, 76)
y_train (6600, 1)
X_test (3400, 76)
y_test (3400, 1)


# Model Experiment

In [37]:
def model_training(model_name, model, X_train, y_train):
    model.fit(X_train, y_train)
    return model
    
def model_prediction(model, X_test):
    y_pred = model.predict(X_test)
    return y_pred

def model_evaluation(model_name, y_test, y_pred):
    kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
    print(model_name)
    print('kappa', round(kappa, 4))
#     plt.scatter(y_test, y_pred, alpha=0.3)
#     plt.plot(range(0,5000000, 100), range(0,5000000, 100), '--r', alpha=0.3, label='Line1')
#     plt.title(model_name)
#     plt.xlabel('True Value')
#     plt.ylabel('Predict Value')
#     plt.xlim([0, 5000000])
#     plt.ylim([0, 5000000])
#     plt.show()
    print('')

def run_experiment(model_name, model, X_train, y_train, X_test):
    train_model = model_training(model_name, model, X_train, y_train)
    predictions = model_prediction(train_model, X_test)
    model_evaluation(model_name, y_test, predictions)
    
# run_experiment('KNN', KNeighborsClassifier(), X_train, y_train, X_test)
# run_experiment('SVC Kernel', SVC(kernel="linear", C=0.025), X_train, y_train, X_test)
# run_experiment('SVC Gamma', SVC(kernel="linear", C=0.025), X_train, y_train, X_test)

# run_experiment('DecisionTree', DecisionTreeClassifier(), X_train, y_train, X_test)
# run_experiment('RandomForest 10', RandomForestClassifier(10), X_train, y_train, X_test)
# run_experiment('RandomForest 100', RandomForestClassifier(100), X_train, y_train, X_test)
# run_experiment('GradientBoosting', GradientBoostingClassifier(), X_train, y_train, X_test)
# run_experiment('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis(), X_train, y_train, X_test)
# run_experiment('GaussianProcess', GaussianProcessClassifier(), X_train, y_train, X_test)
# run_experiment('GaussianNB', GaussianNB(), X_train, y_train, X_test)

# run_experiment('Gradient Boosting', GradientBoostingRegressor(), X_train, y_train, X_test)


# Model Training

In [38]:
# train a GradientBoosting Classifier
model = GradientBoostingClassifier()
model.fit(X_train, y_train.values.ravel())
y_pred = model.predict(X_test)

# Model Evaluation

In [39]:
kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
print('kappa', round(kappa, 4))
print(confusion_matrix(y_test, y_pred))

kappa 0.3214
[[  1  28  27   8  28]
 [  3 177 300  58 162]
 [  3 142 381 118 253]
 [  0  98 266 128 257]
 [  2  69 186  64 641]]


Using Cross Validation

In [40]:
k = 3
results = []
kf = KFold(n_splits=k)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    model.fit(X_train, y_train.ravel())
    y_pred = model.predict(X_test)
    kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
    results.append(round(kappa, 4))

print('Kappa for each fold:', results)
print('AVG(kappa)', round(np.mean(results), 4))
print('STD(kappa)', round(np.std(results), 4))

Kappa for each fold: [0.3626, 0.3257, 0.3148]
AVG(kappa) 0.3344
STD(kappa) 0.0205


# Prepare your submission

In [41]:
df_prediction = df_test[X_columns]
df_test['AdoptionSpeed'] = model.predict(df_prediction)
df_test[['PetID', 'AdoptionSpeed']]

Unnamed: 0,PetID,AdoptionSpeed
0,f42161740,2
1,0118db3a8,4
2,e5164d828,2
3,5335bfb38,4
4,ff2cf88a0,4
5,1d13441b9,2
6,7d835cf7c,3
7,577d15fea,2
8,91736f444,4
9,db194aec8,2


In [42]:
df_test[['PetID', 'AdoptionSpeed']].to_csv('Submission/submission_5.csv', index=False)