In [2]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from config import db_password
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
df=pd.read_csv('../../all_records.csv')

In [None]:
df=df.drop(columns=['Unnamed: 0','Name_outcome','MonthYear_intake','MonthYear_outcome',
                    'gender_intake','gender_outcome','fixed_intake','fixed_outcome','Days_length','Outcome_Subtype',
                   'retriever','shepherd','beagle','terrier','boxer','poodle','rottweiler','dachshund','chihuahua',
                   'pit bull','Age','Age_upon_Outcome','Found_Location'])
df.head()

# Preprocessing Data

In [None]:
# Replace Name_intake with a flag: 'Yes' when a name is present, 'No' when it is null.
has_name = df['Name_intake'].notnull()
df['Name_intake'] = has_name.map({True: 'Yes', False: 'No'})
df['Name_intake'].value_counts()

In [None]:
df.isnull().sum()

In [None]:
df=df.dropna()

In [None]:
# Check for duplicates rows
count=0
for i in df['Animal ID'].duplicated():
    if i is True:
        count+=1
count

In [None]:
df=df.drop_duplicates(subset='Animal ID')

In [None]:
# Combine Died & Disposal Outcomes
df.loc[df.Outcome_Type == 'Disposal', 'Outcome_Type'] = 'Died'
df.Outcome_Type.value_counts()

In [None]:
# Combine Transfer & Relocate Outcomes
df.loc[df.Outcome_Type == 'Relocate', 'Outcome_Type'] = 'Transfer'
df.Outcome_Type.value_counts()

In [None]:
#### Combine Rto-Adopt & Return to Owner
df.loc[df.Outcome_Type == 'Rto-Adopt', 'Outcome_Type'] = 'Return to Owner'
df.Outcome_Type.value_counts()

In [None]:
df.loc[df.Outcome_Type == 'Missing', 'Outcome_Type'] = None
df.Outcome_Type.value_counts()

In [None]:
df.loc[df.Outcome_Type == 'Euthanasia', 'Outcome_Type'] = 'Died'
df.Outcome_Type.value_counts()

In [None]:
df.isnull().sum()

In [None]:
df.dropna()

In [None]:
# Update DateTime_intake to datatime datatype
df.DateTime_intake = pd.to_datetime(df.DateTime_intake, format='%Y-%m-%d')

In [None]:
# Update DateTime_outcome to datatime datatype
df.DateTime_outcome = pd.to_datetime(df.DateTime_outcome, format='%Y-%m-%d')

In [None]:
# Create a new column and calculate the length of stay
df['datetime_length']=df['DateTime_outcome']-df['DateTime_intake']

In [None]:
# Create a temporary 'int' column and convert datetime to int
df['int'] = df['datetime_length'].astype(np.int64)

In [None]:
# replace negative values to nan
df.loc[df.int < 0, 'int'] = None

In [None]:
# Check for null values
df.isnull().sum()

In [None]:
# drop null values
df=df.dropna()

In [None]:
# drop temporary 'int' column and original DateTime_length column
df=df.drop(columns=['DateTime_length','datetime_length'])

# Rename calculated length of stay column
df.rename(columns = {'int':'DateTime_length'}, inplace = True)

In [None]:
# Combine Aged and Feral with Other for IntakeCondition
df.loc[(df.IntakeCondition == 'Aged') | (df.IntakeCondition == 'Feral'), 'IntakeCondition'] = 'Other'

In [None]:
# Combine Injured and Sick for Intake Condition
df.loc[(df.IntakeCondition == 'Injured') | (df.IntakeCondition == 'Sick'), 'IntakeCondition'] = 'Medical'

In [None]:
# Combine Pregnant and Nursing for  IntakeCondition
df.loc[(df.IntakeCondition == 'Nursing') | (df.IntakeCondition == 'Pregnant'), 'IntakeCondition' ] = 'Maternity'
df.IntakeCondition.value_counts()

In [None]:
# Combine Bird and Livestock with Other for Animal_Type_intake
df.loc[(df.Animal_Type_intake == 'Bird') | (df.Animal_Type_intake == 'Livestock'), 'Animal_Type_intake'] = 'Other'
df.Animal_Type_intake.value_counts()

In [None]:
df.rename(columns={'Sex': 'Sex_Intake'}, inplace=True)

In [None]:
# Group raw Breed_intake values into broad breed categories.

# Breed names that are compared by exact equality (not substring) for each group.
BULLY_EXACT = {'Chinese Sharpei', 'Dogo Argentino', 'Boxer'}
WILDLIFE_EXACT = {
    'Bat', 'Raccoon', 'Opossum', 'Duck', 'Fox', 'Grackle', 'Hawk', 'Coyote',
    'Pigeon', 'Dove', 'Armadillo', 'Owl', 'Skunk', 'Squirrel', 'Mockingbird',
    'Heron', 'Sparrow',
}
SHEPHERD_EXACT = {'German Shepherd', 'Belgian Malinois'}
SPORTING_EXACT = {
    'Vizsla', 'Brittany', 'Shiba Inu', 'Dalmatian', 'Standard Poodle',
    'Carolina Dog', 'Weimaraner',
}
HOUND_EXACT = {
    'Beagle', 'Saluki', 'Catahoula', 'Black Mouth Cur', 'Harrier', 'Blue Lacy',
    'Treeing Tennesse Brindle', 'Whippet',
}
HUSKY_EXACT = {'Finnish Spitz', 'Samoyed', 'Keeshond', 'Jindo'}
TOY_EXACT = {
    'Shih Tzu', 'Miniature Poodle', 'Miniature Schnauzer', 'Maltese',
    'Pomeranian', 'Lhasa Apso', 'Toy Poodle', 'Pekingese', 'Bichon Frise',
    'West Highland', 'Papillon', 'Havanese', 'Japanese Chin', 'Dandie Dinmont',
    'Bruss Griffon', 'Coton De Tulear', 'Pug', 'Miniature Pinscher', 'Pbgv',
}
HERDING_EXACT = {'English Shepherd', 'Dutch Shepherd', 'Beauceron', 'Hovawart'}
WORKING_EXACT = {
    'Chow Chow', 'Standard Schnauzer', 'Basenji', 'Rottweiler', 'Doberman Pinsch',
}
# NOTE(review): 'Burmese' is usually a cat breed name — confirm it belongs here.
XLARGE_EXACT = {
    'Great Pyrenees', 'Great Dane', 'Anatol Shepherd', 'Cane Corso',
    'Rhod Ridgeback', 'Akita', 'Mastiff', 'Presa Canario', 'Burmese',
    'Leonberger', 'Greater Swiss Mountain Dog', 'Boerboel', 'Landseer',
}
OTHER_CAT_EXACT = {
    'Persian', 'Siamese', 'Domestic Longhair', 'Himalayan', 'Russian Blue',
    'Bengal', 'Devon Rex', 'Sphynx', 'British Shorthair', 'Manx',
}


def _contains_any(text, fragments):
    """Return True when at least one of `fragments` is a substring of `text`."""
    return any(fragment in text for fragment in fragments)


def classify_breed(breed):
    """Map one raw breed string to its breed group.

    Rules are evaluated top to bottom and the first match wins, so the order
    of the checks below is significant (for example anything containing
    'Mix' or '/' is 'Mix' regardless of the breeds it names).

    Conditions that could never fire have been removed without changing any
    result: 'Cavalier Span' (already matched by the 'Span' substring rule),
    'French Bulldog' (already matched by the 'Bull' substring rule), a second
    'Chinese Sharpei' check in the working group, and a duplicated 'Persian'.

    Parameters
    ----------
    breed : str
        Raw breed value. Non-string values raise TypeError on the first
        substring test.

    Returns
    -------
    str
        The breed group label; 'Other' when no rule matches.
    """
    if _contains_any(breed, ('Mix', '/')):
        return 'Mix'
    if 'Chihuahua' in breed:
        return 'Chihuahua'
    if 'Retriever' in breed or breed == 'Chesa Bay Retr':
        return 'Retriever Breeds'
    if _contains_any(breed, ('Bull', 'Staffordshire')) or breed in BULLY_EXACT:
        return 'Bully Breeds'
    if 'Terrier' in breed:
        return 'Terrier Breeds'
    if breed in WILDLIFE_EXACT:
        return 'Wildlife'
    if breed in SHEPHERD_EXACT:
        return 'German Shepherd'
    if _contains_any(breed, ('Pointer', 'Span')) or breed in SPORTING_EXACT:
        return 'Sporting Breeds'
    # NOTE(review): the 'Coon' substring would also match a value such as
    # 'Maine Coon' if the data contains one — confirm that is intended.
    if _contains_any(breed, ('hound', 'Hound', 'Coon')) or breed in HOUND_EXACT:
        return 'Hound Breeds'
    if _contains_any(breed, ('Husky', 'Malamute', 'Eskimo')) or breed in HUSKY_EXACT:
        return 'Husky Breeds'
    if 'Dachshund' in breed or breed in TOY_EXACT:
        return 'Toy Breeds'
    if (_contains_any(breed, ('Australian', 'Collie', 'Heeler', 'Sheepdog', 'Corgi'))
            or breed in HERDING_EXACT):
        return 'Herding Breeds'
    if breed in WORKING_EXACT:
        return 'Working Breeds'
    if _contains_any(breed, ('Bernese', 'Bernard')) or breed in XLARGE_EXACT:
        return 'X Large Breeds'
    # These two cat breeds keep their own name as the group label.
    if breed in ('Domestic Shorthair', 'Domestic Medium Hair'):
        return breed
    if breed in OTHER_CAT_EXACT:
        return 'Other Cat'
    return 'Other'


# One label per row, in row order (kept as a list for the assignment cell below).
new_breed = [classify_breed(breed) for breed in df['Breed_intake']]
# Preview only; displaying the full list would flood the cell output.
new_breed[:10]

In [None]:
df["new_breed"]=new_breed

In [None]:
breed_counts=df['new_breed'].value_counts()
breed_counts

In [None]:
df.loc[df.new_breed == 'Retriever Breeds', 'new_breed'] = 'Sporting Breeds'
df.loc[df.new_breed == 'Chihuahua', 'new_breed'] = 'Toy Breeds'
df.loc[df.new_breed == 'German Shepherd', 'new_breed'] = 'Working Breeds'
df.loc[df.new_breed == 'Husky Breeds', 'new_breed'] = 'Sporting Breeds'
df['new_breed'].value_counts()

In [None]:
# Collapse breed groups with fewer than MIN_BREED_COUNT rows into "Other".
# The counts are recomputed here on purpose: breed_counts was captured before
# several groups were merged, so thresholding on it could send a merged group
# to "Other" based on its smaller pre-merge size.
MIN_BREED_COUNT = 150
current_breed_counts = df['new_breed'].value_counts()
replace_breed = current_breed_counts[current_breed_counts < MIN_BREED_COUNT].index.tolist()

# Single vectorised pass instead of one replace() call per rare group.
is_rare_breed = df['new_breed'].isin(replace_breed)
df['new_breed'] = df['new_breed'].where(~is_rare_breed, "Other")

df.new_breed.value_counts()

In [None]:
# Replace the raw Breed_intake column with the grouped one, named Breed_Type.
df = (
    df
    .drop(columns=['Breed_intake'])
    .rename(columns={"new_breed": "Breed_Type"})
)
df.head()

In [None]:
# Ordered simplification rules for Color_intake; the first rule that matches wins.
#   ('contains', text, label)  -> matches when `text` is a substring of the color
#   ('equals', names, label)   -> matches when the color equals one of `names`
color_rules = [
    ('contains', "/", "Bicolor"),
    ('contains', "Tabby", 'Tabby'),
    ('contains', "Brindle", "Tabby"),
    ('contains', "Merle", "Merle"),
    ('contains', "Tiger", "Tiger"),
    ('contains', "Tortie", "Tabby"),
    ('contains', "Calico", "Tabby"),
    ('contains', "Torbie", "Tabby"),
    ('equals', ("Apricot", "Gold", "Yellow", "Fawn"), "Orange"),
    ('contains', "Blue", "Blue"),
    ('contains', "Black", "Black"),
    ('equals', ("Sable",), "Black"),
    ('equals', ("Liver",), "Brown"),
    ('contains', "Point", 'Point'),
    ('contains', "Tick", "Point"),
]

colorNew = []
for color in df.Color_intake:
    simplified = color  # colors that match no rule are kept unchanged
    for rule_kind, key, label in color_rules:
        if rule_kind == 'contains':
            matched = key in color
        else:
            matched = color in key
        if matched:
            simplified = label
            break
    colorNew.append(simplified)

distinct_colors = set(colorNew)
print(distinct_colors)
print(len(distinct_colors))

In [None]:
df['Color_intake']=colorNew

In [None]:
color_counts=df['Color_intake'].value_counts()
color_counts

In [None]:
df.loc[df.Color_intake.isin(['Red', 'Orange']), 'Color_intake'] = 'Tan'

In [None]:
# Collapse colors with fewer than MIN_COLOR_COUNT rows into "Other".
# The counts are recomputed here on purpose: color_counts was captured before
# Red and Orange were folded into Tan, so thresholding on it would judge Tan
# by its smaller pre-merge size.
MIN_COLOR_COUNT = 500
current_color_counts = df['Color_intake'].value_counts()
replace_color = current_color_counts[current_color_counts < MIN_COLOR_COUNT].index.tolist()

# Single vectorised pass instead of one replace() call per rare color.
is_rare_color = df['Color_intake'].isin(replace_color)
df['Color_intake'] = df['Color_intake'].where(~is_rare_color, "Other")

df.Color_intake.value_counts()

In [None]:
# Intake-side columns, mapped from their source name to a snake_case name.
intake_column_names = {
    'Animal ID': 'animal_id',
    'DateTime_intake': 'datetime_intake',
    'Intake_Type': 'intake_type',
    'IntakeCondition': 'intake_condition',
    'Animal_Type_intake': 'animal_type_intake',
    'Name_intake': 'name_intake',
    'Sex_Intake': 'sex_intake',
    'Color_intake': 'color_intake',
    'Breed_Type': 'breed_type',
}
columns_intake = list(intake_column_names.values())

intake_df = df[list(intake_column_names)].rename(columns=intake_column_names)
intake_df.head()

In [None]:
outcome_df=df[['Animal ID','DateTime_outcome',
              'Outcome_Type',
              'Sex_upon_Outcome','fixed_changed','Age_Bucket','DateTime_length']]

columns_outcome=['animal_id','datetime_outcome','outcome_type',
                 'sex_upon_outcome','fixed_changed','age_bucket','datetime_length']

outcome_df.columns=columns_outcome

outcome_df.head()

In [None]:
outcome_df.info()

In [None]:
# merge data
all_df = intake_df.merge(outcome_df, on='animal_id', how='left')
all_df.head()

In [None]:
all_df.info()

In [None]:
new_df=all_df.drop(columns=['animal_id','datetime_intake','datetime_outcome',
                    'sex_intake','fixed_changed'])
new_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# encode text columns with dummy values
df_encoded = pd.get_dummies(new_df, columns=['intake_type', 'intake_condition', 'animal_type_intake', 'name_intake', 
                    'color_intake', 'sex_upon_outcome', 'age_bucket', 'breed_type'])

In [None]:
le = LabelEncoder()
df_encoded['outcome_type'] = le.fit_transform(df_encoded['outcome_type'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

In [None]:
df_encoded.head()

In [None]:
# Define the features set.
X = df_encoded.copy()
X=X.drop('outcome_type',axis=1)
X.head()

In [None]:
# Define the target set.
y = df_encoded["outcome_type"].ravel()
y[:5]

In [None]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Resampling

In [None]:
from imblearn.combine import SMOTEENN

# Resample the scaled training split only; the test split is left as-is.
smote_enn = SMOTEENN(random_state=0)
resampled = smote_enn.fit_resample(X_train_scaled, y_train)
X_resampled, y_resampled = resampled

# Random Forest

In [None]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [None]:
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

In [None]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [None]:
results = pd.DataFrame({
   "Prediction": predictions,
   "Actual": y_test
}).reset_index(drop=True)
results.head()

In [None]:
# print confusion matrix
cm = confusion_matrix(predictions, y_test)
cm_df = pd.DataFrame(cm,
                     index = ['Adoption', 'Died', 'Return to Owner','Transfer'], 
                     columns = ['Adoption', 'Died', 'Return to Owner','Transfer'])
cm_df

In [None]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [None]:
print(f"Accuracy Score : {acc_score}")
# print classification report
print(classification_report(predictions, y_test, target_names=['Adoption', 'Died', 'Return to Owner','Transfer']))

In [None]:
# show feature importance
feat_imp = pd.Series(rf_model.feature_importances_, index=X.columns)
feat_imp.nlargest(10).plot(kind='bar', figsize=(12,5), cmap='Dark2', edgecolor='black', title='Feature Importance')
# plt.xticks(rotation=65)

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting hyperparameters, gathered in one place.
gb_params = dict(
    n_estimators=20,
    learning_rate=0.5,
    max_features=5,
    max_depth=3,
    random_state=0,
)
classifier = GradientBoostingClassifier(**gb_params)

classifier.fit(X_resampled, y_resampled)
# Note: this reuses the name `predictions`, replacing the random forest output.
predictions = classifier.predict(X_test_scaled)

In [None]:
# Accuracy and per-class metrics for the current `predictions`.
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")
# classification_report expects (y_true, y_pred); with the arguments reversed
# the precision and recall columns are swapped and support counts predictions.
# Label names come from the fitted encoder so they always match the codes.
print(classification_report(y_test, predictions,
                            labels=list(range(len(le.classes_))),
                            target_names=list(le.classes_)))

In [None]:
# show feature importance
feat_imp = pd.Series(classifier.feature_importances_, index=X.columns)
feat_imp.nlargest(10).plot(kind='bar', figsize=(12,5), cmap='Dark2', edgecolor='black', title='Feature Importance')
# plt.xticks(rotation=65)

['builtins', 'builtins', 'pandas', 'numpy', 'types']