# Pet Adoption Prediction:
Animal adoption rates are strongly correlated to the metadata associated with their online profiles, such as descriptive text and photo characteristics. We are trying to predict the Pet Adoption speed based on their listing on the website.

## 1. Importing all dependencies

In [1]:
#!pip install webcolors

In [2]:
import pandas as pd
import os
import json
import webcolors
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, cohen_kappa_score
import numpy as np

## 2. Image Data

### 2.1 Defining functions to get color name out of rgb values returned for images(using Google Vision API)

In [3]:
# if webcolors api doesnt return any color - get the closest color
def closest_colour(requested_colour):
    min_colours = {}
    for key, name in webcolors.css3_hex_to_names.items():
        r_c, g_c, b_c = webcolors.hex_to_rgb(key)
        rd = (r_c - requested_colour[0]) ** 2
        gd = (g_c - requested_colour[1]) ** 2
        bd = (b_c - requested_colour[2]) ** 2
        min_colours[(rd + gd + bd)] = name
    return min_colours[min(min_colours.keys())]

# using webcolors api to get color name out of rgb values
def get_colour_name(requested_colour):
    if(len(requested_colour)) < 3:
        return ""
    try:
        closest_name = actual_name = webcolors.rgb_to_name(requested_colour)
    except ValueError:
        closest_name = closest_colour(requested_colour)
        actual_name = None
    return closest_name

### 2.2 Reading the LabelAnnotations and FaceAnnotations from Image Data (using JSON returned from Google Vision API)

In [4]:
# likelihood scores dictionary
likelihood_scores = {'UNKNOWN': '0', 'VERY_UNLIKELY': 0, 'UNLIKELY': 1, 'POSSIBLE': 2, 'LIKELY': 3, 'VERY_LIKELY': 4}


In [5]:
# reading all file names from a directory
def read_json(path):
    json_files = [pos_json for pos_json in os.listdir(path)]
    print("Total Number of JSON Files: ", len(json_files))
    return json_files

### 2.2.1 Finding out the most common Labels in Images

In [6]:
# Parsing all JSONs and finding out the most frequent labels in the images.
def read_most_common_labels(json_files, cutoff, path_to_json):
    all_labels = []
    for index, js in enumerate(json_files):
        with open(os.path.join(path_to_json, js)) as json_file:
            json_text = json.load(json_file)
            labelAnnotations = json_text.get('labelAnnotations', 'N/A')
            if labelAnnotations != 'N/A':
                for label in labelAnnotations:
                    all_labels.append(label['description'])

    unique_labels = set(all_labels)

    important_words = {}
    for l in unique_labels:
        count = all_labels.count(l)
        word = [l]
        if count in important_words:
            important_words[count].append(l)
        else:
            important_words[count] = word

    # sorted_d = sorted(important_words.items(), key=operator.itemgetter(0), reverse=True)

    filtered_words = {k: v for k, v in important_words.items() if k > cutoff}
    return filtered_words

In [7]:
path_to_json = 'image/train_metadata/'
files = read_json(path_to_json)

Total Number of JSON Files:  58311


In [8]:
# cutoff is set to 8000 words
filtered_words = read_most_common_labels(files, 8000, path_to_json)
filtered_words

{26729: ['whiskers'],
 27037: ['small to medium sized cats'],
 20109: ['domestic short haired cat'],
 29716: ['dog'],
 10215: ['european shorthair'],
 26892: ['cat like mammal'],
 13295: ['puppy'],
 30836: ['carnivoran'],
 13057: ['fauna'],
 8049: ['aegean cat'],
 30544: ['dog breed group'],
 27213: ['cat'],
 28375: ['snout'],
 10308: ['street dog'],
 30470: ['dog breed'],
 30799: ['dog like mammal'],
 8778: ['sporting group'],
 20762: ['kitten']}

In [9]:
label_columns = [v for k, [v] in filtered_words.items()]
print(label_columns)

['whiskers', 'small to medium sized cats', 'domestic short haired cat', 'dog', 'european shorthair', 'cat like mammal', 'puppy', 'carnivoran', 'fauna', 'aegean cat', 'dog breed group', 'cat', 'snout', 'street dog', 'dog breed', 'dog like mammal', 'sporting group', 'kitten']


### 2.2.2 Constructing a DataFrame reading all JSON Files containing Image Metadata

In [10]:
def construct_df_from_json(json_files, df, path_to_json):
    for index, js in enumerate(json_files):
        with open(os.path.join(path_to_json, js)) as json_file:
            json_text = json.load(json_file)
            #The json file name is in the format petid-photonum
            full_id = js.partition('.json')[0].partition('-')
            pet_id = full_id[0]
            photo_num = full_id[2]
            # Face Annotations and Likelihood Scores
            faceAnnotations = json_text.get('faceAnnotations', 'N/A')
            detectionConfidence = 0
            joyLikelihood = 0
            sorrowLikelihood = 0
            angerLikelihood = 0
            surpriseLikelihood = 0
            underExposedLikelihood = 0
            blurredLikelihood = 0
            headwearLikelihood = 0

            if faceAnnotations != 'N/A':
                detectionConfidence = faceAnnotations[0]['detectionConfidence']
                joyLikelihood = likelihood_scores[faceAnnotations[0]['joyLikelihood']]
                sorrowLikelihood = likelihood_scores[faceAnnotations[0]['sorrowLikelihood']]
                angerLikelihood = likelihood_scores[faceAnnotations[0]['angerLikelihood']]
                surpriseLikelihood = likelihood_scores[faceAnnotations[0]['surpriseLikelihood']]
                underExposedLikelihood = likelihood_scores[faceAnnotations[0]['underExposedLikelihood']]
                blurredLikelihood = likelihood_scores[faceAnnotations[0]['blurredLikelihood']]
                headwearLikelihood = likelihood_scores[faceAnnotations[0]['headwearLikelihood']]
            #Detection Score of Popular Labels
            labelcount = 0
            labels = ''
            dog_like_mammal = 0
            kitten = 0
            puppy = 0
            cat_like_mammal = 0
            european_shorthair = 0
            fauna = 0
            carnivoran = 0
            cat = 0
            snout = 0
            domestic_short_haired_cat = 0
            dog = 0
            dog_breed_group = 0
            small_to_medium_sized_cats = 0
            aegean_cat = 0
            sporting_group = 0
            street_dog = 0
            whiskers = 0
            dog_breed = 0

            labelAnnotations = json_text.get('labelAnnotations', 'N/A')
            if labelAnnotations != 'N/A':
                for label in labelAnnotations:
                    labelcount += 1
                    labels = labels + label['description']
                    if label['description'] == 'dog like mammal':
                        dog_like_mammal = label['score']
                    elif label['description'] == 'kitten':
                        kitten = label['score']
                    elif label['description'] == 'puppy':
                        puppy = label['score']
                    elif label['description'] == 'cat like mammal':
                        cat_like_mammal = label['score']
                    elif label['description'] == 'european shorthair':
                        european_shorthair = label['score']
                    elif label['description'] == 'fauna':
                        fauna = label['score']
                    elif label['description'] == 'carnivoran':
                        carnivoran = label['score']
                    elif label['description'] == 'cat':
                        cat = label['score']
                    elif label['description'] == 'snout':
                        snout = label['score']
                    elif label['description'] == 'domestic short haired cat':
                        domestic_short_haired_cat = label['score']
                    elif label['description'] == 'dog':
                        dog = label['score']
                    elif label['description'] == 'dog breed group':
                        dog_breed_group = label['score']
                    elif label['description'] == 'small to medium sized cats':
                        small_to_medium_sized_cats = label['score']
                    elif label['description'] == 'aegean cat':
                        aegean_cat = label['score']
                    elif label['description'] == 'sporting group':
                        sporting_group = label['score']
                    elif label['description'] == 'street dog':
                        street_dog = label['score']
                    elif label['description'] == 'whiskers':
                        whiskers = label['score']
                    elif label['description'] == 'dog breed':
                        dog_breed = label['score']

            cropHintsConfidence = 0
            cropHintsAnnotation = json_text.get('cropHintsAnnotation', 'N/A')
            if cropHintsAnnotation != 'N/A':
                cropHintsConfidence = cropHintsAnnotation['cropHints'][0]['confidence']

            color1 = ""
            color2 = ""
            color3 = ""
            colors_in_image = 0

            imagePropertiesAnnotation = json_text.get('imagePropertiesAnnotation', 'N/A')
            if imagePropertiesAnnotation != 'N/A':
                colorlist = imagePropertiesAnnotation['dominantColors']['colors']
                colors_in_image = len(colorlist)
                colorlist = sorted(colorlist, key=lambda k: k['score'], reverse=True)[:3]
                # Checking if colors in the csv data matches colors in Images
                if colors_in_image > 0:
                    color1 = get_colour_name(tuple(colorlist[0]['color'][e] for e in colorlist[0]['color']))
                if colors_in_image > 1:
                    color2 = get_colour_name(tuple(colorlist[1]['color'][e] for e in colorlist[1]['color']))
                if colors_in_image > 2:
                    color3 = get_colour_name(tuple(colorlist[2]['color'][e] for e in colorlist[2]['color']))

            df.loc[index] = [pet_id, photo_num, detectionConfidence, joyLikelihood,
                             sorrowLikelihood,
                             angerLikelihood, surpriseLikelihood,
                             underExposedLikelihood,
                             blurredLikelihood, headwearLikelihood,
                             labels, labelcount, cropHintsConfidence, color1, color2,
                             color3, colors_in_image,
                             street_dog,
                             dog_like_mammal, aegean_cat, carnivoran,
                             small_to_medium_sized_cats,
                             cat_like_mammal, dog_breed, snout,
                             whiskers, domestic_short_haired_cat,
                             puppy, dog_breed_group,
                             fauna, kitten, european_shorthair, dog, sporting_group, cat]

    return df


In [11]:
# Train Images
df_columns = ['petid', 'photo_num', 'detectionConfidence', 'joyLikelihood',
              'sorrowLikelihood', 'angerLikelihood', 'surpriseLikelihood',
              'underExposedLikelihood', 'blurredLikelihood', 'headwearLikelihood',
              'labels', 'labelcount', 'cropHintsConfidence', 'color1', 'color2', 'color3', 'colors_in_image',
              'street dog',
              'dog like mammal', 'aegean cat',
              'carnivoran',
              'small to medium sized cats', 'cat like mammal', 'dog breed', 'snout',
              'whiskers', 'domestic short haired cat', 'puppy', 'dog breed group',
              'fauna', 'kitten', 'european shorthair', 'dog', 'sporting group', 'cat']

pet_image_data = pd.DataFrame(columns=df_columns)
pet_image_data = construct_df_from_json(files, pet_image_data, path_to_json)
pet_image_data.to_pickle("image/pet_image_data_trainv1.pkl")

In [12]:
# Test Images
path_to_json = 'image/test_metadata/'
test_files = read_json(path_to_json)
df_columns = ['petid', 'photo_num', 'detectionConfidence', 'joyLikelihood',
              'sorrowLikelihood', 'angerLikelihood', 'surpriseLikelihood',
              'underExposedLikelihood', 'blurredLikelihood', 'headwearLikelihood',
              'labels', 'labelcount', 'cropHintsConfidence', 'color1', 'color2', 'color3', 'colors_in_image',
              'street dog',
              'dog like mammal', 'aegean cat',
              'carnivoran',
              'small to medium sized cats', 'cat like mammal', 'dog breed', 'snout',
              'whiskers', 'domestic short haired cat', 'puppy', 'dog breed group',
              'fauna', 'kitten', 'european shorthair', 'dog', 'sporting group', 'cat']
pet_image_data_test = pd.DataFrame(columns=df_columns)
pet_image_data_test = construct_df_from_json(test_files, pet_image_data_test, path_to_json)
pet_image_data_test.to_pickle("image/pet_image_data_testv1.pkl")

Total Number of JSON Files:  15040


### 2.3 Cleaning the Image Data

In [13]:
def clean_image(df):
    df['photo_num'] = df['photo_num'].astype('int')
    df['detectionConfidence'] = df['detectionConfidence'].astype('float')
    df['cropHintsConfidence'] = df['cropHintsConfidence'].astype('float')
    df['joyLikelihood'] = df['joyLikelihood'].astype('int')
    df['colors_in_image'] = df['colors_in_image'].astype('int')
    df['sorrowLikelihood'] = df['sorrowLikelihood'].astype('int')
    df['angerLikelihood'] = df['angerLikelihood'].astype('int')
    df['surpriseLikelihood'] = df['surpriseLikelihood'].astype('int')
    df['underExposedLikelihood'] = df['underExposedLikelihood'].astype('int')
    df['blurredLikelihood'] = df['blurredLikelihood'].astype('int')
    df['headwearLikelihood'] = df['headwearLikelihood'].astype('int')
    df['labelcount'] = df['labelcount'].astype('int')
    df['street dog'] = df['street dog'].astype('float')
    df['dog like mammal'] = df['dog like mammal'].astype('float')
    df['aegean cat'] = df['aegean cat'].astype('float')
    df['carnivoran'] = df['carnivoran'].astype('float')
    df['small to medium sized cats'] = df['small to medium sized cats'].astype('float')
    df['cat like mammal'] = df['cat like mammal'].astype('float')
    df['dog breed'] = df['dog breed'].astype('float')
    df['snout'] = df['snout'].astype('float')
    df['whiskers'] = df['whiskers'].astype('float')
    df['domestic short haired cat'] = df['domestic short haired cat'].astype('float')
    df['puppy'] = df['puppy'].astype('float')
    df['dog breed group'] = df['dog breed group'].astype('float')
    df['fauna'] = df['fauna'].astype('float')
    df['kitten'] = df['kitten'].astype('float')
    df['european shorthair'] = df['european shorthair'].astype('float')
    df['dog'] = df['dog'].astype('float')
    df['sporting group'] = df['sporting group'].astype('float')
    df['cat'] = df['cat'].astype('float')
    return df

In [14]:
# Reading the first version of Image data in pickel files. 
train_images = pd.read_pickle("image/pet_image_data_trainv1.pkl")
test_images = pd.read_pickle("image/pet_image_data_testv1.pkl")

In [15]:
train_images = clean_image(train_images)
test_images = clean_image(test_images)

### 2.4 - Aggregating the Image Data.
Pets have more than 1 images. It is found that the first image is very useful to predict the Adoption Speed. Here we are aggregating the features of 1st Image with 80% weightage and the rest of the images have 20% weightage

In [16]:
def aggregate_photos(df):
    columns = ['petid', 'photo_num', 'labels', 'color1', 'color2', 'color3']
    categoricals = df[columns]
    categoricals = categoricals[categoricals['photo_num'] == 1]
    categoricals.drop(columns=['photo_num'], inplace=True)
    df = df.drop(columns=['labels', 'color1', 'color2', 'color3'])
    first_photo = df[df['photo_num'] == 1]
    rest_photo = df[df['photo_num'] != 1]
    first_photo = first_photo.groupby('petid').sum() * 0.8
    rest_photo = rest_photo.groupby('petid').sum() * 0.2
    all_photos = pd.concat([first_photo, rest_photo])
    all_photos = all_photos.groupby('petid').sum()
    all_photos.reset_index(inplace=True)
    all_photos = pd.merge(all_photos, categoricals, on='petid')

    all_photos.drop(columns=['photo_num'], inplace=True)

    return all_photos

In [17]:
train_images = aggregate_photos(train_images)
test_images = aggregate_photos(test_images)

In [18]:
train_images.to_pickle("image/pet_image_data_trainv2.pkl")
test_images.to_pickle("image/pet_image_data_testv2.pkl")

## 3. Working on the Structured CSV Features - train.csv and test.csv

In [19]:
train_df = pd.read_csv("csvfeatures/train.csv")
test_df = pd.read_csv("csvfeatures/test.csv")

### 3.1 Cleaning up and creating some extra Features.

In [20]:
def cleanup(df):
    df['Name'].fillna('No Name', inplace=True)
    df['NumColors'] = df.loc[:, 'Color1':'Color3'].apply(
        lambda row: bool(row.Color1) + bool(row.Color2) + bool(row.Color3), axis=1)
    df['IsMixedBreed'] = df.loc[:, 'Breed1':'Breed2'].apply(
        lambda row: bool(row.Breed1) * bool(row.Breed2), axis=1)
    name_frequency_df = pd.DataFrame(df['Name'].value_counts())
    df['NameFrequency'] = name_frequency_df.loc[df['Name'], 'Name'].values
    df['Description'].fillna('', inplace=True)
    df['WordCount'] = df['Description'].str.split().apply(lambda x: len(x))
    df = df.drop('Description', axis=1)
    return df

In [21]:
train_copy_df = cleanup(train_df)
test_copy_df = cleanup(test_df)

In [22]:
train_copy_df.to_pickle("csvfeatures/trainv1.pkl")
test_copy_df.to_pickle("csvfeatures/testv1.pkl")

### Note: We have included a separate notebook with our submission that includes the EDA and Feature Engineering steps and explains how we created the test_cleaned_Features_v2.pk and train_cleaned_Features_v2.pk files.

In [23]:
test = pd.read_pickle('cleanedData/test_cleaned_Features_v2.pk')
train = pd.read_pickle('cleanedData/train_cleaned_Features_v2.pk')


### 3.2 Some More Feature Engineering

In [24]:
# Finding out popular breeds in the data
popularBreeds = train.loc[train['AdoptionSpeed'] == 0, ['Breed1', 'Breed2']]
popularBreed1 = popularBreeds['Breed1'].unique()
popularBreed2 = popularBreeds['Breed2'].unique()

In [25]:
def eda(data):
    # Name
    data['hasName'] = data['Name'] != 'No Name'
    data['hasName'] = data['hasName'].astype('int')

    # Age
    data['isYoung'] = (data['Age'] < 2).astype('int')
    data['isOld'] = (data['Age'] > 12).astype('int')

    # Breed

    data['ispopularBreed1'] = data.apply(lambda x: x['Breed1'] in popularBreed1, axis=1).astype('int')
    data['ispopularBreed2'] = data.apply(lambda x: x['Breed2'] in popularBreed2, axis=1).astype('int')

    # mapping color with colorlabels
    colorlabels = pd.read_csv('csvfeatures/color_labels.csv')
    data['color1Name'] = data['Color1'].map(colorlabels.set_index('ColorID')['ColorName'])
    data['color2Name'] = data['Color2'].map(colorlabels.set_index('ColorID')['ColorName'])
    data['color3Name'] = data['Color3'].map(colorlabels.set_index('ColorID')['ColorName'])
    data[['color1Name', 'color2Name', 'color3Name']] = data[['color1Name', 'color2Name', 'color3Name']].fillna(
        'No Color')

    data.drop(columns=['Color1', 'Color2', 'Color3'], inplace=True)

    # Merge Image color data and see if color in image matches color in csv
    # TO DO

    # De-wormed
    data['Dewormed'] = data['Dewormed'].astype('category')

    # Fee
    data['isNotFree'] = (data['Fee'] != 0).astype('int')

    # FurLength
    data['FurLength'] = data['FurLength'].astype('category')

    # Gender
    data['Gender'] = data['Gender'].astype('category')

    # Health
    data['Health'] = data['Health'].astype('category')

    # Quantity
    data['isSingle'] = (data['Quantity'] == 1).astype('int')

    # State
    data['State'] = data['State'].astype('category')

    # Sterilized
    data['Sterilized'] = data['Sterilized'].astype('category')

    # Type
    data['Type'] = data['Type'].astype('category')

    # Vaccinated
    data['Vaccinated'] = data['Vaccinated'].astype('category')

    # Videos
    data['hasVideo'] = (data['VideoAmt'] > 1).astype('int')

    data['hasLotOfVideos'] = (data['VideoAmt'] > 5).astype('int')

    return data

In [26]:
train = eda(train)
test = eda(test)

In [27]:
train.to_pickle('cleanedData/csv_features_trainv3.pkl')
test.to_pickle('cleanedData/csv_features_testv3.pkl')

## 4. Merging the Image and CSV Features

In [28]:
train_features = pd.read_pickle('cleanedData/csv_features_trainv3.pkl')
test_features = pd.read_pickle('cleanedData/csv_features_testv3.pkl')

train_images = pd.read_pickle('image/pet_image_data_trainv2.pkl')
test_images = pd.read_pickle('image/pet_image_data_testv2.pkl')

In [29]:
def mergeandCreateFeatures(features, images):
    completeData = pd.merge(features, images, left_on='PetID', right_on='petid', how='left')
    completeData.drop(columns=['petid'], inplace=True)

    breed_labels = pd.read_csv('csvfeatures/breed_labels.csv')

    completeData['breed1Name'] = completeData['Breed1'].map(breed_labels.set_index('BreedID')['BreedName'])
    completeData['breed2Name'] = completeData['Breed2'].map(breed_labels.set_index('BreedID')['BreedName'])
    completeData['breed2Name'].fillna("not_recognized", inplace=True)

    completeData['labels'].fillna(" ", inplace=True)
    completeData['breed1Name'].fillna("not_recognized", inplace=True)

    completeData['breed1Name'] = completeData['breed1Name'].str.lower()
    completeData['breed2Name'] = completeData['breed2Name'].str.lower()
    completeData['labels'] = completeData['labels'].str.lower()

    completeData['isBreed1Matching'] = completeData[['breed1Name', 'labels']].apply(
        lambda x: x['breed1Name'] in x['labels'], axis=1).astype('int')
    completeData['isBreed2Matching'] = completeData[['breed2Name', 'labels']].apply(
        lambda x: x['breed2Name'] in x['labels'], axis=1).astype('int')

    # colors from csv features
    completeData['color1Name'] = completeData['color1Name'].str.lower()
    completeData['color2Name'] = completeData['color2Name'].str.lower()
    completeData['color3Name'] = completeData['color3Name'].str.lower()

    # colors from images
    completeData['color1'] = completeData['color1'].str.lower()
    completeData['color2'] = completeData['color2'].str.lower()
    completeData['color3'] = completeData['color3'].str.lower()

    completeData['iscolor1Matching'] = (completeData['color1'] == completeData['color1Name']).astype('int')
    completeData['iscolor2Matching'] = (completeData['color2'] == completeData['color2Name']).astype('int')
    completeData['iscolor3Matching'] = (completeData['color3'] == completeData['color3Name']).astype('int')

    # Null Values
    completeData.loc[:, 'detectionConfidence': 'cat'] = completeData.loc[:, 'detectionConfidence': 'cat'].fillna(0)
    completeData.loc[:, 'color1': 'color3'] = completeData.loc[:, 'color1': 'color3'].fillna("")

    return completeData


In [30]:
complete_train = mergeandCreateFeatures(train_features, train_images)
complete_test = mergeandCreateFeatures(test_features, test_images)

complete_train.to_pickle('cleanedData/complete_train.pkl')
complete_test.to_pickle('cleanedData/complete_test.pkl')

## 5. Modelling

## 5.1 Text Sentiment Analysis using CountVectorizer

The approach is to run a separate model on the 'Description' column that tries to prediction the Adotpion Speed between 0 to 4. Then use the probabilities returned from the model, aggregate it with the rest of the data and run further models. We are using a Model Stacking Approach

### 5.1.1 Using StratifiedKFold, generate prediction probabilities for Adoption Speed based on the Description 'Vector'

In [44]:
def Stacking(model, train, y, test, n_fold):
    folds = StratifiedKFold(n_splits=n_fold, random_state=1)
    test_pred = pd.DataFrame()
    test_pred_proba = pd.DataFrame()
    train_pred = pd.DataFrame()
    train_pred_prob = pd.DataFrame()
    
    for train_indices, val_indices in folds.split(train, y.values):
        x_train, x_val = train.iloc[train_indices], train.iloc[val_indices]
        y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]
        model.fit(X=x_train, y=y_train)
        train_pred = pd.concat([train_pred, pd.Series(model.predict(x_val))], axis=0)
        print("Accuracy Score:", round(accuracy_score(y_val, pd.Series(model.predict(x_val))) * 100, 2))
        train_pred_prob = pd.concat([train_pred_prob, pd.DataFrame(model.predict_proba(x_val))], axis=0)
        test_pred = pd.concat([test_pred, pd.Series(model.predict(test))], axis=1)
        test_pred_proba = pd.concat([test_pred_proba, pd.DataFrame(model.predict_proba(test))], axis=1)
        
    # Combine probabilities by multiplying.
    test_pred_proba['x0'] = test_pred_proba.iloc[:, 0] * test_pred_proba.iloc[:, 5]
    test_pred_proba['x1'] = test_pred_proba.iloc[:, 1] * test_pred_proba.iloc[:, 6]
    test_pred_proba['x2'] = test_pred_proba.iloc[:, 2] * test_pred_proba.iloc[:, 7]
    test_pred_proba['x3'] = test_pred_proba.iloc[:, 3] * test_pred_proba.iloc[:, 8]
    test_pred_proba['x4'] = test_pred_proba.iloc[:, 4] * test_pred_proba.iloc[:, 9]

    test_pred_proba = test_pred_proba[['x0', 'x1', 'x2', 'x3', 'x4']]

    return test_pred, train_pred, train_pred_prob, test_pred_proba

In [41]:
# Read description from the train and test data
train = pd.read_csv('csvfeatures/train.csv')
train['Description'].fillna('', inplace=True)
train['length'] = train['Description'].apply(len)
test = pd.read_csv('csvfeatures/test.csv')
test['Description'].fillna('', inplace=True)
test['length'] = test['Description'].apply(len)

data_classes = train

x = data_classes['Description']
y = data_classes['AdoptionSpeed']
x_holdout = test['Description']

### 5.1.2 Use a CountVectorizer to convert the description into vectors

In [42]:
def text_process(text):
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]


vocab = CountVectorizer(analyzer=text_process).fit(x)
r0 = x[0]
vocab0 = vocab.transform([r0])

x = vocab.transform(x)
x = pd.DataFrame(x.todense(), columns=vocab.get_feature_names())

x_holdout = vocab.transform(x_holdout)
x_holdout = pd.DataFrame(x_holdout.todense(), columns=vocab.get_feature_names())

### 5.1.3 Run a Random Forest on the vectorized description using the stacking function defined above.

In [45]:
from sklearn.ensemble import RandomForestClassifier
rmfr = RandomForestClassifier(n_estimators=150)

test_predicted_sentiment_class, train_predicted_sentiment_class, train_predicted_sentiment, test_predicted_sentiment = Stacking(
    model=rmfr, n_fold=10, train=x, test=x_holdout, y=y)

Accuracy Score: 39.6
Accuracy Score: 40.6
Accuracy Score: 41.0
Accuracy Score: 40.6
Accuracy Score: 38.73
Accuracy Score: 39.53
Accuracy Score: 38.8
Accuracy Score: 40.19
Accuracy Score: 41.46
Accuracy Score: 39.81


### 5.1.4  - Below are really time Taking Algos - will have to settle with Random Forests only


In [46]:
# from xgboost import XGBClassifier
# xgb = XGBClassifier()
# test_predicted_sentiment_class, train_predicted_sentiment_class, train_predicted_sentiment, test_predicted_sentiment = Stacking(
#     model=xgb, n_fold=10, train=x, test=x_holdout, y=y)

In [47]:
# from sklearn.neural_network import MLPClassifier
# mlp = MLPClassifier()
# test_predicted_sentiment_class, train_predicted_sentiment_class, train_predicted_sentiment, test_predicted_sentiment = Stacking(
#     model=mlp, n_fold=10, train=x, test=x_holdout, y=y)



### 5.1.5 Save the predictions from Sentiment Analysis

In [48]:
train_predicted_sentiment.to_pickle('text/train_predicted_sentiment.pkl')
train_predicted_sentiment_class.to_pickle('text/train_predicted_sentiment_class.pkl')
test_predicted_sentiment.to_pickle('text/test_predicted_sentiment.pkl')
test_predicted_sentiment_class.to_pickle('text/test_predicted_sentiment_class.pkl')

## 5.2 Run Stacking Predictions
Aggregate the cleaned data with the results of the sentiment analysis and run Models

### 5.2.1 Ordinal Responses
Here, we are trying to run five different models:
1. 1st model to prediction if the Adoption speed will be 0 or not
2. 2nd model to prediction if the Adoption speed will be 1 or not
3. 3rd model to prediction if the Adoption speed will be 2 or not
4. 4th model to prediction if the Adoption speed will be 3 or not
5. 5th model to prediction if the Adoption speed will be 4 or not

We then combine the probabilities of all models and try to derive the final results.

Also, we multiply the probability of not having 0 adoption speed by 0.8 - since our sample is imbalanced and we need to give some weight to Adoption Speed 0 as well. This approach eventually improves our score

In [63]:
def run_stacking_predictions(modelObject, modelName, abbv):
    pd.set_option('display.max_columns', 100)

    test_predictions_prob = pd.read_pickle('text/test_predicted_sentiment.pkl')
    test_predictions_prob.columns = ['x0', 'x1', 'x2', 'x3', 'x4']

    train_predictions_prob = pd.read_pickle('text/train_predicted_sentiment.pkl')
    train_predictions_prob.columns = ['x0', 'x1', 'x2', 'x3', 'x4']

    data = pd.read_pickle('cleanedData/complete_train.pkl')
    print(data.shape)
    train_predictions_prob.reset_index(inplace=True, drop=True)
    data = pd.concat([train_predictions_prob , data], axis=1)
    print(data.shape)


    # some last minute cleaning
    categoricals = ['Dewormed', 'FurLength', 'Gender', 'Health', 'State', 'Sterilized', 'Type', 'Vaccinated',
                    'color1Name', 'color2Name', 'color3Name']
    data = pd.get_dummies(data, columns=categoricals)

    X = data.drop(['Name',
                   'PetID',
                   'AdoptionSpeed',
                   'Description',
                   'labels',
                   'color1',
                   'color2',
                   'color3',
                   'breed1Name',
                   'breed2Name',
                   'iscolor3Matching',
                   'RescuerID'], axis=1)
    y = data['AdoptionSpeed']

    data_test = pd.read_pickle('cleanedData/complete_test.pkl')
    print(data_test.shape)
    test_predictions_prob.reset_index(inplace=True, drop=True)
    
    data_test = pd.concat([test_predictions_prob, data_test], axis=1)
    print(data_test.shape)

    data_test = pd.get_dummies(data_test, columns=categoricals)

    X_holdout = data_test.drop(['Name',
                                'PetID',
                                'AdoptionSpeed',
                                'Description',
                                'labels',
                                'color1',
                                'color2',
                                'color3',
                                'breed1Name',
                                'breed2Name',
                                'iscolor3Matching',
                                'RescuerID'], axis=1)

    missing_cols = set(X.columns) - set(X_holdout.columns)
    for c in missing_cols:
        X_holdout[c] = 0

    X_holdout = X_holdout[X.columns]

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)



    # class 0 vs rest
    print("Fitting 1st Model - Class 0 vs rest")
    y0 = (y_train == 0).astype('int')
    modelObject.fit(x_train, y0)
    y_pred0_test = pd.DataFrame(modelObject.predict_proba(x_test)[:, 0] * 0.8)
    y_pred0_train = pd.DataFrame(modelObject.predict_proba(x_train)[:, 0] * 0.8)
    y_pred0_holdout = pd.DataFrame(modelObject.predict_proba(X_holdout)[:, 0] * 0.8)
    y_pred_test = y_pred0_test
    y_pred_train = y_pred0_train
    y_pred_holdout = y_pred0_holdout

    # class 1 vs rest
    print("Fitting 2nd - Class 1 vs rest")
    y1 = (y_train == 1).astype('int')
    modelObject.fit(x_train, y1)
    y_pred1_test = pd.DataFrame(modelObject.predict_proba(x_test)[:, 0])
    y_pred1_train = pd.DataFrame(modelObject.predict_proba(x_train)[:, 0])
    y_pred1_holdout = pd.DataFrame(modelObject.predict_proba(X_holdout)[:, 0])
    y_pred_test = pd.concat([y_pred_test, y_pred1_test], axis=1)
    y_pred_train = pd.concat([y_pred_train, y_pred1_train], axis=1)
    y_pred_holdout = pd.concat([y_pred_holdout, y_pred1_holdout], axis=1)

    # class 2 vs rest
    print("Fitting 3rd - Class 2 vs rest")
    y2 = (y_train == 2).astype('int')
    modelObject.fit(x_train, y2)
    y_pred2_test = pd.DataFrame(modelObject.predict_proba(x_test)[:, 0])
    y_pred2_train = pd.DataFrame(modelObject.predict_proba(x_train)[:, 0])
    y_pred2_holdout = pd.DataFrame(modelObject.predict_proba(X_holdout)[:, 0])
    y_pred_test = pd.concat([y_pred_test, y_pred2_test], axis=1)
    y_pred_train = pd.concat([y_pred_train, y_pred2_train], axis=1)
    y_pred_holdout = pd.concat([y_pred_holdout, y_pred2_holdout], axis=1)

    # class 3 vs rest
    print("Fitting 4th - Class 3 vs rest")
    y3 = (y_train == 3).astype('int')
    modelObject.fit(x_train, y3)
    y_pred3_test = pd.DataFrame(modelObject.predict_proba(x_test)[:, 0])
    y_pred3_train = pd.DataFrame(modelObject.predict_proba(x_train)[:, 0])
    y_pred3_holdout = pd.DataFrame(modelObject.predict_proba(X_holdout)[:, 0])
    y_pred_test = pd.concat([y_pred_test, y_pred3_test], axis=1)
    y_pred_train = pd.concat([y_pred_train, y_pred3_train], axis=1)
    y_pred_holdout = pd.concat([y_pred_holdout, y_pred3_holdout], axis=1)

    # class 4 vs rest
    print("Fitting 5th - class 4 vs rest")
    y4 = (y_train == 4).astype('int')
    modelObject.fit(x_train, y4)
    y_pred4_test = pd.DataFrame(modelObject.predict_proba(x_test)[:, 0])
    y_pred4_train = pd.DataFrame(modelObject.predict_proba(x_train)[:, 0])
    y_pred4_holdout = pd.DataFrame(modelObject.predict_proba(X_holdout)[:, 0])
    y_pred_test = pd.concat([y_pred_test, y_pred4_test], axis=1)
    y_pred_train = pd.concat([y_pred_train, y_pred4_train], axis=1)
    y_pred_holdout = pd.concat([y_pred_holdout, y_pred4_holdout], axis=1)

    y_pred_test.columns = ['not0', 'not1', 'not2', 'not3', 'not4']
    y_pred_train.columns = ['not0', 'not1', 'not2', 'not3', 'not4']
    y_pred_holdout.columns = ['not0', 'not1', 'not2', 'not3', 'not4']

    # Combining Probabilities from 5 models
    y_pred_test['0'] = y_pred_test['not1'] + y_pred_test['not2'] + y_pred_test['not3'] + y_pred_test['not4']
    y_pred_test['1'] = y_pred_test['not0'] + y_pred_test['not2'] + y_pred_test['not3'] + y_pred_test['not4']
    y_pred_test['2'] = y_pred_test['not1'] + y_pred_test['not0'] + y_pred_test['not3'] + y_pred_test['not4']
    y_pred_test['3'] = y_pred_test['not1'] + y_pred_test['not2'] + y_pred_test['not0'] + y_pred_test['not4']
    y_pred_test['4'] = y_pred_test['not1'] + y_pred_test['not2'] + y_pred_test['not3'] + y_pred_test['not0']

    y_pred_train['0'] = y_pred_train['not1'] + y_pred_train['not2'] + y_pred_train['not3'] + y_pred_train['not4']
    y_pred_train['1'] = y_pred_train['not0'] + y_pred_train['not2'] + y_pred_train['not3'] + y_pred_train['not4']
    y_pred_train['2'] = y_pred_train['not1'] + y_pred_train['not0'] + y_pred_train['not3'] + y_pred_train['not4']
    y_pred_train['3'] = y_pred_train['not1'] + y_pred_train['not2'] + y_pred_train['not0'] + y_pred_train['not4']
    y_pred_train['4'] = y_pred_train['not1'] + y_pred_train['not2'] + y_pred_train['not3'] + y_pred_train['not0']

    y_pred_holdout['0'] = y_pred_holdout['not1'] + y_pred_holdout['not2'] + y_pred_holdout['not3'] + y_pred_holdout[
        'not4']
    y_pred_holdout['1'] = y_pred_holdout['not0'] + y_pred_holdout['not2'] + y_pred_holdout['not3'] + y_pred_holdout[
        'not4']
    y_pred_holdout['2'] = y_pred_holdout['not1'] + y_pred_holdout['not0'] + y_pred_holdout['not3'] + y_pred_holdout[
        'not4']
    y_pred_holdout['3'] = y_pred_holdout['not1'] + y_pred_holdout['not2'] + y_pred_holdout['not0'] + y_pred_holdout[
        'not4']
    y_pred_holdout['4'] = y_pred_holdout['not1'] + y_pred_holdout['not2'] + y_pred_holdout['not3'] + y_pred_holdout[
        'not0']

    y_pred_test.drop(columns=['not0', 'not1', 'not2', 'not3', 'not4'], inplace=True)
    y_pred_train.drop(columns=['not0', 'not1', 'not2', 'not3', 'not4'], inplace=True)
    y_pred_holdout.drop(columns=['not0', 'not1', 'not2', 'not3', 'not4'], inplace=True)

    y_pred_test.columns = [0, 1, 2, 3, 4]
    y_pred_train.columns = [0, 1, 2, 3, 4]
    y_pred_holdout.columns = [0, 1, 2, 3, 4]

    y_pred_test = y_pred_test.idxmax(axis=1)
    y_pred_train = y_pred_train.idxmax(axis=1)
    y_pred_holdout = y_pred_holdout.idxmax(axis=1)
    predictions = y_pred_holdout

    test_acc = accuracy_score(y_test, y_pred_test)

    train_acc = accuracy_score(y_train, y_pred_train)

    kappa = cohen_kappa_score(y_test, y_pred_test, weights='quadratic')

    print("Test set accuracy by " + modelName + " : {:.2f}".format(test_acc))
    print("Train set accuracy by " + modelName + " : {:.2f}".format(train_acc))
    print("Kappa by " + modelName + " : {:.2f}".format(kappa))

    print(confusion_matrix(y_test, y_pred_test))

    # fitting this on test set
    print('Fitting the Model on Test Set')

    petid = data_test['PetID']
    final_pred = pd.Series(predictions, name='AdoptionSpeed')
    submission = pd.concat([petid, final_pred], axis=1)
    submission.to_csv('submission-'+abbv+'.csv', index=False)

###  5.2.2 XGBoost

In [64]:
import xgboost as xgb
xg_classifier = xgb.XGBClassifier(
    max_depth=7, n_estimators=200)
run_stacking_predictions(xg_classifier, 'XGBoost Classifier', 'xgb')

(14993, 84)
(14993, 89)
(3948, 84)
(3948, 89)
Fitting 1st Model - Class 0 vs rest
Fitting 2nd - Class 1 vs rest
Fitting 3rd - Class 2 vs rest
Fitting 4th - Class 3 vs rest
Fitting 5th - class 4 vs rest
Test set accuracy by XGBoost Classifier : 0.44
Train set accuracy by XGBoost Classifier : 0.97
Kappa by XGBoost Classifier : 0.38
[[ 19  35  37   7  34]
 [ 26 316 266  96 192]
 [ 33 272 481 197 252]
 [ 27 137 272 275 249]
 [ 24  94 174  81 902]]
Fitting the Model on Test Set


### 5.2.3 Gradient Boosting

In [65]:
from sklearn.ensemble import GradientBoostingClassifier
gbm = GradientBoostingClassifier(random_state=15, n_estimators=250)
run_stacking_predictions(gbm, "Gradient Boosting Classifier", 'gb')

(14993, 84)
(14993, 89)
(3948, 84)
(3948, 89)
Fitting 1st Model - Class 0 vs rest
Fitting 2nd - Class 1 vs rest
Fitting 3rd - Class 2 vs rest
Fitting 4th - Class 3 vs rest
Fitting 5th - class 4 vs rest
Test set accuracy by Gradient Boosting Classifier : 0.46
Train set accuracy by Gradient Boosting Classifier : 0.63
Kappa by Gradient Boosting Classifier : 0.41
[[ 16  36  26   7  26]
 [ 23 357 311  63 185]
 [ 19 211 472 182 300]
 [ 20 129 281 253 287]
 [ 15 102 167  56 954]]
Fitting the Model on Test Set


### 5.2.4 Random Forest

In [66]:
randomForest = RandomForestClassifier(random_state=1, n_estimators=300, max_depth=7)
run_stacking_predictions(randomForest, "Random Forest Classifier", 'rf')

(14993, 84)
(14993, 89)
(3948, 84)
(3948, 89)
Fitting 1st Model - Class 0 vs rest
Fitting 2nd - Class 1 vs rest
Fitting 3rd - Class 2 vs rest
Fitting 4th - Class 3 vs rest
Fitting 5th - class 4 vs rest
Test set accuracy by Random Forest Classifier : 0.45
Train set accuracy by Random Forest Classifier : 0.57
Kappa by Random Forest Classifier : 0.36
[[ 10  31  33   8  50]
 [  6 231 382  45 277]
 [  1 118 552 149 383]
 [  0  72 331 214 326]
 [  1  59 161  59 999]]
Fitting the Model on Test Set


## Note: We are also submitting the following notebooks:
1. Text Analysis Notebook
2. EDA Notebook
3. More deep dive into modelling  - where we analyze the feature importance and correlation heatmaps. Also, the logic to adjust class boundaries to improve the quadratic kappa score.