In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the entries directly under ../input so the paths used in later cells can be checked.
print(os.listdir("../input"))

## Project definition

* Goal: predict the adoptability of pets, i.e. how quickly a pet is adopted, based on the pet’s listing on PetFinder. Sometimes a profile represents a group of pets; in that case, the speed of adoption is determined by the speed at which all of the pets are adopted.
* Metric: quadratic weighted kappa, which measures the agreement between two ratings. It is calculated between the known (expected) scores and the predicted scores.
    - negative values: less agreement between raters than expected by chance
    - 0: random agreement between raters
    - 1: complete agreement between raters
* More at [Evaluation](https://www.kaggle.com/c/petfinder-adoption-prediction#evaluation)
* Data: text, tabular, image
    - train.csv, test.csv: tabular/text data
    - breed_labels.csv: Type and BreedName for each BreedID. Type 1 is dog, 2 is cat.
    - color_labels.csv - Contains ColorName for each ColorID
    - state_labels.csv - Contains StateName for each StateID
    - images: For pets that have photos, they will be named in the format of PetID-ImageNumber.jpg. Image 1 is the profile (default) photo set for the pet. For privacy purposes, faces, phone numbers and emails have been masked.
    - image metadata: We have run the images through Google's Vision API, providing analysis on Face Annotation, Label Annotation, Text Annotation and Image Properties. You may optionally utilize this supplementary information for your image analysis. File name format is PetID-ImageNumber.json. Some properties will not exist in JSON file if not present, i.e. Face Annotation. Text Annotation has been simplified to just 1 entry of the entire text description (instead of the detailed JSON result broken down by individual characters and words). Phone numbers and emails are already anonymized in Text Annotation. Google Vision API reference: https://cloud.google.com/vision/docs/reference/rest/v1/images/annotate
    - sentiment data: We have run each pet profile's description through Google's Natural Language API, providing analysis on sentiment and key entities. You may optionally utilize this supplementary information for your pet description analysis. There are some descriptions that the API could not analyze. As such, there are fewer sentiment files than there are rows in the dataset. File name format is PetID.json. Google Natural Language API reference: https://cloud.google.com/natural-language/docs/basics    
   

In [None]:
# Load the tabular train and test splits; each CSV sits in its own subdirectory of ../input.
train = pd.read_csv('../input/train/train.csv')
test = pd.read_csv('../input/test/test.csv')

In [None]:
# Report the .shape of each split as a quick size check.
print("Shape of training set: ", train.shape)
print("Shape of test set: ", test.shape)

# Tabular data

### Data fields in train, test

* Type - Type of animal (1 = Dog, 2 = Cat)
* Name - Name of pet (Empty if not named)
* Age - Age of pet when listed, in months
* Breed1 - Primary breed of pet (Refer to BreedLabels dictionary)
* Breed2 - Secondary breed of pet, if pet is of mixed breed (Refer to BreedLabels dictionary)
* Gender - Gender of pet (1 = Male, 2 = Female, 3 = Mixed, if profile represents group of pets)
* Color1 - Color 1 of pet (Refer to ColorLabels dictionary)
* Color2 - Color 2 of pet (Refer to ColorLabels dictionary)
* Color3 - Color 3 of pet (Refer to ColorLabels dictionary)
* MaturitySize - Size at maturity (1 = Small, 2 = Medium, 3 = Large, 4 = Extra Large, 0 = Not Specified)
* FurLength - Fur length (1 = Short, 2 = Medium, 3 = Long, 0 = Not Specified)
* Vaccinated - Pet has been vaccinated (1 = Yes, 2 = No, 3 = Not Sure)
* Dewormed - Pet has been dewormed (1 = Yes, 2 = No, 3 = Not Sure)
* Sterilized - Pet has been spayed / neutered (1 = Yes, 2 = No, 3 = Not Sure)
* Health - Health Condition (1 = Healthy, 2 = Minor Injury, 3 = Serious Injury, 0 = Not Specified)
* Quantity - Number of pets represented in profile
* Fee - Adoption fee (0 = Free)
* State - State location in Malaysia (Refer to StateLabels dictionary)
* RescuerID - Unique hash ID of rescuer
* VideoAmt - Total uploaded videos for this pet
* Description - Profile write-up for this pet. The primary language used is English, with some in Malay or Chinese.
* PetID - Unique hash ID of pet profile
* PhotoAmt - Total uploaded photos for this pet
* AdoptionSpeed - Categorical speed of adoption; lower is faster. This is the target value that contestants are required to predict. It is determined by how quickly, if at all, a pet is adopted, as follows:
    - 0 - Pet was adopted on the same day as it was listed.
    - 1 - Pet was adopted between 1 and 7 days (1st week) after being listed.
    - 2 - Pet was adopted between 8 and 30 days (1st month) after being listed.
    - 3 - Pet was adopted between 31 and 90 days (2nd & 3rd month) after being listed.
    - 4 - No adoption after 100 days of being listed. (There are no pets in this dataset that waited between 90 and 100 days). 

In [None]:
# Preview five random training rows; the fixed seed keeps the displayed rows
# identical across Restart & Run All.
train.sample(5, random_state=0)

In [None]:
# Load the breed, color and state label CSVs that the Breed*, Color* and State
# field descriptions refer to.
breed_labels = pd.read_csv('../input/breed_labels.csv')
color_labels = pd.read_csv('../input/color_labels.csv')
state_labels = pd.read_csv('../input/state_labels.csv')

In [None]:
# Preview five random breed rows; seeded so the output is reproducible.
breed_labels.sample(5, random_state=0)

In [None]:
# Preview five random color rows; seeded so the output is reproducible.
color_labels.sample(5, random_state=0)

In [None]:
# Preview five random state rows; seeded so the output is reproducible.
state_labels.sample(5, random_state=0)

# Images
(adapted from my [images notebook](https://www.kaggle.com/anebzt/images-preprocessing-model))

In [None]:
import cv2
from cv2 import imread, cvtColor, resize, threshold, calcHist, equalizeHist

In [None]:
def open_images(path, limit=1000):
    """Read images from a directory into a list.

    Files are visited in sorted name order so that the same subset is loaded
    on every run (``os.listdir`` itself gives no ordering guarantee). Each
    file is read with ``imread(..., cv2.IMREAD_COLOR)``; entries that cannot
    be decoded (``imread`` returns ``None``) are skipped instead of being
    stored, so every element of the result is a usable image.

    Parameters
    ----------
    path : str
        Directory containing the image files.
    limit : int or None, optional
        Maximum number of images to load. Defaults to 1000, the cap the
        original code used to stay within RAM. ``None`` loads every
        readable file.

    Returns
    -------
    list
        The successfully decoded images, in sorted file-name order.
    """
    xall = []

    for imgname in sorted(os.listdir(path)):
        # RAM breaks after 30k images, so stop once the cap is reached.
        if limit is not None and len(xall) >= limit:
            break

        print("Opening image #{}".format(str(len(xall) + 1)), end="\r")
        imgpath = os.path.join(path, imgname)
        img = imread(imgpath, cv2.IMREAD_COLOR)

        # imread signals an unreadable / non-image file by returning None
        # rather than raising; keeping it would break later `.shape` access.
        if img is None:
            continue
        xall.append(img)

    n = len(xall)
    print("{} images in set.".format(n))
    return xall

In [None]:
# Load the train and test image folders; open_images caps each call at 1000 images.
train_img = open_images("../input/train_images/")
test_img = open_images("../input/test_images/")

In [None]:
# Inspect the shape of the first loaded training image.
train_img[0].shape