In [1]:
import os
import random
import math
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw
import cv2

from sklearn.metrics import roc_auc_score, mean_squared_error
from IPython.display import clear_output
from tqdm import tqdm
import glob
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the competition tables (train metadata, test metadata, submission
# template) in that order; each path is read with pandas' default CSV settings.
train_csv, test_csv, sample_submission = map(
    pd.read_csv,
    (
        '../input/petfinder-pawpularity-score/train.csv',
        '../input/petfinder-pawpularity-score/test.csv',
        '../input/petfinder-pawpularity-score/sample_submission.csv',
    ),
)

In [3]:
# Show the training table as this cell's output.
train_csv

## Target variable

In [4]:
# Histogram of the target column, split into 100 bins.
fig, ax = plt.subplots()
ax.hist(train_csv['Pawpularity'], bins=100)
ax.set_title('Target distribution')
plt.show()

In [5]:
# Show up to five example photos whose target equals 1.
ids_1 = train_csv.loc[train_csv['Pawpularity'] == 1, 'Id']
print(f'There are {len(ids_1)} observations with Pawpularity=1')
n = min(5, len(ids_1))

# plt.subplots rejects ncols=0, so only draw when there is something to show.
if n > 0:
    # squeeze=False keeps `axes` a 2-D array of shape (1, n) even when n == 1;
    # with the default squeeze a single subplot comes back as a bare Axes and
    # indexing it would fail.
    fig, axes = plt.subplots(nrows=1, ncols=n, figsize=(20, 20), squeeze=False)
    for i, ax in enumerate(axes[0]):
        image = Image.open(f'../input/petfinder-pawpularity-score/train/{ids_1.iloc[i]}.jpg').convert('RGB')
        ax.imshow(image)
        ax.axis('off')
    plt.show()

In [6]:
# Show up to five example photos whose target equals 100.
ids_100 = train_csv.loc[train_csv['Pawpularity'] == 100, 'Id']
print(f'There are {len(ids_100)} observations with Pawpularity=100')
n = min(5, len(ids_100))

# plt.subplots rejects ncols=0, so only draw when there is something to show.
if n > 0:
    # squeeze=False keeps `axes` a 2-D array of shape (1, n) even when n == 1;
    # with the default squeeze a single subplot comes back as a bare Axes and
    # indexing it would fail.
    fig, axes = plt.subplots(nrows=1, ncols=n, figsize=(20, 20), squeeze=False)
    for i, ax in enumerate(axes[0]):
        image = Image.open(f'../input/petfinder-pawpularity-score/train/{ids_100.iloc[i]}.jpg').convert('RGB')
        ax.imshow(image)
        ax.axis('off')
    plt.show()

In [7]:
# Summary statistics (via Series.describe) for the target column only.
train_csv.loc[:, 'Pawpularity'].describe()

## Features

In [8]:
# Summary statistics for the training table, as produced by DataFrame.describe().
train_csv.describe()

In [9]:
# Correlation heatmap over the numeric columns only.
# `Id` is used to build image file names elsewhere in this notebook, so it is
# presumably a non-numeric column. pandas >= 2.0 no longer drops such columns
# silently in DataFrame.corr() and raises instead, so select numeric columns
# explicitly; on older pandas this yields the same matrix as before.
numeric_columns = train_csv.select_dtypes(include='number')
# Fixed colour range centred on 0 so positive and negative correlations are comparable.
sns.heatmap(numeric_columns.corr(), vmin=-1, vmax=1, center=0);

## Train-test-split

In [10]:
# Hold out 10% of the labelled rows for local evaluation; the fixed
# random_state makes the split repeatable across runs.
data_to_train, data_to_test = train_test_split(
    train_csv,
    test_size=0.1,
    random_state=42,
)
# Report the number of rows on each side of the split.
print(f'Train len {len(data_to_train)}')
print(f'Test len {len(data_to_test)}')

## Test data

In [11]:
# Preview images from the test set in a single row.
ids = test_csv['Id']

# Cap the preview instead of creating one subplot per row of test.csv: the
# figure width is fixed, so an unbounded column count becomes unreadable and
# very slow on a large test set.
# NOTE(review): 8 is an arbitrary preview size — adjust as needed.
n_preview = min(8, len(ids))

# plt.subplots rejects ncols=0, so only draw when there is something to show.
if n_preview > 0:
    # squeeze=False keeps `axes` a 2-D array of shape (1, n_preview) even for a
    # single image; with the default squeeze a lone subplot comes back as a
    # bare Axes and indexing it would fail.
    fig, axes = plt.subplots(nrows=1, ncols=n_preview, figsize=(20, 20), squeeze=False)
    for i, ax in enumerate(axes[0]):
        image = Image.open(f'../input/petfinder-pawpularity-score/test/{ids.iloc[i]}.jpg').convert('RGB')
        ax.imshow(image)
        ax.axis('off')
    plt.show()