# Exploratory data analysis

## Setup

In [1]:
# Record the OS and Python interpreter this notebook was executed with
import platform
import sys

print(platform.platform())
print("Python", sys.version)

Windows-10-10.0.19045-SP0
Python 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]


In [2]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the train and test CSVs (paths are relative to the notebook's working directory)
train = pd.read_csv('../data/final/train.csv')
test = pd.read_csv('../data/final/test.csv')

# Preview five random rows; the fixed seed keeps the preview identical on every re-run
train.sample(5, random_state=42)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group_count,Cabin_deck,Cabin_side
3894,Mars,True,TRAPPIST-1e,15.0,False,0.0,0.0,0.0,0.0,0.0,True,7.0,F,P
5935,Earth,False,TRAPPIST-1e,16.0,False,17.0,,0.0,537.0,173.0,False,1.0,F,P
6944,Mars,False,TRAPPIST-1e,55.0,False,1236.0,0.0,52.0,0.0,0.0,False,2.0,F,P
6096,Earth,False,TRAPPIST-1e,10.0,False,0.0,0.0,0.0,0.0,0.0,True,7.0,G,S
5287,Mars,False,TRAPPIST-1e,28.0,False,1909.0,0.0,0.0,0.0,6.0,False,1.0,F,P


In [4]:
# Number of missing values in each column of the training set
train.isna().sum(axis=0)

HomePlanet      201
CryoSleep       217
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
Group_count       0
Cabin_deck      199
Cabin_side      199
dtype: int64

In [None]:
# Response column; every other column of `train` is treated as a feature
TARGET = 'Transported'
FEATURES = [col for col in train.columns if col != TARGET]

# Split the feature names by dtype: numeric columns vs. everything else
numerical = train[FEATURES].select_dtypes(include=np.number).columns
categorical = train[FEATURES].select_dtypes(exclude=np.number).columns

# Short textual summary of the setup
print(f'Target: {TARGET}')
print(
    f'Features:\n\tnumerical: {numerical.to_list()}\n\tcategorical:{categorical.to_list()}'
)
print(
    f'Shapes:\n\ttrain: {train.shape}\n\ttest: {test.shape}'
)

In [None]:
# Seaborn look-and-feel applied to every figure drawn after this cell
sns.set_style("darkgrid")
sns.set_palette('Dark2')

# Make sure the folder the figures are saved into exists (no error if it already does)
os.makedirs('../figures', exist_ok=True)

## Response marginal analysis

In [None]:
# Count of observations per value of the response variable
fig, ax = plt.subplots()
sns.countplot(data=train, x=TARGET, ax=ax)
fig.savefig('../figures/response_marginal.png')

## Features marginal analysis

### Categorical features

In [None]:
# One count plot per categorical feature, drawn on a 2x3 grid of axes.
# zip stops at the shorter sequence, so at most six features are plotted.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(16, 9))
for feature, ax in zip(categorical, axs.ravel()):
    sns.countplot(data=train, x=feature, ax=ax)
fig.savefig('../figures/categorical_marginal.png')

### Numerical features

In [None]:
# One box plot per numerical feature, drawn on a 2x4 grid of axes.
# zip stops at the shorter sequence, so at most eight features are plotted.
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(16, 9))
for feature, ax in zip(numerical, axs.ravel()):
    sns.boxplot(data=train, y=feature, ax=ax)
fig.savefig('../figures/numerical_marginal.png')

In [None]:
# Pairwise relationships between the numerical features, coloured by the response.
# The frame holds every feature column followed by the target column.
toplot = train[[*FEATURES, TARGET]]
sns.pairplot(
    data=toplot,
    vars=numerical,
    hue=TARGET,
    height=2.5,
)
plt.savefig('../figures/numerical_pairplot.png')

In [None]:
# Annotated heatmap of the pairwise correlations between numerical features
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    train[numerical].corr(),
    annot=True,
    fmt=".2f",
    cmap="RdBu",
    ax=ax,
)
fig.savefig('../figures/numerical_corrmap.png')

## Bivariate analysis

In [None]:
# boxplot of numerical features by response value
fig, axs = plt.subplots(2, 4, figsize = (16, 9))
for ax, c in zip(axs.flatten(), numerical):
    sns.boxplot(y=c, x=TARGET, data=train, ax=ax)
plt.savefig('../figures/categorical_bivariate.png')

In [None]:
# countplot of categorical features by response value
fig, axs = plt.subplots(2, 3, figsize = (16, 9))
for ax, c in zip(axs.flatten(), categorical):
    sns.countplot(x=TARGET, hue=c, data=train, ax=ax)
plt.savefig('../figures/numerical_bivariate.png')