# DataViz with Seaborn

## Import Libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Show the installed seaborn version (the recorded output below is 0.11.1).
sns.__version__

'0.11.1'

### Read CSV File into a DataFrame

In [3]:
# Data source for the Titanic passenger table.
# On Kaggle, use the local copy instead:
# path = "../input/titanic/train.csv"

# Anywhere else, read the CSV straight from GitHub.
path = "https://github.com/datasciencedojo/datasets/raw/master/titanic.csv"

df = pd.read_csv(filepath_or_buffer=path)

In [4]:
# Preview the first five rows to check the columns loaded as expected.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Categorical Data

### Countplot

In [None]:
# Count passengers per Survived value, with the category on the y-axis.
sns.countplot(y='Survived', data=df)

In [None]:
# Same count, with the category on the x-axis and a title on the axes.
(
    sns.countplot(x='Survived', data=df)
    .set_title("Countplot of Survived Passengers")
)

In [None]:
# Split each Survived bar by Sex.
(
    sns.countplot(x='Survived', hue='Sex', data=df)
    .set_title("Countplot of Survived Passengers by Sex")
)

# Swap the raw 0/1 tick labels for readable names and label the axis.
plt.xticks([0, 1], ['Not Survived', 'Survived'])
plt.xlabel('Survived or Not Survived')

In [None]:
# The same count plot drawn through the figure-level catplot interface.
sns.catplot(kind='count', x='Survived', hue='Sex', data=df)

# Title, readable tick labels and axis label are applied via pyplot.
plt.title("Countplot of Survived Passengers by Sex")
plt.xticks([0, 1], ['Not Survived', 'Survived'])
plt.xlabel('Survived or Not Survived')

### Pointplot

In [None]:
# Point plot with Survived on the x-axis and Sex on the y-axis.
sns.catplot(kind='point', x='Survived', y='Sex', data=df)
plt.title("Pointplot of Survived Passengers by Sex")

In [None]:
# Axes swapped: Sex on the x-axis, Survived on the y-axis.
sns.catplot(kind='point', x='Sex', y='Survived', data=df)

In [None]:
# As above, with one series per passenger class.
sns.catplot(kind='point', x='Sex', y='Survived', hue='Pclass', data=df)

## Numeric Data

### Displot - Distribution Plot | Histogram

In [None]:
# Distribution of Age with seaborn's default binning.
sns.displot(x='Age', data=df)
plt.title("Distribution of Age")

In [None]:
# Same distribution with a fixed number of bins.
sns.displot(x='Age', bins=8, data=df)
plt.title("Distribution of Age")

In [None]:
# Bin count derived from the data: the maximum age divided by 10, truncated.
sns.displot(
    x='Age',
    bins=int(df['Age'].max() / 10),
    data=df,
)
plt.title("Distribution of Age")

In [None]:
# Age distribution with one colour per Sex.
sns.displot(x='Age', hue='Sex', data=df)
plt.title("Distribution of Age by Sex")

In [None]:
# Kernel density estimate of Age, one curve per Sex.
sns.displot(kind='kde', x='Age', hue='Sex', data=df)
plt.title("KDE Distribution of Age by Sex")

In [None]:
# Kernel density estimate of Age, one curve per Survived value.
# The title names the hue variable actually plotted (Survived, not Sex).
sns.displot(data=df, x='Age', hue='Survived', kind='kde')
plt.title("KDE Distribution of Age by Survived")

In [None]:
# Distribution of Fare over its full range.
sns.displot(x='Fare', data=df)

In [None]:
# Same Fare distribution, with the x-axis limited to 0-200.
sns.displot(x='Fare', data=df)

plt.xlim(0, 200)

### Scatterplot

In [None]:
# Column names reused by the scatterplot cells: x-axis, y-axis and colour.
x, y, c = 'Fare', 'Age', 'Survived'

In [None]:
# Scatter of the columns named by x and y, with a title on the axes.
(
    sns.scatterplot(x=x, y=y, data=df)
    .set_title("Scatterplot of Age against Fare")
)

In [None]:
# Scatter of the columns named by x and y, with a title on the axes.
(
    sns.scatterplot(data=df, x=x, y=y)
    .set_title("Scatterplot of Age against Fare")
)

# Fare is on the x-axis, so the 0-300 window belongs on xlim. Applying it to
# the y-axis (Age) only squashed the points into the bottom of the plot.
plt.xlim(0, 300)

In [None]:
# Same scatter, with the x-axis limited to 0-50.
(
    sns.scatterplot(x=x, y=y, data=df)
    .set_title("Scatterplot of Age against Fare")
)

plt.xlim(0, 50)

In [None]:
# Colour each point by the column named in c.
(
    sns.scatterplot(x=x, y=y, hue=c, data=df)
    .set_title("Scatterplot of Age against Fare separated by Survived & Not Survived")
)

### Boxplot

In [None]:
# Column names reused by the boxplot cells: value axis, grouping axis, colour.
y, x, c = 'Age', 'Survived', 'Pclass'

In [None]:
# Distribution of the column named by y, drawn along the x-axis for reference.
sns.displot(x=y, data=df)

In [None]:
# Single box plot of the column named by y, drawn along the x-axis.
sns.boxplot(x=y, data=df)

In [None]:
# Single box plot of Fare.
sns.boxplot(x='Fare', data=df)

In [None]:
# One box per value of the x column, showing the spread of the y column.
(
    sns.boxplot(x=x, y=y, data=df)
    .set_title("Boxplot of Age Distribution of Survived and Not Survived")
)

# Swap the raw 0/1 tick labels for readable names.
plt.xticks([0, 1], ['Not Survived', 'Survived'])

In [None]:
# Same box plot as the previous cell, written as a chained call.
sns.boxplot(y=y, x=x, data=df).set_title(
    "Boxplot of Age Distribution of Survived and Not Survived"
)

# Swap the raw 0/1 tick labels for readable names.
plt.xticks([0, 1], ['Not Survived', 'Survived'])

In [None]:
# Box plot further split by the column named in c.
(
    sns.boxplot(x=x, y=y, hue=c, data=df)
    .set_title("Boxplot of Age Distribution of Survived and Not Survived by Pclass")
)

# Swap the raw 0/1 tick labels for readable names.
plt.xticks([0, 1], ['Not Survived', 'Survived'])

### Pairplot

In [None]:
# Pairwise relationships across the whole DataFrame.
sns.pairplot(data=df)

In [None]:
# Pairwise relationships for three selected columns only.
sns.pairplot(data=df.loc[:, ['Age', 'Fare', 'Survived']])

### Heatmap

In [None]:
# Correlation matrix of the numeric columns.
# Text columns are dropped explicitly first: pandas >= 2.0 no longer ignores
# them in DataFrame.corr() and raises instead. select_dtypes works on old and
# new pandas alike.
corr = df.select_dtypes(include=['number', 'bool']).corr()
corr

In [None]:
# Heatmap of the correlation matrix stored in corr.
sns.heatmap(data=corr)

In [None]:
# Annotated heatmap of the correlations between three selected columns.
sns.heatmap(
    data=df.loc[:, ['Age', 'Fare', 'Survived']].corr(),
    annot=True,
    cmap='Blues',
)

### Filter to a Subset of Categories in a Categorical Column

In [None]:
# Number of rows per distinct Ticket value.
df.loc[:, 'Ticket'].value_counts()

In [None]:
# Labels of the first five entries of the value_counts result.
df['Ticket'].value_counts().index[:5]

In [None]:
# Take the first five Ticket labels from value_counts ...
ticket_subset = df['Ticket'].value_counts().index[:5]

# ... and keep only the rows whose Ticket is one of them.
df_ticket_subset = df[df['Ticket'].isin(ticket_subset)]

In [None]:
# Count plot restricted to the filtered Ticket subset.
sns.countplot(x='Ticket', data=df_ticket_subset)