# Bivariate visualization with the Titanic data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def importtitanic(titanic_url='https://raw.githubusercontent.com/benjum/UCLAX-24Fall-EDA/main/Data/titanic.csv'):
    """Load the Titanic passenger table and apply this notebook's cleaning steps.

    Parameters
    ----------
    titanic_url : str, path or file-like, optional
        Any source ``pd.read_csv`` accepts. Defaults to the course copy of
        ``titanic.csv``, so calling ``importtitanic()`` with no arguments
        behaves as before.

    Returns
    -------
    pandas.DataFrame
        The table with ``Survived``, ``PassengerId`` and ``Pclass`` cast to
        strings, rows lacking ``Age`` or ``Embarked`` removed, and the
        ``Cabin`` column dropped. The original row index is kept (not reset).
    """
    # These columns hold labels/identifiers, so they are stored as strings
    # to keep plotting code from treating them as numbers.
    label_columns = ['Survived', 'PassengerId', 'Pclass']

    # A single chain instead of in-place mutation: each step returns a new frame.
    return (
        pd.read_csv(titanic_url)
        .astype({column: str for column in label_columns})
        .dropna(subset=['Age', 'Embarked'])
        .drop(columns='Cabin')
    )

In [None]:
# Build the cleaned frame that the cells below read as `titanic`.
titanic = importtitanic()

In [None]:
# Column names, dtypes and non-null counts of the cleaned frame.
titanic.info()

In [None]:
# include='all' summarizes the string columns as well as the numeric ones.
titanic.describe(include='all')

In [None]:
# Show the cleaned DataFrame as the cell output.
titanic

In [None]:
# Pure-pandas version: one bar per (Pclass, Sex) combination, sized by row count.
titanic[['Pclass', 'Sex']].value_counts().plot.bar()

### Seaborn will start to shine for these bivariate plots

In [None]:
# Number of rows in each passenger class.
sns.countplot(x='Pclass', data=titanic)

In [None]:
# Split each class bar by Sex (bars are drawn side by side by default).
sns.countplot(
    data=titanic,
    hue='Sex',
    x='Pclass',
)

In [None]:
# dodge=False draws the Sex bars at the same x position instead of side by side.
sns.countplot(
    data=titanic,
    hue='Sex',
    x='Pclass',
    dodge=False,
)

In [None]:
# One row of counts per Sex.
g = sns.FacetGrid(titanic, row="Sex")
# Pin the category order: without it each facet infers its own order from its
# subset of rows, and the shared x-axis can put a bar under the wrong class label.
g.map_dataframe(sns.countplot, x="Pclass", order=['1', '2', '3'])

In [None]:
# One column of counts per Sex.
g = sns.FacetGrid(titanic, col="Sex")
# Pin the category order: without it each facet infers its own order from its
# subset of rows, so the same x position could mean different classes per panel.
g.map_dataframe(sns.countplot, x="Pclass", order=['1', '2', '3'])

In [None]:
# Column facets by Sex, colored by Sex as well, with the class order fixed
# so that every panel uses the same x positions.
g = sns.FacetGrid(data=titanic, col="Sex", hue='Sex')
g.map_dataframe(
    sns.countplot,
    order=['1','2','3'],
    x="Pclass",
)

In [None]:
# Figure-level equivalent: catplot builds the facets and the count bars in one call.
sns.catplot(
    data=titanic,
    kind='count',
    x='Pclass',
    col='Sex',
)

In [None]:
# Same faceted count plot, additionally colored by Sex.
sns.catplot(
    data=titanic,
    kind='count',
    x='Pclass',
    hue='Sex',
    col='Sex',
)

How about a stacked bar plot?

In [None]:
# Row counts for every (Pclass, Sex) combination, returned as a MultiIndex Series.
titanic.value_counts(subset=['Pclass', 'Sex'])

In [None]:
# Same counts as a flat frame: the index levels become columns and the
# counts land in a column named 'counts'.
t2 = (
    titanic[['Pclass', 'Sex']]
    .value_counts()
    .reset_index(name='counts')
)

In [None]:
# Inspect the flat (Pclass, Sex, counts) table.
t2

In [None]:
# Reshape to wide form: one row per Pclass, one column per Sex, cells hold counts.
t2.pivot(
    values='counts',
    index='Pclass',
    columns='Sex',
)

In [None]:
# Wide Pclass-by-Sex table, then stack the Sex columns inside each class bar.
t2p = t2.pivot(values='counts', columns=['Sex'], index='Pclass')
t2p.plot.bar(stacked=True)

Alternatively, `pd.crosstab` builds the same Pclass-by-Sex count table in a single call:

In [None]:
# Cross-tabulate class against sex: rows are Pclass, columns are Sex.
pd.crosstab(index=titanic['Pclass'], columns=titanic['Sex'])

In [None]:
# Stacked bar chart straight from the cross-tabulation.
pd.crosstab(
    index=titanic['Pclass'],
    columns=titanic['Sex'],
).plot.bar(stacked=True)

## Dealing with an overwhelming set of labels

In [None]:
# Show the frame again before looking at the Ticket column.
titanic

In [None]:
# First two Ticket values.
titanic['Ticket'].head(2)

In [None]:
# One bar per distinct ticket: far too many labels to read.
titanic['Ticket'].value_counts().plot.bar()

In [None]:
# Horizontal version with counts sorted ascending; still one bar per distinct ticket.
titanic['Ticket'].value_counts(ascending=True).plot.barh()

In [None]:
# Slice BEFORE counting: counts tickets among the first 20 rows only.
titanic['Ticket'].head(20).value_counts(ascending=True).plot.barh()

In [None]:
# Slice BEFORE counting: counts tickets among the last 20 rows only.
titanic['Ticket'].tail(20).value_counts(ascending=True).plot.barh()

In [None]:
# Slice AFTER counting: with ascending counts, the first 20 are the least frequent tickets.
titanic['Ticket'].value_counts(ascending=True).head(20).plot.barh()

In [None]:
# Slice AFTER counting: with ascending counts, the last 20 are the most frequent tickets.
titanic['Ticket'].value_counts(ascending=True).tail(20).plot.barh()

## Color for highlighting

In [None]:
# Twenty bar colors, grey by default.
c = ['grey'] * 20
# Slice -4:-1 covers the 4th-, 3rd- and 2nd-from-last entries; the final
# entry is outside the slice and stays grey.
c[-4:-1] = ['red'] * 3

In [None]:
# Inspect the color list.
c

In [None]:
# The 20 most frequent tickets, one color from `c` per bar.
titanic['Ticket'].value_counts(ascending=True).tail(20).plot.barh(color=c)

## Lollipop

In [None]:
# Counts for the 20 most frequent tickets, in ascending order.
v = titanic['Ticket'].value_counts(ascending=True).tail(20)
# y positions 1..len(v), one per ticket.
my_range = range(1, len(v) + 1)

In [None]:
# Ticket labels (index) with their counts.
v

In [None]:
# Just the counts, as an array.
v.values

In [None]:
# The y positions used for the plots below.
my_range

In [None]:
# Stems: plt.hlines draws one horizontal segment per ticket, from 0 to its count.
plt.hlines(
    y=my_range,
    xmin=0,
    xmax=v.values,
    color='skyblue',
)
# Heads: a marker at the end of each stem.
plt.plot(v.values, my_range, "o")

In [None]:
# Stems: plt.hlines draws one horizontal segment per ticket, from 0 to its count.
plt.hlines(y=my_range, xmin=0, xmax=v.values, color='skyblue')
# Heads: a marker at the end of each stem.
plt.plot(v.values, my_range, "o")

# Add titles and axis names (ticket labels replace the numeric y positions).
plt.yticks(my_range, v.index)
# Title fixed: the stems are horizontal, and "lollipop" was misspelled.
plt.title("A horizontal lollipop plot", loc='left')
# x shows the value_counts result, i.e. how many rows carry each ticket.
plt.xlabel('Passengers per ticket')
plt.ylabel('Ticket')

# Show the plot
plt.show()

## Dot plot

In [None]:
# Dot plot: same markers as the lollipop plot, drawn without the stems.
plt.plot(v.values, my_range, "o")

# Add titles and axis names (ticket labels replace the numeric y positions).
plt.yticks(my_range, v.index)
# Title fixed: tickets run down the y-axis and counts along x, as in the lollipop plot.
plt.title("A horizontal dot plot", loc='left')
# x shows the value_counts result, i.e. how many rows carry each ticket.
plt.xlabel('Passengers per ticket')
plt.ylabel('Ticket')

# Show the plot
plt.show()

In [None]:
# Dot plot: one marker per ticket, no stems.
plt.plot(v.values, my_range, "o")

# Dashed vertical reference line at x=4.5 spanning all tickets, with a text label.
plt.vlines(x=4.5, ymin=0, ymax=max(my_range),
           color='red', linestyle='--')
plt.text(x=4.6, y=5, s='guess at mean')

# Add titles and axis names (ticket labels replace the numeric y positions).
plt.yticks(my_range, v.index)
# Title fixed: tickets run down the y-axis and counts along x.
plt.title("A horizontal dot plot", loc='left')
# x shows the value_counts result, i.e. how many rows carry each ticket.
plt.xlabel('Passengers per ticket')
plt.ylabel('Ticket')

# Show the plot
plt.show()

In [None]:
# Stems now start at 0 and run to (count - 4.5): leftwards for tickets below
# 4.5, rightwards for tickets above it. min/max keep xmin <= xmax per stem.
plt.hlines(y=my_range,
           xmin=[min(0,i-4.5) for i in v.values],
           xmax=[max(0,i-4.5) for i in v.values],
           color='skyblue')
# Heads sit at the shifted value.
plt.plot(v.values-4.5, my_range, "o")

# Add titles and axis names (ticket labels replace the numeric y positions).
plt.yticks(my_range, v.index)
# Title fixed: the stems are horizontal, and "lollipop" was misspelled.
plt.title("A horizontal lollipop plot centered on 4.5", loc='left')
# The x-axis is the count shifted by the 4.5 reference value, not the raw count.
plt.xlabel('Passengers per ticket minus 4.5')
plt.ylabel('Ticket')

# Show the plot
plt.show()

In [None]:
# Counts of the 20 most frequent tickets, shifted by 4.5 so bars extend
# left or right of zero; colors come from `c`.
tf = titanic['Ticket'].value_counts(ascending=True).tail(20) - 4.5
tf.plot.barh(color=c)

## Dueling histograms

In [None]:
# Histogram of Age over all rows.
sns.histplot(x='Age', data=titanic)

In [None]:
# One Age histogram per passenger class, drawn on the same axes.
sns.histplot(
    data=titanic,
    hue='Pclass',
    x='Age',
)

In [None]:
# multiple='stack': the class histograms are stacked on top of each other.
sns.histplot(
    data=titanic,
    hue='Pclass',
    x='Age',
    multiple='stack',
)

In [None]:
# kde=True overlays a kernel density curve for each class.
sns.histplot(
    data=titanic,
    hue='Pclass',
    x='Age',
    kde=True,
)

In [None]:
# Stacked class histograms together with the density curves.
sns.histplot(
    data=titanic,
    hue='Pclass',
    x='Age',
    multiple='stack',
    kde=True,
)

In [None]:
# multiple='dodge': within each bin the class bars sit side by side.
sns.histplot(
    data=titanic,
    hue='Pclass',
    x='Age',
    multiple='dodge',
)

In [None]:
# multiple='fill': each bin is normalized to full height, showing class shares per age bin.
sns.histplot(
    data=titanic,
    hue='Pclass',
    x='Age',
    multiple='fill',
)

In [None]:
# multiple='layer': the class histograms are overlaid on one another.
sns.histplot(
    data=titanic,
    hue='Pclass',
    x='Age',
    multiple='layer',
)

In [None]:
# Stacked density curves of Age, one layer per class.
sns.kdeplot(
    data=titanic,
    hue="Pclass",
    x="Age",
    multiple="stack",
)

In [None]:
# One panel per class, each holding that class's Age density curve.
g = sns.FacetGrid(data=titanic, col="Pclass")
g.map_dataframe(
    sns.kdeplot,
    x="Age",
)

In [None]:
# One panel per class, each holding that class's Age histogram.
g = sns.FacetGrid(data=titanic, col="Pclass")
g.map_dataframe(
    sns.histplot,
    x="Age",
)

In [None]:
# catplot with no `kind` uses its default plot type; one column (and color) per class.
sns.catplot(
    data=titanic,
    x="Age",
    col="Pclass",
    hue="Pclass",
)

In [None]:
# Figure-level distribution plot: one row (and color) per class.
sns.displot(
    data=titanic,
    x="Age",
    row="Pclass",
    hue="Pclass",
)

In [None]:
# Overlaid Age density curves, one per class.
sns.kdeplot(
    data=titanic,
    hue='Pclass',
    x='Age',
)

In [None]:
# bw_adjust=.5 scales the smoothing bandwidth by one half, giving less-smoothed curves.
sns.kdeplot(
    data=titanic,
    hue='Pclass',
    x='Age',
    bw_adjust=.5,
)

In [None]:
# Vertical box plots: Age distribution for each class.
sns.boxplot(y='Age', x='Pclass', data=titanic)

In [None]:
# Same comparison with the axes swapped, giving horizontal boxes.
sns.boxplot(y='Pclass', x='Age', data=titanic)

In [None]:
# Optional: uncomment to give the axes a fully transparent background (RGBA alpha 0).
#sns.set_theme(rc={"axes.facecolor": (0, 0, 0, 0)})
# One wide, short panel per class, each with a filled Age density curve.
g = sns.FacetGrid(titanic, row="Pclass", aspect=5, height=3)
g.map_dataframe(sns.kdeplot, x="Age", fill=True, alpha=.2)
# Negative hspace makes the stacked rows overlap vertically.
# `FacetGrid.fig` is deprecated; `figure` is the supported accessor.
g.figure.subplots_adjust(hspace=-0.5)

In [None]:
# Age against Fare, restricted to rows with Fare below 100.
sns.scatterplot(
    data=titanic[titanic['Fare'] < 100],
    x='Age',
    y='Fare',
)

In [None]:
# Grid of pairwise plots built from the frame's columns.
sns.pairplot(titanic)