In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [None]:
# Location of the Titanic CSV file in the datasciencedojo/datasets GitHub repository.
CSV_dataset_URL = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
# Download the file and parse it into a DataFrame (this cell needs network access).
dataset_df = pd.read_csv(filepath_or_buffer = CSV_dataset_URL)
# Column labels of the raw, unmodified dataset.
dataset_columns = dataset_df.columns

In [None]:
# Column dtypes show how pandas parsed each feature.
print(dataset_df.dtypes)
# Report the size explicitly, then preview only the first rows through the
# notebook's rich table display instead of printing the whole frame as text.
print('rows, columns =', dataset_df.shape)
dataset_df.head()

In [None]:
# Columns handled as continuous numeric features.
number_columns = [
    'Age',
    'Fare',
]
# Columns handled as categorical features.
categorical_columns = [
    'Sex',
    'Embarked',
    'Survived',
    'Pclass',
    'Parch',
    'SibSp',
]

In [None]:
# Clean and preprocess data
# Drop the Cabin, Name and Ticket columns, then every row that still contains
# a missing value. The calls are chained so the result is built in a single
# expression and nothing is modified in place; re-running the cell always
# starts again from the untouched dataset_df.
preprocessed_df = (
    dataset_df
    .drop(columns = ['Cabin', 'Name', 'Ticket'])
    .dropna(axis = 0)
)
# Label-encoding of the categorical features is currently disabled:
# for category_feature in categorical_columns:
#   category_encoder = LabelEncoder()
#   preprocessed_df[category_feature] = category_encoder.fit_transform(
#       preprocessed_df[category_feature]
#   )
# Show how much data is left and a short preview, rather than printing the
# whole frame as plain text.
print('rows, columns =', preprocessed_df.shape)
preprocessed_df.head()

In [None]:
# Box plot of the numeric 'Age' column
preprocessed_df.boxplot(column = 'Age', figsize = (2, 5), grid = True)

In [None]:
# Box plot of the numeric 'Fare' column
preprocessed_df.boxplot(column = 'Fare', figsize = (2, 10), grid = True)

In [None]:
# Density plots for number columns
age_values = preprocessed_df['Age']
# One-year-wide bins derived from the data. With fixed edges such as
# range(1, 80), any age below 1 or above 79 is silently left out of the
# histogram, so the bars get normalised over a subset of the rows while the
# density curve below is estimated from all of them.
age_bin_edges = np.arange(np.floor(age_values.min()), np.ceil(age_values.max()) + 2)
axis = age_values.plot.hist(density = True, xlim = [-20, 90], bins = age_bin_edges)
axis.set_xlabel('Age in years')
# Overlay the kernel density estimate on the same axes for comparison.
age_values.plot.density(ax = axis)
plt.show()

In [None]:
# Histogram and density estimate of the 'Fare' column
fare_values = preprocessed_df['Fare']
# One-unit-wide bins derived from the data. With fixed edges such as
# range(1, 100), any fare below 1 or above 99 is silently left out of the
# histogram, so the bars get normalised over a subset of the rows and no
# longer share a scale with the density curve, which uses every fare.
# xlim still restricts the visible part of the plot to [-100, 100].
fare_bin_edges = np.arange(np.floor(fare_values.min()), np.ceil(fare_values.max()) + 2)
axis = fare_values.plot.hist(density = True, xlim = [-100, 100], bins = fare_bin_edges)
axis.set_xlabel('Fare in $')
# Overlay the kernel density estimate on the same axes for comparison.
fare_values.plot.density(ax = axis)
plt.show()

In [None]:
# Number of rows per 'Sex' value
sns.catplot(x = 'Sex', data = preprocessed_df, kind = 'count', height = 2)
plt.show()

In [None]:
# Number of rows per 'Embarked' value
sns.catplot(x = 'Embarked', data = preprocessed_df, kind = 'count', height = 2)
plt.show()

In [None]:
# Number of rows per 'Survived' value
sns.catplot(x = 'Survived', data = preprocessed_df, kind = 'count', height = 2)
plt.show()

In [None]:
# Number of rows per 'Pclass' value
sns.catplot(x = 'Pclass', data = preprocessed_df, kind = 'count', height = 2)
plt.show()

In [None]:
# Number of rows per 'Parch' value
sns.catplot(x = 'Parch', data = preprocessed_df, kind = 'count', height = 4)
plt.show()

In [None]:
# Number of rows per 'SibSp' value
sns.catplot(x = 'SibSp', data = preprocessed_df, kind = 'count', height = 4)
plt.show()

In [None]:
#TODO: Plot a correlation matrix for each kind of column pair:
#1)Continuous-Continuous
#2)Categorical-Categorical
#3)Continuous-Categorical
# These cases need separate handling because, as far as I have read, df.corr()
# uses the Pearson method by default, which is meant for continuous values only.

In [None]:
#1)Continuous-Continuous
# Correlation between Age and Fare (DataFrame.corr() uses Pearson by default).
# The coefficient is looked up by its row/column labels rather than by
# position, so the value stays correct if number_columns is ever reordered or
# extended; a missing column raises a KeyError instead of printing the wrong
# number under the 'Age-Fare' label.
age_fare_correlation_coeff = round(
    preprocessed_df[number_columns].corr().loc['Age', 'Fare'], 3
)
print('Age-Fare correlation coeff =', age_fare_correlation_coeff)

In [None]:
# Count how often each combination of values occurs in the given columns
def get_contingency_table_for_pair(dataframe: pd.DataFrame, column_names: list[str]) -> pd.DataFrame:
  """Return the number of rows for each group of values in ``column_names``.

  Args:
    dataframe: Frame that contains every column listed in ``column_names``.
    column_names: Labels of the columns to group by.

  Returns:
    A long-format DataFrame with one row per group: the grouping columns
    followed by a ``size`` column holding the row count of that group.
  """
  # Restrict the frame to the requested columns, then group by those same columns.
  pair_groups = dataframe[column_names].groupby(by = column_names, as_index = False)
  return pair_groups.size()

# Example: row counts for every (Embarked, Sex) combination
embarked_sex_counts = get_contingency_table_for_pair(preprocessed_df, ['Embarked', 'Sex'])
print(embarked_sex_counts)