In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency

In [None]:
# Location of the raw Titanic CSV that the whole notebook works from.
CSV_dataset_URL = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

# Download and parse the CSV; keep the column index around for later reference.
dataset_df = pd.read_csv(CSV_dataset_URL)
dataset_columns = dataset_df.columns

In [None]:
# First look at the raw data: per-column dtypes, then the frame itself.
print(dataset_df.dtypes, dataset_df, sep='\n')

In [None]:
# Columns analysed as continuous numeric values.
number_columns = [
    'Age',
    'Fare',
]
# Columns analysed as categories.
categorical_columns = [
    'Sex',
    'Embarked',
    'Survived',
    'Pclass',
    'Parch',
    'SibSp',
]

In [None]:
# Clean and preprocess data:
# drop the columns that are not analysed, then discard every row that still
# has a missing value. The raw frame (dataset_df) is left untouched.
preprocessed_df = (
    dataset_df
    .drop(columns=['Cabin', 'Name', 'Ticket'])
    .dropna(axis=0)
)
print(preprocessed_df)

In [None]:
# boxplots of number columns
# Tall, narrow figure for the single 'Age' box.
preprocessed_df.boxplot(column='Age', figsize=(2, 5), grid=True)

In [None]:
# Box plot of 'Fare', drawn on a taller figure than the 'Age' one.
preprocessed_df.boxplot(column='Fare', figsize=(2, 10), grid=True)

In [None]:
# Density plots for number columns
age_values = preprocessed_df['Age']
# One-year-wide bins that span the whole observed range. Samples that fall
# outside the bin edges are left out of the histogram, and with density = True
# the bars would then be normalised over that subset only, which makes them
# inconsistent with the KDE curve (the KDE is estimated from every sample).
age_bins = range(0, int(age_values.max()) + 2)
axis = age_values.plot.hist(density = True, xlim = [-20, 90], bins = age_bins)
axis.set_xlabel('Age in years')
# Overlay the kernel density estimate on the same axes.
age_values.plot.density(ax = axis)
plt.show()

In [None]:
fare_values = preprocessed_df['Fare']
# One-unit-wide bins that span the whole observed range. Samples that fall
# outside the bin edges are left out of the histogram, and with density = True
# the bars would then be normalised over that subset only, which inflates them
# relative to the KDE curve (the KDE is estimated from every sample).
fare_bins = range(0, int(fare_values.max()) + 2)
# xlim still zooms the view to [-100, 100]; bins beyond it are simply not shown.
axis = fare_values.plot.hist(density = True, xlim = [-100, 100], bins = fare_bins)
axis.set_xlabel('Fare in $')
# Overlay the kernel density estimate on the same axes.
fare_values.plot.density(ax = axis)
plt.show()

In [None]:
# Number of rows per 'Sex' category.
sns.catplot(x='Sex', kind='count', data=preprocessed_df, height=2)
plt.show()

In [None]:
# Number of rows per 'Embarked' category.
sns.catplot(x='Embarked', kind='count', data=preprocessed_df, height=2)
plt.show()

In [None]:
# Number of rows per 'Survived' category.
sns.catplot(x='Survived', kind='count', data=preprocessed_df, height=2)
plt.show()

In [None]:
# Number of rows per 'Pclass' category.
sns.catplot(x='Pclass', kind='count', data=preprocessed_df, height=2)
plt.show()

In [None]:
# Number of rows per 'Parch' category (larger figure than the plots above).
sns.catplot(x='Parch', kind='count', data=preprocessed_df, height=4)
plt.show()

In [None]:
# Number of rows per 'SibSp' category (larger figure, same as 'Parch').
sns.catplot(x='SibSp', kind='count', data=preprocessed_df, height=4)
plt.show()

In [None]:
# TODO: Plot a correlation matrix for each kind of column pair:
# 1) Continuous-Continuous
# 2) Categorical-Categorical
# 3) Continuous-Categorical
# As far as I have read, df.corr() uses the Pearson method by default,
# which is only appropriate for continuous values.

In [None]:
# 1) Continuous-Continuous
# Correlation between 'Age' and 'Fare', rounded to 3 decimals.
# The coefficient is looked up by label rather than by position, so the result
# does not depend on the order in which the columns are listed in number_columns.
age_fare_correlation_coeff = round(
    preprocessed_df[number_columns].corr().loc['Age', 'Fare'], 3
)
print('Age-Fare correlation coeff =', age_fare_correlation_coeff)

In [None]:
# create contingency table for columns pair
def get_contingency_matrix_for_columns_pair(dataframe: pd.DataFrame, column_names: list[str]):
  """Build the contingency (cross-tabulation) matrix for two categorical columns.

  Args:
    dataframe: frame that contains both columns.
    column_names: names of two distinct columns; the first one defines the
      rows of the matrix, the second one defines its columns.

  Returns:
    A list of rows (list of lists of ints). Rows follow the order in which the
    first column's categories first appear in the data; columns follow the
    sorted order of the second column's categories. A combination that never
    occurs is reported as 0 in its own cell, so every position in a row always
    refers to the same category of the second column.
  """
  first_column_name, second_column_name = column_names[0], column_names[1]
  # Rows with a missing value in either column cannot be assigned to a cell.
  pair_df = dataframe[[first_column_name, second_column_name]].dropna()
  # crosstab fills absent combinations with 0 in the correct column; appending
  # only the observed counts and padding on the right would shift later counts
  # into the wrong column whenever a combination is missing.
  contingency_table = pd.crosstab(
      pair_df[first_column_name], pair_df[second_column_name]
  )
  # crosstab sorts the row labels; restore the order of first appearance.
  first_column_categories = pd.unique(pair_df[first_column_name])
  contingency_matrix = contingency_table.reindex(first_column_categories).to_numpy().tolist()

  return contingency_matrix

# Quick manual check of the helper on the 'Embarked' / 'Parch' pair.
contigency_matrix_test = get_contingency_matrix_for_columns_pair(
    dataframe=preprocessed_df,
    column_names=['Embarked', 'Parch'],
)
print(contigency_matrix_test)

In [None]:
# chi-square tests of independence of categorical columns pairs
# Maps (first_column, second_column) -> result returned by chi2_contingency.
chi2_tests_for_categories = {}
for category_i in categorical_columns:
  for category_j in categorical_columns:
    categories_pair = (category_i, category_j)
    # The test is symmetric: skip a pair already stored in the opposite order.
    if categories_pair[::-1] in chi2_tests_for_categories:
      continue
    # A column is never tested against itself.
    if category_i == category_j:
      continue
    # Skip a pair that has already been stored in this same order.
    if categories_pair in chi2_tests_for_categories:
      continue
    contingency_ij_matrix = get_contingency_matrix_for_columns_pair(
        preprocessed_df, [category_i, category_j]
    )
    chi2_test_result = chi2_contingency(contingency_ij_matrix)
    chi2_tests_for_categories[categories_pair] = chi2_test_result

In [None]:
# show only accepted alternate hypothesis(categories pair has relation)
p_value_threshold = 0.05
# Maps each significant pair -> its p-value rounded to 4 decimals for display.
pvalues_categories_pair_with_relationship = {}
for categories_pair, chi2_test_result in chi2_tests_for_categories.items():
  current_pair_pvalue = chi2_test_result.pvalue
  # Compare the unrounded p-value with the threshold: rounding first would turn
  # a value such as 0.04996 into 0.05 and wrongly reject a significant pair.
  if current_pair_pvalue < p_value_threshold:
    pvalues_categories_pair_with_relationship[categories_pair] = round(current_pair_pvalue, 4)

print('Categories pair with relationship:')
for pair_pvalue in pvalues_categories_pair_with_relationship.items():
  print(pair_pvalue)