In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [None]:
# Read the Titanic passenger table from the CSV on disk into `df`,
# then preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='data/titanic.csv')
df.head(n=5)

In [None]:
# Remove the free-text 'Name' column, which is not used in the analysis below.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time no longer raises KeyError because
# 'Name' is already gone.
df = df.drop(columns='Name', errors='ignore')
df.head()

In [None]:
# Print the column names, non-null counts, dtypes and memory usage of `df`.
df.info()

In [None]:
# Count the missing values in each column of `df`.
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for `df`;
# with pandas defaults only numeric columns are reported when any exist.
df.describe()

In [None]:
# Bar chart of the target classes.
# discrete=True gives every integer class its own unit-width bar centred on its
# value, so the bars line up with the 0/1 ticks set below instead of landing in
# narrow auto-computed bins at the edges of the axis.
plt.figure(figsize=(10, 3))
sns.histplot(df['Survived'], kde=False, discrete=True)
plt.title('Survived')
plt.xlabel('Survived')
plt.ylabel('Count')
plt.xticks([0, 1], ['0', '1'])
plt.show()

In [None]:
# One histogram per column of `df`, laid out two per row.
# The grid is sized from the actual number of columns, so no column is silently
# dropped by zip() when there are more than 8, and the figure height keeps the
# original 5 inches per row (20x20 for a 4-row grid).
n_hist_cols = 2
n_hist_rows = max(1, int(np.ceil(len(df.columns) / n_hist_cols)))
fig, axes = plt.subplots(
    ncols=n_hist_cols,
    nrows=n_hist_rows,
    figsize=(20, 5 * n_hist_rows),
    squeeze=False,  # always a 2-D array of Axes, even for a single row
)
for i, ax in zip(df.columns, axes.flat):
    sns.histplot(x=df[i], ax=ax).set(title=f"Histogram of : {i}", xlabel="")
# Hide the panels that received no column, rather than showing empty axes.
for unused_ax in axes.ravel()[len(df.columns):]:
    unused_ax.set_visible(False)
plt.show()

In [None]:
# Print the column names, non-null counts and dtypes of `df` again.
# NOTE(review): this repeats the earlier df.info() cell; no visible cell changes
# `df` in between, so the output should be the same.
df.info()

In [None]:
# Spearman rank-correlation matrix of the numeric features.
# 'Sex' is dropped explicitly, as before. Any other non-numeric column is also
# filtered out before calling corr(), because DataFrame.corr raises on string
# columns from pandas 2.0 onwards instead of silently skipping them.
df_without_sex = df.drop(columns='Sex')
corr = df_without_sex.select_dtypes(include=['number', 'bool']).corr(method='spearman')

# Rich display (same helper the train/test cell uses) instead of print(), so the
# matrix renders as a table rather than wrapped plain text.
display(corr)



In [None]:
# Define features (X) and target (y) on the full data set
X = df.drop('Survived', axis=1)
y = df['Survived']

# Split the rows ONCE and derive every train/test object from that single split.
# Previously two independent splits were run (random_state=42 stratified, then
# random_state=123) and the second one overwrote y_train / y_test, so X_train and
# y_train referred to different passengers. The split kept here is the one whose
# df_train / df_test / y_train / y_test actually survived, so those are unchanged.
# NOTE(review): the discarded split used stratify=y; pass stratify=df['Survived']
# below if class-balanced partitions are wanted (this changes the selected rows).
df_train, df_test = train_test_split(df, test_size=0.2, random_state=123)

X_train, y_train = df_train.drop('Survived', axis=1), df_train['Survived']
X_test, y_test = df_test.drop('Survived', axis=1), df_test['Survived']

# Guard against features and labels drifting apart again.
assert X_train.index.equals(y_train.index)
assert X_test.index.equals(y_test.index)
assert len(df_train) + len(df_test) == len(df)

# Display train and test data
print("\nTRAIN:")
display(df_train.head())
print("SHAPE:", df_train.shape)

print("\nTEST:")
display(df_test.head())
print("SHAPE:", df_test.shape)

In [None]:
# Inspect the training partition: schema, missing values, summary statistics.
# info() prints by itself; the missing-value counts must be displayed explicitly
# because only the last expression of a cell is rendered automatically.
df_train.info()
display(df_train.isna().sum())
df_train.describe()

In [None]:
# Annotated heatmap of the `corr` matrix computed in the correlation cell.
# The figure is created first and the seaborn theme/font scale is applied
# afterwards; the axes are only created at the heatmap call (via plt.gca()).
plt.figure(figsize=(10, 10))
sns.set(font_scale=1.25)
sns.heatmap(
    data=corr,
    ax=plt.gca(),
    annot=True,
    fmt=".2f",
    annot_kws={"size": 10},
    square=True,
    linewidths=1.5,
)
plt.show()

In [None]:
# Preview the first five rows of the training partition.
df_train.head()

In [None]:
# Extract the training labels from the 'Survived' column via `.values` and
# print their shape.
# NOTE(review): this rebinds `y`, which the train/test split cell set to the
# full-data df['Survived'] Series; from here on `y` covers training rows only.
y = df_train["Survived"].values
print(y.shape)

In [None]:
# Training labels taken from the 'Survived' column via `.values`.
# NOTE(review): identical to the assignment in the previous cell.
y = df_train["Survived"].values

In [None]:
# Print the shape of the label array `y`.
# NOTE(review): an earlier cell already prints this same shape.
print(y.shape)

In [None]:
# Class balance of the target: number of passengers per 'Survived' value.
sns.countplot(data=df, x='Survived').set(
    xlabel='Survived',
    ylabel='Count',
    title='Distribution of Survived',
)
plt.show()

# Target versus passenger class: counts per 'Survived' value, split by 'Pclass'.
sns.countplot(data=df, x='Survived', hue='Pclass').set(
    xlabel='Survived',
    ylabel='Count',
    title='Survival by Passenger Class',
)
plt.show()

# Target versus age: distribution of 'Age' within each 'Survived' group.
sns.boxplot(data=df, x='Survived', y='Age').set(
    xlabel='Survived',
    ylabel='Age',
    title='Survival by Age',
)
plt.show()

In [None]:
from scipy.stats import chi2_contingency


def chi2_independence(frame, feature, target='Survived'):
    """Cross-tabulate `feature` against `target` and run a chi-squared test.

    Returns a pair: the contingency table built with pd.crosstab, and the
    result of scipy's chi2_contingency on that table, which unpacks as
    (statistic, p-value, degrees of freedom, expected frequencies).
    """
    table = pd.crosstab(frame[feature], frame[target])
    return table, chi2_contingency(table)


# Chi-squared test for Sex vs. Survived
sex_survived_contingency, (chi2_stat, p_val, dof, expected) = chi2_independence(df, 'Sex')
print("Chi-squared p-value for Sex vs. Survived:", p_val)

# Chi-squared test for Pclass vs. Survived
pclass_survived_contingency, (chi2_stat, p_val, dof, expected) = chi2_independence(df, 'Pclass')
print("Chi-squared p-value for Pclass vs. Survived:", p_val)
