<table>
<tbody>
<tr><th><b>Variable</b></th><th><b>Definition</b></th><th><b>Key</b></th></tr>
<tr>
<td>survival</td>
<td>Survival</td>
<td>0 = No, 1 = Yes</td>
</tr>
<tr>
<td>pclass</td>
<td>Ticket class</td>
<td>1 = 1st, 2 = 2nd, 3 = 3rd</td>
</tr>
<tr>
<td>sex</td>
<td>Sex</td>
<td></td>
</tr>
<tr>
<td>Age</td>
<td>Age in years</td>
<td></td>
</tr>
<tr>
<td>sibsp</td>
<td># of siblings / spouses aboard the Titanic</td>
<td></td>
</tr>
<tr>
<td>parch</td>
<td># of parents / children aboard the Titanic</td>
<td></td>
</tr>
<tr>
<td>ticket</td>
<td>Ticket number</td>
<td></td>
</tr>
<tr>
<td>fare</td>
<td>Passenger fare</td>
<td></td>
</tr>
<tr>
<td>cabin</td>
<td>Cabin number</td>
<td></td>
</tr>
<tr>
<td>embarked</td>
<td>Port of Embarkation</td>
<td>C = Cherbourg, Q = Queenstown, S = Southampton</td>
</tr>
</tbody>
</table>

In [None]:
# Fetch the course repository; the `utilities` import and the relative data
# path used by later cells are resolved from inside the clone.
!git clone https://github.com/bsherin/LS405exploration
import sys
# Add the working directory to the module search path.
sys.path.append('.')
# Switch into the clone so relative imports and file paths resolve from there.
%cd LS405exploration
# NOTE(review): the star import hides which names come from `utilities`;
# later cells call html_table(), which is presumably one of them - confirm.
from utilities import *
from IPython.display import display, HTML

In [None]:
import pandas as pd
# Alternative Google Drive source, kept for reference:
# file_id = "1lXO7JEO99fLidhHJDdZUpxinXTiU8Vf-"
# url = f'https://drive.google.com/uc?id={file_id}'
# Relative path: resolved against the current working directory.
url = 'corpora/titanic.csv'
df = pd.read_csv(url)
# One dict per row, keyed by column name.
dlist = df.to_dict('records')

# Frequency Distributions

In [None]:
def fix_numbers(df, col):
    """Return the rows of ``df`` that have a usable numeric value in ``col``.

    Rows whose ``col`` entry is missing (NaN/None) or an empty string are
    dropped, and ``col`` is cast to float in the returned frame. The input
    frame is not modified.

    Args:
        df: Source DataFrame.
        col: Name of the column to clean and convert.

    Returns:
        A new DataFrame containing only the rows with a value in ``col``,
        with ``col`` stored as float.

    Raises:
        ValueError: If a remaining value cannot be converted to float.
    """
    # Missing values show up as NaN when the CSV is read with pandas and as ""
    # when the data arrives as raw strings; both must go, otherwise NaN leaks
    # into downstream statistics (t-test / ANOVA return NaN).
    has_value = df[col].notna() & (df[col] != "")
    df_fixed = df[has_value]
    df_fixed = df_fixed.astype({col: "float"})
    return df_fixed

In [None]:
import matplotlib.pyplot as plt
def freq_plot(df, col, by, nbins=10):
    """Plot histograms of ``col``, one panel per distinct value of ``by``.

    All panels share the same ``nbins`` equal-width bins, spanning the
    observed minimum to the observed maximum of ``col``.

    Args:
        df: Source DataFrame.
        col: Numeric column to histogram (cleaned with ``fix_numbers``).
        by: Column whose values define the panels.
        nbins: Number of bins; defaults to 10.
    """
    df_fixed = fix_numbers(df, col)
    values = df_fixed[col].dropna()
    the_max = values.max()
    the_min = values.min()
    if the_max > the_min:
        # Build the edges from the actual data range. Truncating the width to
        # an int and starting at 0 left the top of the range outside the last
        # bin, and produced all-zero edges when the range was below nbins.
        step = (the_max - the_min) / nbins
        bins = [the_min + step * c for c in range(nbins)] + [the_max]
    else:
        # Degenerate range (single value or no data): let matplotlib pick edges.
        bins = nbins
    df_fixed.hist(column=col, by=by, bins=bins)
    plt.show()

In [None]:
# Age histograms, one panel per ticket class, using 18 bins.
freq_plot(df, "Age", "Pclass", 18)

In [None]:
# df_age: rows with an empty Age removed and Age cast to float; reused by the cells below.
df_age = fix_numbers(df, "Age")
# Mean age of passengers with Pclass == 1.
df_age[df_age["Pclass"] == 1]["Age"].mean()

In [None]:
# Mean age of passengers with Pclass == 2.
df_age[df_age["Pclass"] == 2]["Age"].mean()

In [None]:
# Mean age of passengers with Pclass == 3.
df_age[df_age["Pclass"] == 3]["Age"].mean()

# Contingency Table and Chi-Squared Test

In [None]:
def build_ftable(df, row_param, col_param):
    """Display and return a table of counts for two categorical columns.

    The table has one row per distinct value of ``row_param`` and one column
    per distinct value of ``col_param``, both in sorted order. A labelled
    version is rendered with ``html_table`` and shown with ``display``.

    Args:
        df: Source DataFrame.
        row_param: Column whose values label the rows.
        col_param: Column whose values label the columns.

    Returns:
        A list of lists of ints: the counts only, without labels, in the same
        row/column order as the displayed table.
    """
    # Missing labels are excluded up front: NaN cannot be ordered against
    # strings by sorted(), and `== NaN` never matches a row, so a NaN label
    # could only ever contribute an all-zero row or column.
    rlabels = sorted(df[row_param].dropna().unique())
    clabels = sorted(df[col_param].dropna().unique())
    the_table = [[""] + list(clabels)]
    core_table = []
    for rl in rlabels:
        dfr = df[df[row_param] == rl]
        # Count each cell once and reuse it for both tables.
        core_row = [len(dfr[dfr[col_param] == cl]) for cl in clabels]
        the_table.append([rl] + core_row)
        core_table.append(core_row)
    title = f"rows={row_param}, cols={col_param}"
    the_html = html_table(the_table, title=title)
    display(the_html)
    return core_table

In [None]:
# Counts with ticket class as rows and survival as columns.
res_table = build_ftable(df, "Pclass", "Survived")

In [None]:
import numpy as np
# Transposed counts: the rows of res_table become columns.
rtt = np.array(res_table).transpose()
rtt

In [None]:
from scipy.stats import chi2_contingency
def surv_chi(df, row_param):
    """Run a chi-squared contingency test of ``row_param`` against "Survived".

    The count table is built (and displayed) by ``build_ftable`` and passed
    to ``scipy.stats.chi2_contingency``.

    Returns:
        Tuple ``(statistic, p-value, degrees of freedom, expected counts)``.
    """
    counts = build_ftable(df, row_param, "Survived")
    chi2, p_value, deg_free, expected_counts = chi2_contingency(counts)
    return chi2, p_value, deg_free, expected_counts

In [None]:
from scipy.stats import chi2_contingency
def surv_chi_t(df, row_param):
    """Same test as ``surv_chi`` but run on the transposed count table.

    The table from ``build_ftable`` is transposed, displayed through
    ``html_table``, and passed to ``scipy.stats.chi2_contingency``.

    Returns:
        Tuple ``(statistic, p-value, degrees of freedom, expected counts)``.
    """
    counts_t = np.array(build_ftable(df, row_param, "Survived")).T
    display(html_table(counts_t))
    chi2, p_value, deg_free, expected_counts = chi2_contingency(counts_t)
    return chi2, p_value, deg_free, expected_counts

In [None]:
import numpy as np
# Chi-squared test with Pclass as the rows of the count table.
stat, p, dof, expected = surv_chi(df, "Pclass")
print(f"Stat: {stat}")
print(f"p: {p}")
print(f"dof: {dof}")
# Expected counts are rounded for display only.
print(f"expected: {np.round(expected)}")

In [None]:
import numpy as np
# Same test on the transposed count table (Pclass as columns).
stat, p, dof, expected = surv_chi_t(df, "Pclass")
print(f"Stat: {stat}")
print(f"p: {p}")
print(f"dof: {dof}")
# Expected counts are rounded for display only.
print(f"expected: {np.round(expected)}")

# t-test and ANOVA

In [None]:
# Age histograms, one panel per Survived value, using 20 bins.
freq_plot(df, "Age", "Survived", 20)

In [None]:
# Mean age of non-survivors ("0") and survivors ("1").
# Survived is compared through its string form so the selection works whether
# the column was loaded as integers (pandas read_csv) or as strings; comparing
# an integer column to "0"/"1" directly selects no rows and prints nan.
for surv in ["0", "1"]:
    print(df_age[df_age["Survived"].astype(str) == surv]["Age"].mean())

In [None]:
from scipy.stats import ttest_ind

In [None]:
# Independent-samples t-test of Age between non-survivors and survivors.
# Survived is matched through its string form so integer- and string-coded
# columns both work, and missing ages are dropped because ttest_ind returns
# nan as soon as either sample contains a NaN.
ttest_ind(
    df_age.loc[df_age["Survived"].astype(str) == "0", "Age"].dropna(),
    df_age.loc[df_age["Survived"].astype(str) == "1", "Age"].dropna(),
)

In [None]:
from scipy.stats import ttest_ind, f_oneway
dsets = None
def surv_test(df, row, col):
    """Compare the values of ``col`` across the groups defined by ``row``.

    Runs an independent-samples t-test when ``row`` has exactly two distinct
    values and a one-way ANOVA otherwise.

    Args:
        df: Source DataFrame.
        row: Grouping column.
        col: Numeric column whose values are compared between groups.

    Returns:
        The result object of ``scipy.stats.ttest_ind`` (two groups) or
        ``scipy.stats.f_oneway`` (any other number of groups).
    """
    # A NaN group label would match no rows and add an empty sample.
    rnames = list(df[row].dropna().unique())
    # Both tests return nan if any sample contains NaN, so drop missing values
    # per group before testing.
    dsets = [df.loc[df[row] == r, col].dropna() for r in rnames]
    if len(rnames) == 2:
        res = ttest_ind(*dsets)
    else:
        res = f_oneway(*dsets)
    return res

In [None]:
# Compare Age across Survived groups (t-test if there are exactly two groups, one-way ANOVA otherwise).
surv_test(df_age, "Survived", "Age")

In [None]:
# Compare Age across Pclass groups (t-test if there are exactly two groups, one-way ANOVA otherwise).
surv_test(df_age, "Pclass", "Age")

In [None]:
import pandas as pd
from scipy import stats

# Works on `df_age` (Age already cast to float by fix_numbers), not on the raw `df`.

# Step 1: Drop rows with a missing Age or Survived value
dfn = df_age.dropna(subset=['Age', 'Survived'])

# Step 2: Split the ages into survivors and non-survivors
# NOTE(review): these comparisons assume Survived is integer-coded (0/1) - confirm against the loaded data.
survived = dfn[dfn['Survived'] == 1]['Age']
not_survived = dfn[dfn['Survived'] == 0]['Age']

# Step 3: One-way ANOVA on the two age samples using scipy
res = stats.f_oneway(survived, not_survived)
res

In [None]:
# Test statistic of the most recently assigned `res`.
res.statistic

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Step 1: Drop rows with missing data in Age, Pclass, or Survived
dfn = df_age.dropna(subset=['Age', 'Pclass', 'Survived'])

# Step 2: Fit a linear model of Age on Pclass, Survived, and their interaction.
# C(...) marks a column as categorical; `*` adds both main effects plus the interaction term.
# We treat Survived as a factor even though it is binary
model = ols('Age ~ C(Pclass) * C(Survived)', data=dfn).fit()

# Step 3: Compute the two-way ANOVA table from the fitted model (typ=2 selects Type II sums of squares)
anova_table = sm.stats.anova_lm(model, typ=2)

# Step 4: View the ANOVA table
display(html_table(anova_table))

# Clustering

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from scipy.spatial.distance import pdist, squareform

In [None]:
# Columns that are one-hot encoded and compared with Hamming distance.
categorical_cols = ['Survived', 'Pclass', "Sex", "Embarked"]
# Columns compared with Euclidean distance.
numerical_cols = ["Fare"]

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Coerce the numerical columns first so that every later step, including
# `encoded_data`, sees real numbers (unparseable entries become NaN).
df[numerical_cols] = df[numerical_cols].apply(pd.to_numeric, errors='coerce')

# One-hot encode the categorical columns.
encoder = OneHotEncoder(sparse_output=False)
encoded_categorical = encoder.fit_transform(df[categorical_cols])
encoded_data = pd.concat([pd.DataFrame(encoded_categorical), df[numerical_cols].reset_index(drop=True)], axis=1)

# Pairwise distances in condensed (1-D) form.
hamming_condensed = pdist(encoded_categorical, metric='hamming')
numerical_condensed = pdist(df[numerical_cols], metric='euclidean')

# Scale each set of distances to [0, 1] as ONE column of values and only then
# expand to square form. Running MinMaxScaler on the square matrix rescales
# every column independently, which makes the result asymmetric and therefore
# no longer a valid distance matrix.
hamming_distance_matrix = squareform(
    MinMaxScaler().fit_transform(hamming_condensed.reshape(-1, 1)).ravel()
)
numerical_distance_matrix = squareform(
    MinMaxScaler().fit_transform(numerical_condensed.reshape(-1, 1)).ravel()
)

# Weighted blend of the two square, symmetric distance matrices.
combined_distance_matrix = 0.9 * hamming_distance_matrix + 0.1 * numerical_distance_matrix

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Perform hierarchical clustering using the precomputed distance matrix
clustering = AgglomerativeClustering(n_clusters=3, metric='precomputed', linkage='complete')

# metric='precomputed' expects the square (n x n) distance matrix. Only expand
# with squareform() when the distances are in condensed 1-D form: calling it on
# a matrix that is already square converts it the other way, to a vector.
square_distances = combined_distance_matrix
if square_distances.ndim == 1:
    square_distances = squareform(square_distances)

# Fit the clustering model to the combined distance matrix
cluster_labels = clustering.fit_predict(square_distances)

# Add the cluster labels back to the original dataframe
df['cluster'] = cluster_labels

# Show the dataframe with clusters
display(html_table(df))

In [None]:
# Number of rows assigned to each cluster label.
df["cluster"].value_counts().to_dict()

In [None]:
# Rebuilds the clustering, this time normalising the condensed distance vectors.
# NOTE(review): `Collection` is not defined in any visible cell; presumably it comes
# from `from utilities import *` - confirm, otherwise this line raises NameError.
# This also rebinds `df`, replacing the frame loaded with pd.read_csv.
df = Collection["titanic"].df
# Relies on OneHotEncoder, pdist, squareform, MinMaxScaler, pd, categorical_cols
# and numerical_cols from earlier cells.
encoder = OneHotEncoder(sparse_output=False)
encoded_categorical = encoder.fit_transform(df[categorical_cols])

# Compute Hamming distance for the categorical data
hamming_distance_matrix = pdist(encoded_categorical, metric='hamming')

# Compute Euclidean distance for the numerical data
df[numerical_cols] = df[numerical_cols].apply(pd.to_numeric, errors='coerce')
numerical_distance_matrix = pdist(df[numerical_cols], metric='euclidean')

# Normalize the condensed distance matrices (1D form)
scaler = MinMaxScaler()

# Reshape to 2D so that the scaler can be applied (scalers need 2D input)
hamming_distance_matrix_reshaped = hamming_distance_matrix.reshape(-1, 1)
numerical_distance_matrix_reshaped = numerical_distance_matrix.reshape(-1, 1)

# Scale each distance vector separately; fit_transform re-fits the scaler on each call
hamming_distance_matrix_normalized = scaler.fit_transform(hamming_distance_matrix_reshaped).flatten()
numerical_distance_matrix_normalized = scaler.fit_transform(numerical_distance_matrix_reshaped).flatten()

# Combine the normalized distance matrices (adjust the weights as needed)
# Note: `combined_distance_matrix` is a condensed 1-D vector here.
combined_distance_matrix = 0.9 * hamming_distance_matrix_normalized + 0.1 * numerical_distance_matrix_normalized

# Convert the combined distance matrix back into a symmetric square form
combined_distance_matrix_square = squareform(combined_distance_matrix)

# Check if the distance matrix is symmetric
assert (combined_distance_matrix_square == combined_distance_matrix_square.T).all(), "Matrix is not symmetric"

# Now use this combined matrix for clustering
from sklearn.cluster import AgglomerativeClustering

# Perform hierarchical clustering using the combined precomputed distance matrix
clustering = AgglomerativeClustering(n_clusters=3, metric='precomputed', linkage='complete')
cluster_labels = clustering.fit_predict(combined_distance_matrix_square)

# Add the cluster labels back to the original dataframe
df['cluster'] = cluster_labels


In [None]:
# Number of rows assigned to each cluster label.
df["cluster"].value_counts().to_dict()

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Clear the current figure before drawing.
plt.clf()
# Combine numerical and encoded categorical data
# Uses `encoder` / `encoded_categorical` from the most recently run clustering cell.
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(categorical_cols))
combined_data = pd.concat([encoded_categorical_df, df[numerical_cols].reset_index(drop=True)], axis=1)

# Perform t-SNE to reduce the combined data to 2 dimensions
# random_state is fixed so the embedding is repeatable.
tsne = TSNE(n_components=2, random_state=42)
reduced_tsne = tsne.fit_transform(combined_data)

# Plot the clusters
# Points are coloured by the cluster label stored in df['cluster'].
plt.scatter(reduced_tsne[:, 0], reduced_tsne[:, 1], c=df['cluster'], cmap='plasma')
plt.title('Cluster Visualization with t-SNE')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
# plt.colorbar(label='Cluster')
# NOTE(review): `self` is not defined in any visible cell; unless `from utilities import *`
# supplies it, this raises NameError in a notebook. Earlier cells use plt.show() - confirm.
self.create_pyplot_html()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Perform linkage for hierarchical clustering
# linkage() expects a CONDENSED (1-D) distance vector; a 2-D array is treated
# as raw observation vectors and the distances are recomputed between its rows.
# Pass the condensed form straight through and only condense a square matrix.
condensed_distances = combined_distance_matrix
if condensed_distances.ndim == 2:
    # checks=False: skip the symmetry validation when condensing.
    condensed_distances = squareform(condensed_distances, checks=False)
Z = linkage(condensed_distances, method='complete')

# Plot the dendrogram
plt.clf()
plt.figure(figsize=(10, 5))
dendrogram(Z)
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Data Points')
plt.ylabel('Distance')
# NOTE(review): `self` is not defined in any visible cell; unless `from utilities import *`
# supplies it, this raises NameError in a notebook - confirm.
self.create_pyplot_html()