# EDA for DS3

In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
import math

In [None]:
import os
# Show the current working directory; the dataset below is loaded via a relative path
print(os.getcwd())

## Initial Analysis of the Dataset

In [None]:
# Load the DS3 CSV; the path is relative to the current working directory
df = pd.read_csv("../../data/DS3.csv")

In [None]:
# Discard every column whose name contains "unnamed" (case-insensitive match)
df = df.drop(columns=df.columns[df.columns.str.contains('unnamed', case=False)])

In [None]:
# Summary statistics, transposed so that each described column becomes a row
df.describe().transpose()

In [None]:
# Per-column overview: name, number of non-null entries and dtype
col_summary = pd.DataFrame(
    {
        "Column": df.columns,
        "Non-Null Count": df.notna().sum().to_numpy(),
        "Dtype": df.dtypes.to_numpy(),
    }
)
print(col_summary)

In [None]:
# Remove duplicate rows, keeping the first occurrence of each.
# Unnecessary for the dataset generated by 9f448db (it has no duplicates),
# but left in for completeness' sake.
df = df.drop_duplicates(keep="first")

In [None]:
# Oxidation info is missing; it would have to be re-derived by guessing it with
# pymatgen, which is risky. // TODO: Dataset13
# df = df.drop(df.columns[df.columns.str.contains('oxidation', case=False)], axis=1)
# The charged/discharged ID columns are not needed here, so drop them
df = df.drop(columns=['charged_id', 'discharged_id'])

In [None]:
# Data type distribution: number of columns per dtype
df.dtypes.value_counts()

## Distributions

In [None]:
targets = ['specific_capacity', 'delta_V', 'specific_energy']

# One figure per target: histogram with KDE on the left, box plot on the right
for target in targets:
    figure, (hist_ax, box_ax) = plt.pyplot.subplots(nrows=1, ncols=2, figsize=(12, 4))

    sns.histplot(df[target], kde=True, ax=hist_ax)
    hist_ax.set_title(f"Distribution of {target}")

    sns.boxplot(x=df[target], ax=box_ax)
    box_ax.set_title(f"Outliers in {target}")

    plt.pyplot.tight_layout()
    plt.pyplot.show()

In [None]:
# Mean, skewness, excess kurtosis and standard deviation for each target column.
# NaNs are dropped before the statistics are computed.
stats = [
    {
        'Feature': name,
        'Mean': values.mean(),
        'Skewness': skew(values),
        # fisher=True yields excess kurtosis (Fisher's definition)
        'Kurtosis': kurtosis(values, fisher=True),
        'Std Deviation': values.std(),
    }
    for name, values in ((target, df[target].dropna()) for target in targets)
]

stats_df = pd.DataFrame(stats)
print(stats_df)

In [None]:
# Correlation between all numeric columns and the target columns:
# compute the full numeric correlation matrix, then keep only the target rows
numeric_cols = df.select_dtypes(include=[np.number]).columns
corr = df[numeric_cols].corr().loc[targets, :]

plt.pyplot.figure(figsize=(16, 6))
sns.heatmap(data=corr, cmap='magma', center=0)
plt.pyplot.title("Correlation Heatmap")
plt.pyplot.show()

In [None]:
# Full correlation matrix over every numeric column, drawn as a heatmap
numeric_cols = df.select_dtypes(include=[np.number]).columns
corr = df[numeric_cols].corr()
sns.heatmap(data=corr, cmap='magma', center=0)
plt.pyplot.title("Correlation Heatmap")
plt.pyplot.show()

In [None]:
# Pairwise plots of the target columns; y=1.02 places the figure title just above the grid
target_frame = df[targets]
sns.pairplot(target_frame)
plt.pyplot.suptitle("Pairwise Relationships between Target Variables", y=1.02)
plt.pyplot.show()

In [None]:
def plot_numeric_distributions(df, n_cols=4):
    """Plot a histogram with a KDE overlay for every numeric column of ``df``.

    The plots are arranged on a grid with ``n_cols`` columns and as many rows
    as are needed; grid cells left over after the last plot are removed.

    Args:
        df: DataFrame whose numeric columns (as selected by
            ``select_dtypes(include=[np.number])``) are plotted.
        n_cols: Number of subplot columns in the grid. Defaults to 4.

    Returns:
        None. The figure is displayed via ``pyplot.show()``. If ``df`` has no
        numeric columns, nothing is drawn.

    Raises:
        ValueError: If ``n_cols`` is less than 1.
    """
    if n_cols < 1:
        raise ValueError(f"n_cols must be at least 1, got {n_cols}")

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    n_plots = len(numeric_cols)
    if n_plots == 0:
        # Nothing to plot: skip figure creation instead of requesting a grid
        # with zero rows.
        return

    n_rows = math.ceil(n_plots / n_cols)
    # squeeze=False keeps `axes` a 2-D array for every grid shape (including
    # 1x1), so it can always be flattened.
    fig, axes = plt.pyplot.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False
    )
    axes = axes.flatten()

    for ax, col in zip(axes, numeric_cols):
        sns.histplot(df[col], kde=True, ax=ax)
        ax.set_title(col)

    # Remove the trailing grid cells that did not receive a plot
    for unused_ax in axes[n_plots:]:
        fig.delaxes(unused_ax)

    plt.pyplot.tight_layout()
    plt.pyplot.show()

# Draw the histogram grid for the numeric columns of df
plot_numeric_distributions(df)
