# Exploratory Data Analysis.

In [None]:
# 1. Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
# Apply seaborn's "whitegrid" theme to the plots created below.
sns.set(style="whitegrid")
# NOTE(review): this suppresses every warning for the rest of the session,
# including pandas deprecation notices — confirm that is intended.
warnings.filterwarnings('ignore')

## 1. Data Cleaning and Preparation

In [None]:
# Load the dataset into a DataFrame.
# NOTE: the relative path is resolved against the current working directory.
df = pd.read_csv("Cardiotocographic.csv")
# Preview the first rows to confirm the file was parsed as expected.
df.head()

In [None]:
# Dimensions of the dataset as (rows, columns).
df.shape

In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame.
df.info()

In [None]:
# Count the missing entries in each column.
df.isna().sum()

In [None]:
# Handle missing values appropriately.
# Mean imputation (continuous)
mean_cols = ["LB", "MSTV", "MLTV", "Width"]
df[mean_cols] = df[mean_cols].fillna(df[mean_cols].mean())
# Median imputation (skewed / outliers)
median_cols = ["AC", "FM", "UC", "DL", "DS", "DP", "ASTV", "ALTV"]
df[median_cols] = df[median_cols].fillna(df[median_cols].median())
# Mode imputation (categorical).
# Assign the filled column back instead of calling fillna(inplace=True) on
# df[col]: that form is chained assignment, which pandas deprecates and which
# leaves df untouched once Copy-on-Write is enabled.
for col in ["Tendency", "NSP"]:
    df[col] = df[col].fillna(df[col].mode()[0])
# NSP = Fetal State Class; keep only rows carrying a valid class label.
# .copy() makes the filtered frame independent of the original, so the .loc
# assignments below write to df itself rather than to a view/temporary.
df = df[df["NSP"].isin([1, 2, 3])].copy()
# Fix invalid negative values (except Tendency).
for col in mean_cols + median_cols:
    negative = df[col] < 0
    if negative.any():
        # Take the median of the valid (non-negative) entries only, so the
        # invalid values being replaced do not bias their own replacement.
        df.loc[negative, col] = df.loc[~negative, col].median()

# Confirm that no missing values remain.
df.isnull().sum()

In [None]:
# Remove duplicate rows (the first occurrence of each row is kept).
df=df.drop_duplicates()

In [None]:
# Inspect the dtype of every column.
# All variables are numerical, so no datatype correction is needed.
df.dtypes

In [None]:
# List the distinct class labels present in the target column.
df["NSP"].unique()

In [None]:
# Convert the target variable from float to int; all other columns keep
# their current dtype.
df = df.astype({"NSP": int})

In [None]:
# Detect outliers with the 1.5 * IQR rule.
# Tendency and NSP hold category codes (they are mode-imputed as categorical
# during cleaning), so IQR fences are not meaningful for them: when one class
# dominates, the IQR collapses to 0 and every other class is counted as an
# "outlier". Restrict the check to the remaining numeric features.
categorical_cols = ["Tendency", "NSP"]
numeric_features = df.drop(columns=categorical_cols, errors="ignore")
Q1 = numeric_features.quantile(0.25)
Q3 = numeric_features.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
# Number of values per column that fall outside the fences.
outliers = ((numeric_features < lower_fence) | (numeric_features > upper_fence)).sum()
outliers

In [None]:
# The outliers are kept: they are clinically meaningful and may indicate
# fetal distress, so removing them blindly would discard medical signal.
# Draw one box per column on a shared axis to visualise them.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, ax=ax)
# Rotate the column names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_title("Boxplot of Cardiotocographic Variables")
plt.show()

## 2. Statistical Summary


In [None]:
# Descriptive statistics, transposed so that each variable is a row.
df.describe().transpose()

In [None]:
# Examine the NSP column more closely, as it is our target variable.
# NSP meaning:
# 1 → Normal
# 2 → Suspect
# 3 → Pathologic
# Count the rows per class, ordered by class label.
df['NSP'].value_counts().sort_index()

In [None]:
# Proportion of rows in each NSP class (fractions that sum to 1).
df['NSP'].value_counts(normalize=True)

In [None]:
# Correlation analysis: pairwise correlations between all columns, then the
# NSP column of that matrix ordered from most positive to most negative.
correlation_matrix = df.corr()
nsp_correlations = correlation_matrix.loc[:, 'NSP'].sort_values(ascending=False)
print("\nTop Correlations with NSP:", nsp_correlations, sep="\n")

## 3. Data Visualization

In [None]:
# Set the style for the plots below: matplotlib's default style sheet
# combined with seaborn's "husl" colour palette.
plt.style.use('default')
sns.set_palette("husl")

In [None]:
# Histograms for Key Variables
key_variables = ['LB', 'AC', 'FM', 'UC', 'ASTV', 'MSTV', 'ALTV', 'MLTV', 'Width', 'Tendency', 'NSP']

fig, axes = plt.subplots(3, 4, figsize=(20, 10))
axes = axes.ravel()

# One histogram per variable, filling the grid row by row.
for ax, var in zip(axes, key_variables):
    df[var].hist(bins=30, ax=ax)
    ax.set_title(f'Distribution of {var}')
    ax.set_xlabel(var)
    ax.set_ylabel('Frequency')

# The 3x4 grid has more panels than variables; hide the unused ones so the
# figure does not end with an empty framed axis.
for ax in axes[len(key_variables):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Boxplots to Identify Outliers
fig, axes = plt.subplots(3, 4, figsize=(15, 10))
axes = axes.ravel()

# One boxplot per key variable, each on its own axis and scale.
for ax, var in zip(axes, key_variables):
    df.boxplot(column=var, ax=ax)
    ax.set_title(f'Boxplot of {var}')

# Hide the grid panels that have no variable assigned, so no empty framed
# axis is drawn.
for ax in axes[len(key_variables):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Correlation Heatmap: annotated matrix of pairwise correlations.
correlation_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Variables')
plt.show()
# Rank the features by the absolute strength of their correlation with NSP
# (the target's correlation with itself is dropped first).
feature_correlations = correlation_matrix['NSP'].drop('NSP')
nsp_correlations = feature_correlations.sort_values(key=lambda s: s.abs(), ascending=False)
print("Top 5 variables most correlated with NSP:")
print(nsp_correlations.head(5))


In [None]:
# Scatter plot of the two variables ranked first in nsp_correlations,
# coloured by fetal state.
top_corr_vars = list(nsp_correlations.index[:2])
if len(top_corr_vars) >= 2:
    x_var, y_var = top_corr_vars
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(data=df, x=x_var, y=y_var, hue='NSP', ax=ax)
    ax.set_title(f'Relationship between {x_var} and {y_var} by Fetal State')
    plt.show()

In [None]:
# Violin Plots: distribution of each feature within every NSP class.
# NSP itself is left out: plotting the target against itself only yields a
# degenerate panel with no spread.
violin_vars = [name for name in key_variables if name != 'NSP']

fig, axes = plt.subplots(3, 4, figsize=(15, 10))
axes = axes.ravel()

for ax, var in zip(axes, violin_vars):
    sns.violinplot(x='NSP', y=var, data=df, ax=ax)
    ax.set_title(f'{var} by Fetal State')

# Hide the grid panels that have no variable assigned.
for ax in axes[len(violin_vars):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Distribution of Target Variable (NSP)
nsp_labels = {1: 'Normal', 2: 'Suspect', 3: 'Pathologic'}

plt.figure(figsize=(8, 6))
# Reindex on the three known classes so a class with no rows shows up as 0
# instead of being absent; this keeps the bar colours aligned with their
# classes and avoids a KeyError in the printout below.
nsp_counts = df['NSP'].value_counts().reindex(list(nsp_labels), fill_value=0)
plt.bar(nsp_counts.index.astype(str), nsp_counts.values, color=['green', 'orange', 'red'])
plt.title('Distribution of Fetal State Classification (NSP)')
plt.xlabel('NSP Value (1=Normal, 2=Suspect, 3=Pathologic)')
plt.ylabel('Number of Cases')
plt.show()

print("NSP Distribution:")
for code, label in nsp_labels.items():
    print(f"{label} ({code}): {nsp_counts[code]} cases")

## 4. Pattern Recognition and Insights

In [None]:
# Pattern 1: relationship between the variables and NSP, summarised as the
# mean of every column within each NSP class.
if 'NSP' in df.columns:
    nsp_grouped = df.groupby('NSP').agg('mean')
    print("Mean Values by NSP:", nsp_grouped, sep="\n")

In [None]:
# Statistical test for differences between NSP groups (one-way ANOVA).
print("\nANOVA Results (comparing means across NSP groups):")
# The class labels do not change inside the loop, so compute them once.
nsp_classes = sorted(df['NSP'].unique())
for var in key_variables:
    # Skip the target itself: within each NSP group it is constant, so the
    # F-statistic is undefined (inf/nan). Also skip names missing from df.
    if var == 'NSP' or var not in df.columns:
        continue
    groups = [df.loc[df['NSP'] == nsp_class, var] for nsp_class in nsp_classes]
    f_stat, p_value = stats.f_oneway(*groups)
    # .4g keeps very small p-values visible instead of rounding them to 0.0000.
    print(f"{var}: F-statistic = {f_stat:.2f}, p-value = {p_value:.4g}")

In [None]:
# Correlation analysis with NSP: build the full correlation matrix, then
# take the target's column ordered from most positive to most negative.
corr_matrix = df.corr()
corr_with_nsp = corr_matrix.loc[:, 'NSP'].sort_values(ascending=False)
corr_with_nsp

In [None]:
# Compare means by NSP category: group the rows by NSP class and take the
# mean of every feature within each class (one row per class).
mean_by_nsp = df.groupby('NSP').mean()
mean_by_nsp

In [None]:
# Identify potential clinical thresholds (using NSP group statistics).
# NSP is the grouping key, so it is excluded from the aggregated columns:
# its per-group mean/median would just repeat the class label.
threshold_features = [name for name in key_variables if name != 'NSP']
# Mean and median of each key feature within every NSP class.
thresholds = df.groupby('NSP')[threshold_features].agg(['mean', 'median'])
thresholds

In [None]:
# Deceleration patterns (DL, DS, DP) across NSP groups: mean of each
# deceleration column within every NSP class.
deceleration_cols = ['DL', 'DS', 'DP']
decel_stats = df.groupby('NSP')[deceleration_cols].agg('mean')
decel_stats

In [None]:
# Variability metrics (ASTV, MSTV, ALTV, MLTV) across NSP groups: mean of
# each variability column within every NSP class.
variability_cols = ['ASTV', 'MSTV', 'ALTV', 'MLTV']
variability_stats = df.groupby('NSP')[variability_cols].agg('mean')
variability_stats

In [None]:
# Multivariate Pattern Recognition: compare the share of pathologic cases
# (NSP == 3) between rows above both medians and rows at or below both.
def _pathological_share(subset):
    """Return the fraction of rows in *subset* with NSP == 3 (0 if none)."""
    return subset['NSP'].value_counts(normalize=True).get(3, 0)


print("\nMULTIVARIATE PATTERNS AND INTERACTIONS")
print("="*50)
print("Interaction analysis - How variables work together:")

# ASTV and DL interaction, split at each column's median.
astv_median = df['ASTV'].median()
dl_median = df['DL'].median()
high_astv_high_dl = df[(df['ASTV'] > astv_median) & (df['DL'] > dl_median)]
low_astv_low_dl = df[(df['ASTV'] <= astv_median) & (df['DL'] <= dl_median)]

high_share = _pathological_share(high_astv_high_dl)
low_share = _pathological_share(low_astv_low_dl)
print(f"High ASTV + High DL: {len(high_astv_high_dl)} cases, {high_share:.1%} pathological")
print(f"Low ASTV + Low DL: {len(low_astv_low_dl)} cases, {low_share:.1%} pathological")

## 5. Conclusion

In [None]:
# Number of rows per NSP class, ordered by class label.
nsp_distribution = df['NSP'].value_counts().sort_index()

In [None]:
# Summary of key insights
print("="*50)
print("EXPLORATORY DATA ANALYSIS SUMMARY")
print("="*50)

print("\n*. TARGET VARIABLE (NSP) DISTRIBUTION:")
for nsp_value, count in nsp_distribution.items():
    percentage = (count / len(df)) * 100
    print(f"   NSP {nsp_value}: {count} cases ({percentage:.1f}%)")

print("\n*. KEY PATTERNS IDENTIFIED:")
# Based on correlation analysis. Split by sign before ranking so each list
# only holds features that really are positively / negatively correlated,
# and order both lists from strongest to weakest.
feature_corr = corr_with_nsp.drop('NSP')
top_correlated = feature_corr[feature_corr > 0].sort_values(ascending=False).head(3)
bottom_correlated = feature_corr[feature_corr < 0].sort_values(ascending=True).head(3)

print("Top 3 features positively correlated with NSP:")
for feature, corr in top_correlated.items():
    print(f"   {feature}: {corr:.3f}")

print("\nTop 3 features negatively correlated with NSP:")
for feature, corr in bottom_correlated.items():
    print(f"   {feature}: {corr:.3f}")

print("\n. RECOMMENDATIONS FOR FURTHER ANALYSIS:")
print("- Consider feature engineering to create additional relevant features")
print("- Apply machine learning models for NSP classification")
print("- Conduct more sophisticated outlier detection and treatment")
print("- Perform cross-validation to ensure model robustness")


In [None]:
# Summary.
# Closing remarks assembled line by line; the leading empty entry produces
# the blank line that precedes the heading when the text is printed.
summary = "\n".join(
    (
        "",
        "Impact on Decision-Making and Further Analysis:",
        "- Variability-based features are critical for fetal health assessment. ",
        "- Useful for feature selection and predictive modeling.",
        "- Supports further machine learning and automated monitoring analysis. ",
    )
)
print(summary)