# Hepatitis C Data Analysis

**Author:** Affan Ahammed

Full analysis: EDA, preprocessing, KMeans clustering, PCA, visualizations, saving outputs, and a final summary.


In [None]:
# 1) Setup: install (optional) and imports
# Uncomment the pip installs if running in a fresh Colab environment
# !pip install --quiet pandas matplotlib scikit-learn seaborn scipy

import os, pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy import stats

# Seed passed as random_state to the KMeans and PCA steps below
RANDOM_STATE = 42

# Folder that receives every CSV, PNG, pickle and text file written by this notebook
OUT_DIR = "analysis_outputs"
os.makedirs(OUT_DIR, exist_ok=True)

print('Environment ready. Outputs will be saved to', OUT_DIR)

## 2) Load dataset
Load CSV from the GitHub link provided in the project instructions.

In [None]:
# 2) Load dataset
# Raw CSV is fetched straight from GitHub; pandas reads the URL directly.
GITHUB_CSV = "https://raw.githubusercontent.com/salemprakash/EDA/main/Data/HepatitisCdata.csv"
df = pd.read_csv(filepath_or_buffer=GITHUB_CSV)
raw_shape = df.shape
print('Raw shape:', raw_shape)
df.head(5)

## 3) Quick EDA
Show data types and missing values per column.

In [None]:
# 3) Quick EDA
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
# Per-column count of missing entries
print('\nMissing values per column:\n', df.isna().sum())

## 4) Clean trivial columns
Drop fully empty columns and 'Unnamed' index cols; select numeric columns for analysis.

In [None]:
# 4) Drop empty / unnamed columns and select numeric features
# Columns that contain no data at all carry no information.
df = df.dropna(axis=1, how='all')
# Collect the 'Unnamed...' columns up-front and drop them in one call rather
# than mutating df in place while iterating over its columns. str() keeps the
# check safe when a column label is not a string (e.g. an integer header).
unnamed_cols = [col for col in df.columns if str(col).startswith('Unnamed')]
df = df.drop(columns=unnamed_cols)
# Only numeric dtypes take part in imputation, scaling and clustering.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print('Numeric columns (count):', len(numeric_cols))
numeric_cols[:30]

## 5) Imputation
Impute missing numeric values using column means.

In [None]:
# 5) Mean imputation for numeric columns
# Each missing value is replaced by the mean of its own column.
imputer = SimpleImputer(strategy='mean')
# fit_transform returns a bare ndarray; reuse df's index so the imputed frame
# stays row-aligned with df instead of silently getting a fresh RangeIndex.
df_num = pd.DataFrame(
    imputer.fit_transform(df[numeric_cols]),
    columns=numeric_cols,
    index=df.index,
)
# Total count of remaining NaNs across the whole frame (expected: 0)
print('Any NaNs after imputation?', df_num.isna().sum().sum())

## 6) Outlier detection
Compute absolute z-scores and report the number of extreme rows (rows where |z| > 4 in at least one feature).

In [None]:
# 6) Outlier detection (z-score)
# Standardize the imputed features, then work with magnitudes only.
standardized = stats.zscore(df_num, nan_policy='omit')
z_scores = np.abs(standardized)
# A row counts as extreme when any of its features exceeds the threshold.
exceeds_threshold = z_scores > 4
outlier_mask = exceeds_threshold.any(axis=1)
print('Number of extreme rows (z>4):', outlier_mask.sum())

## 7) Winsorization
Clip each numeric column to [1st percentile, 99th percentile] to reduce extreme outliers.

In [None]:
# 7) Winsorize/clipping
def winsorize_df(df_in, lower=0.01, upper=0.99):
    """Clip every column of ``df_in`` to its own [lower, upper] quantile range.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame whose columns support ``quantile`` and ``clip``.
    lower : float, optional
        Lower quantile, as a fraction in [0, 1]. Defaults to 0.01.
    upper : float, optional
        Upper quantile, as a fraction in [0, 1]. Defaults to 0.99.

    Returns
    -------
    pandas.DataFrame
        A clipped copy; ``df_in`` itself is not modified.

    Raises
    ------
    ValueError
        If the quantiles are not ordered as ``0 <= lower <= upper <= 1``.
    """
    if not 0 <= lower <= upper <= 1:
        raise ValueError(
            f"quantiles must satisfy 0 <= lower <= upper <= 1, got lower={lower}, upper={upper}"
        )
    # Work on a copy so the caller's frame is left untouched.
    df_out = df_in.copy()
    for col in df_out.columns:
        column = df_out[col]
        # Bounds are computed per column, so each feature is clipped on its own scale.
        df_out[col] = column.clip(column.quantile(lower), column.quantile(upper))
    return df_out

# Clip the imputed features, then preview the post-clipping distribution
df_wins = winsorize_df(df_num)
summary_cols = ['mean','std','min','25%','50%','75%','max']
wins_stats = df_wins.describe().T
wins_stats[summary_cols].head()

## 8) Scaling
Standardize the winsorized features with StandardScaler.

In [None]:
# 8) Standard scaling
# Fit the scaler on the winsorized features and transform them in one step.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X=df_wins)
# Wrap the scaled array in a frame that keeps the feature names.
feature_names = df_wins.columns
scaled_df = pd.DataFrame(data=X_scaled, columns=feature_names)
scaled_df.head(5)

## 9) Save scaled features
Save the scaled features as CSV for reproducibility.

In [None]:
# 9) Save scaled dataframe
# Written without the index column so the CSV holds features only.
scaled_csv_path = os.path.join(OUT_DIR, 'scaled_features.csv')
scaled_df.to_csv(scaled_csv_path, index=False)
print('Saved scaled_features.csv')

## 10) Elbow and Silhouette Analysis
Compute inertia and silhouette score for k in [2..7], plot elbow and silhouette charts (inline) and save PNG.

In [None]:
# 10) Elbow + silhouette analysis
def elbow_scores(X, k_range=range(2,8)):
    """Fit KMeans once per k and collect inertia and silhouette score.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``KMeans.fit_predict`` and ``silhouette_score``.
    k_range : iterable of int
        Cluster counts to evaluate, in order.

    Returns
    -------
    tuple of (list, list)
        ``(inertias, sil_scores)``, each with one entry per k in ``k_range``.
    """
    per_k = []
    for n_clusters in k_range:
        model = KMeans(n_clusters=n_clusters, n_init=20, random_state=RANDOM_STATE)
        assignments = model.fit_predict(X)
        per_k.append((model.inertia_, silhouette_score(X, assignments)))
    # Split the (inertia, silhouette) pairs back into two parallel lists.
    inertias = [inertia for inertia, _ in per_k]
    sil_scores = [silhouette for _, silhouette in per_k]
    return inertias, sil_scores

k_range = range(2,8)
inertias, sil_scores = elbow_scores(X_scaled, k_range=k_range)

# Both panels share the same x values and layout; only the series and labels differ.
k_values = list(k_range)
panels = [
    (inertias, 'Elbow: Inertia vs k', 'Inertia'),
    (sil_scores, 'Silhouette Score vs k', 'Silhouette score'),
]

plt.figure(figsize=(10,4))
for position, (series, panel_title, y_label) in enumerate(panels, start=1):
    plt.subplot(1,2,position)
    plt.plot(k_values, series, '-o')
    plt.title(panel_title)
    plt.xlabel('k')
    plt.ylabel(y_label)
    plt.grid(True)
plt.tight_layout()
# Save before show(): the figure is written to disk first, then displayed inline.
plt.savefig(os.path.join(OUT_DIR, 'elbow_silhouette.png'), dpi=200)
plt.show()

print('Silhouette scores:', {k: round(score,3) for k, score in zip(k_range, sil_scores)})

## 11) KMeans clustering
Choose k (3) and run KMeans with multiple inits; compute silhouette score.

In [None]:
# 11) KMeans with chosen k=3
k_opt = 3
# Fit once, then read the training assignments from the fitted estimator.
kmeans = KMeans(n_clusters=k_opt, n_init=50, random_state=RANDOM_STATE).fit(X_scaled)
cluster_labels = kmeans.labels_
# Silhouette of the final partition on the same scaled features
sil_score = silhouette_score(X_scaled, cluster_labels)
print(f'Chosen k = {k_opt}, silhouette score = {sil_score:.4f}')

## 12) Attach cluster labels
Append cluster labels to original dataframe and save CSV.

In [None]:
# 12) Append cluster labels and save
# assign() returns a new frame, so df itself keeps its original columns;
# non-numeric columns of df are carried along unchanged.
df_out = df.assign(cluster=cluster_labels)
clustered_csv_path = os.path.join(OUT_DIR, 'data_with_clusters.csv')
df_out.to_csv(clustered_csv_path, index=False)
print('Saved data_with_clusters.csv')

## 13) Cluster profiling
Compute the mean, standard deviation, and count per cluster for the numeric columns; save as CSV.

In [None]:
# 13) Cluster profiling
# Per-cluster mean / std / count for every numeric column, rounded to 3 decimals
grouped_numeric = df_out.groupby('cluster')[numeric_cols]
cluster_profile = grouped_numeric.agg(['mean','std','count']).round(3)
profile_csv_path = os.path.join(OUT_DIR, 'cluster_profile.csv')
cluster_profile.to_csv(profile_csv_path)
cluster_profile.head()

## 14) PCA
Fit PCA with 2 components for visualization and report explained variance.

In [None]:
# 14) PCA (2 components)
# Project the scaled features onto two components for plotting.
pca = PCA(n_components=2, random_state=RANDOM_STATE)
pcs = pca.fit_transform(X_scaled)
print(
    'Explained variance ratio (2 components):',
    pca.explained_variance_ratio_,
    'sum =',
    pca.explained_variance_ratio_.sum(),
)

## 15) PCA scatter plot
Plot PCA projected points colored by cluster; save PNG and display inline.

In [None]:
# 15) PCA scatter colored by cluster
# Two projected coordinates plus the KMeans label for each row
pc_df = pd.DataFrame(pcs, columns=['PC1','PC2']).assign(cluster=cluster_labels)

# Share of variance carried by each axis, shown in the axis labels
pc1_pct = pca.explained_variance_ratio_[0]*100
pc2_pct = pca.explained_variance_ratio_[1]*100

plt.figure(figsize=(8,6))
sns.scatterplot(data=pc_df, x='PC1', y='PC2', hue='cluster', palette='tab10', s=60, alpha=0.9)
plt.title('PCA (2 components) colored by KMeans cluster')
plt.xlabel(f'PC1 ({pc1_pct:.1f}% var)')
plt.ylabel(f'PC2 ({pc2_pct:.1f}% var)')
plt.legend(title='cluster', loc='best')
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, 'pca_clusters.png'), dpi=200)
plt.show()

## 16) PCA with true labels (if available)
If dataset contains a ground-truth label column, visualize it for comparison.

In [None]:
# 16) PCA colored by true label if present
# First candidate name that exists in df wins; None when no candidate matches.
possible_label_cols = ['Category','Class','Label','Stage']
label_col = next((name for name in possible_label_cols if name in df.columns), None)

if not label_col:
    print('No common label column found. Skipping this plot.')
else:
    # .values drops the index so labels are attached to pc_df by position.
    pc_df['true_label'] = df[label_col].values
    plt.figure(figsize=(8,6))
    sns.scatterplot(data=pc_df, x='PC1', y='PC2', hue='true_label', palette='tab20', s=60, alpha=0.9)
    plt.title('PCA colored by true label')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    # Legend is anchored outside the axes, to the right of the plot.
    plt.legend(title='true_label', bbox_to_anchor=(1.05,1), loc='upper left')
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, 'pca_true_labels.png'), dpi=200)
    plt.show()

## 17) Summary statistics for selected features
Select a few key features (or fall back to the first 3 numeric columns if none of them are present) and save the mean, standard deviation, and count per cluster.

In [None]:
# 17) Summary stats for representative features
features_of_interest = ['Bilirubin','ALT','AST']
# Keep the requested features that are numeric columns; if none match,
# use the first three numeric columns instead.
existing = [name for name in features_of_interest if name in numeric_cols] or numeric_cols[:3]
grouped_selected = df_out.groupby('cluster')[existing]
summary = grouped_selected.agg(['mean','std','count']).round(3)
summary.to_csv(os.path.join(OUT_DIR, 'cluster_summary_selected_features.csv'))
summary

## 18) Save KMeans model
Persist the trained KMeans model to disk using pickle.

In [None]:
# 18) Save KMeans model
# Serialize the fitted estimator in binary mode.
model_path = os.path.join(OUT_DIR, 'kmeans_model.pkl')
with open(model_path, 'wb') as model_file:
    pickle.dump(kmeans, model_file)
print('Saved kmeans_model.pkl')

## 19) Final metrics and cluster sizes
Print silhouette, chosen k, and sizes of each cluster.

In [None]:
# 19) Final metrics
print('FINAL METRICS')
print('Chosen k:', k_opt)
print('Silhouette score:', round(sil_score,4))
# Number of rows assigned to each label 0..k_opt-1
cluster_sizes = [int((cluster_labels==label).sum()) for label in range(k_opt)]
for label, size in enumerate(cluster_sizes):
    print(f'Cluster {label}: size = {size}')

## 20) Boxplots for selected features by cluster
Create boxplots for selected features and save PNG.

In [None]:
# 20) Boxplots (selected features)
# Reuse the feature list from the summary cell when it was run and is non-empty;
# otherwise take the first three winsorized columns.
if 'existing' in globals() and len(existing)>0:
    features_plot = existing
else:
    features_plot = df_wins.columns[:3].tolist()

n_panels = len(features_plot)
plt.figure(figsize=(12,4))
for panel_idx, feature in enumerate(features_plot, start=1):
    # One panel per feature, laid out side by side in a single row
    plt.subplot(1, n_panels, panel_idx)
    sns.boxplot(x=df_out['cluster'], y=df_out[feature])
    plt.title(feature)
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, 'boxplots_by_cluster.png'), dpi=200)
plt.show()

## 21) Save quick textual summary
Save a short analysis_summary.txt summarizing chosen k, silhouette, cluster sizes and list saved files.

In [None]:
# 21) Save summary text file
summary_path = os.path.join(OUT_DIR, 'analysis_summary.txt')
with open(summary_path, 'w') as report:
    report_lines = [
        'Hepatitis C Data Analysis\n',
        f'Chosen k: {k_opt}\n',
        f'Silhouette score: {round(sil_score,4)}\n',
        'Cluster sizes:\n',
    ]
    report_lines.extend(
        f'  Cluster {label}: {int((cluster_labels==label).sum())}\n' for label in range(k_opt)
    )
    report_lines.append('\nSaved files in this folder:\n')
    # The directory is listed after the summary file has been opened for writing,
    # so analysis_summary.txt appears in its own listing.
    report_lines.extend('  ' + fname + '\n' for fname in sorted(os.listdir(OUT_DIR)))
    report.writelines(report_lines)

print('Saved analysis_summary.txt and listed files:')
saved_files = sorted(os.listdir(OUT_DIR))
print('\n'.join(saved_files))