# Cancer Dataset - Missing Values and Outlier Analysis

This notebook performs data exploration, cleaning, and outlier handling on the UAE Cancer Dataset.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Path of the raw CSV export, relative to the notebook's working directory.
file_path = '_cancer_dataset_uae.csv'

# Read the comma-separated, UTF-8 encoded file into the working DataFrame.
df = pd.read_csv(
    file_path,
    sep=',',
    encoding='utf-8',
)


In [None]:
# Quick orientation: first rows, schema/dtypes, and summary statistics.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.describe())


In [None]:
# Count the null entries in each column, then report them.
null_counts = df.isnull().sum()
print("Missing values per column:")
print(null_counts)


In [None]:
# Count the rows that repeat an earlier row, and drop them if there are any.
duplicate_count = df.duplicated().sum()
if duplicate_count > 0:
    print(f"Duplicates found: {duplicate_count}")
    df = df.drop_duplicates()
    print("Duplicates removed.")
else:
    print("No duplicate rows found.")


In [None]:
# Normalise the header: trim whitespace, lower-case, underscores for spaces.
normalised_names = df.columns.str.strip()
normalised_names = normalised_names.str.lower()
df.columns = normalised_names.str.replace(' ', '_')

# Apply the same trim + lower-case clean-up to the values of object columns.
text_columns = df.select_dtypes(include='object').columns
for text_col in text_columns:
    trimmed = df[text_col].str.strip()
    df[text_col] = trimmed.str.lower()

# Parse the date columns; errors='coerce' turns unparseable values into NaT.
for date_col in ('diagnosis_date', 'treatment_start_date', 'death_date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')


In [None]:
# Share of null entries per column, as a percentage of the row count.
null_counts_per_column = df.isnull().sum()
row_count = len(df)
missing_percentage = (null_counts_per_column / row_count) * 100

print("Percentage of missing values per column:")
print(missing_percentage)


In [None]:
# Replace nulls in these columns with an explicit placeholder label.
# NOTE(review): a missing cause_of_death is labelled 'alive' here; confirm
# that this holds for rows that do have a death_date.
placeholder_by_column = {
    'comorbidities': 'none',
    'cause_of_death': 'alive',
}
for target_col, placeholder in placeholder_by_column.items():
    df[target_col] = df[target_col].fillna(placeholder)


In [None]:
# Screen the numeric columns for outliers with the 1.5 * IQR rule, plot each
# distribution, and build `df_filtered` without the flagged rows.
#
# Every fence is computed from `df`, so the per-column outlier masks can be
# combined and applied in one pass. Only rows counted as outliers are removed:
# a missing value compares False against both fences, so it is not flagged and
# its row is kept. (A `>= lower & <= upper` range filter would silently drop
# such rows without including them in the reported outlier count.)
keep_rows = pd.Series(True, index=df.index)
for column in ['age', 'weight', 'height']:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # True for values strictly outside the fences.
    is_outlier = (df[column] < lower_bound) | (df[column] > upper_bound)
    outliers = df[is_outlier]
    print(f"{column.capitalize()} - Outliers found: {len(outliers)}")

    plt.figure(figsize=(8, 5))
    sns.boxplot(x=df[column])
    plt.title(f"Box Plot for {column.capitalize()}")
    plt.grid(True)
    plt.show()

    # A row survives only if it is not an outlier in any screened column.
    keep_rows = keep_rows & ~is_outlier

# Copy so later edits to df_filtered cannot affect df.
df_filtered = df[keep_rows].copy()


In [None]:
# Compare the size of the working frame with the outlier-filtered frame.
rows_before = len(df)
rows_after = len(df_filtered)
print(f"Original dataset rows: {rows_before}")
print(f"Cleaned dataset rows: {rows_after}")

# Write the filtered frame to disk; index=False omits the row index column.
df_filtered.to_csv("cancerSummary.csv", index=False)
