# In [1]:
# Dataset filename set in the first notebook cell.
# NOTE(review): no code visible in this file reads this module-level variable;
# load_csv_to_list() prompts the user and uses its own local `file_path`.
file_path = "lung_cancer_data (2).csv"

# In [None]:
import csv
import pandas as pd
import matplotlib.pyplot as plt

# Global variable to store the file path chosen in Task A.
# Written by load_csv_to_list() and read by load_dataframe(), which refuses to
# run while this is still None.
GLOBAL_FILE_PATH = None


# ========= Helper Function to Print COM731 ==========
def print_header():
    """Print the banner: "COM731" on three lines, then a rule of 50 dashes."""
    banner_lines = ["COM731"] * 3 + ["-" * 50]
    for banner_line in banner_lines:
        print(banner_line)


# =====================================================
#                 TASK A — CSV + LIST
# =====================================================

def load_csv_to_list():
    """
    Prompt for a CSV path and load the file into a list of dictionaries.

    Rows are read with csv.DictReader, so each row is a dict keyed by the
    header names.

    Side effects:
        Prints the banner and status messages. On success, stores the chosen
        path in GLOBAL_FILE_PATH so load_dataframe() can reuse it. On failure,
        resets GLOBAL_FILE_PATH to None so a bad path is never handed on.

    Returns:
        The list of row dictionaries, or None if the file could not be read.
    """
    global GLOBAL_FILE_PATH
    print_header()

    file_path = input("Enter CSV filename or full path (e.g., lung_cancer_data (2).csv): ").strip()

    try:
        # 'utf-8-sig' decodes plain UTF-8 too, but also drops a leading BOM
        # that would otherwise end up glued to the first column name.
        with open(file_path, mode='r', newline='', encoding='utf-8-sig') as f:
            data_list = list(csv.DictReader(f))

    except FileNotFoundError:
        GLOBAL_FILE_PATH = None
        print("ERROR: File not found. Ensure the file is in the COM731-Assessment folder.")
        return None

    except (OSError, UnicodeDecodeError, csv.Error) as exc:
        # e.g. the path is a directory, permission denied, or not UTF-8 text.
        GLOBAL_FILE_PATH = None
        print(f"ERROR: Could not read '{file_path}': {exc}")
        return None

    # Remember the path only once the file has actually been read.
    GLOBAL_FILE_PATH = file_path
    print("CSV successfully loaded!")
    return data_list


# ---------------- A1 ----------------

def task_A1(data_list):
    """
    Task A1: print the demographic details of one patient.

    Prompts for a Patient ID, finds the first row in `data_list` whose
    "Patient_ID" equals it exactly, and prints that row's demographic
    columns. Prints a "not found" message when no row matches.
    """
    print("COM731\nCOM731\nCOM731")
    wanted_id = input("Enter Patient ID to retrieve demographic info: ").strip()

    # First matching record, or None when the ID is absent.
    patient = next(
        (record for record in data_list if record["Patient_ID"] == wanted_id),
        None,
    )
    if patient is None:
        print("Patient ID not found.")
        return

    print("\nDemographic Information:")
    labelled_columns = (
        ("Patient ID", "Patient_ID"),
        ("Age", "Age"),
        ("Gender", "Gender"),
        ("Smoking History", "Smoking_History"),
        ("Ethnicity", "Ethnicity"),
    )
    for label, column in labelled_columns:
        print(f"{label}:", patient[column])

# ---------------- A2 ----------------

def task_A2(data_list):
    """
    Task A2: print the medical history of every patient of one ethnicity.

    Prompts for an ethnicity and compares it case-insensitively with each
    row's "Ethnicity". For every match, prints family history, diabetes,
    kidney disease and haemoglobin level followed by a "----" separator.
    Prints a "no records" message when nothing matches.
    """
    print("COM731\nCOM731\nCOM731")
    ethnicity = input("Enter ethnicity: ").strip()

    print(f"\nMedical history for ethnicity: {ethnicity}\n")

    labelled_columns = (
        ("Family History", "Family_History"),
        ("Diabetes", "Comorbidity_Diabetes"),
        ("Kidney Disease", "Comorbidity_Kidney_Disease"),
        ("Haemoglobin Level", "Haemoglobin_Level"),
    )
    target = ethnicity.lower()
    match_count = 0

    for record in data_list:
        if record["Ethnicity"].lower() != target:
            continue
        match_count += 1
        for label, column in labelled_columns:
            print(f"{label}:", record[column])
        print("----")

    if match_count == 0:
        print("No records found for that ethnicity.")


# ---------------- A3 ----------------

def task_A3(data_list):
    """
    Task A3: list patients on a given treatment who survived over 100 months.

    Prompts for a treatment name (matched case-insensitively against the
    "Treatment" column) and prints age, tumour size, tumour location, stage
    and survival months for every matching row whose "Survival_Months" is
    greater than 100.

    Rows whose "Survival_Months" is blank or not a number are skipped, since
    they cannot satisfy the threshold; previously one such cell aborted the
    whole task with an exception.
    """
    print("COM731\nCOM731\nCOM731")
    treatment = input("Enter treatment name: ").strip()

    print(f"\nPatients surviving >100 months on {treatment}\n")
    wanted = treatment.lower()
    found = False

    for row in data_list:
        if row["Treatment"].lower() != wanted:
            continue

        try:
            survival = float(row["Survival_Months"])
        except (TypeError, ValueError):
            # Missing / malformed value: treat as "does not qualify".
            continue

        if survival > 100:
            found = True
            print("Age:", row["Age"])
            print("Tumor Size (mm):", row["Tumor_Size_mm"])
            print("Tumor Location:", row["Tumor_Location"])
            print("Stage:", row["Stage"])
            print("Survival Months:", row["Survival_Months"])
            print("-" * 20)

    if not found:
        print("No patients found surviving >100 months on that treatment.")

# ---------------- A4 ----------------


def task_A4(data_list):
    """
    Task A4 (custom condition): patients with pulse > 90 and tumour size < 20 mm.

    Prints patient ID, age, tumour size, pulse and white blood cell count for
    every row whose "Blood_Pressure_Pulse" is above 90 and whose
    "Tumor_Size_mm" is below 20.

    Rows where either value is blank or not a number are skipped instead of
    raising, so a single bad cell no longer aborts the task.
    """

    def _to_float(text):
        # Return the numeric value of a CSV cell, or None if it is unusable.
        try:
            return float(text)
        except (TypeError, ValueError):
            return None

    print("COM731\nCOM731\nCOM731")
    print("Custom Condition: Patients with Pulse > 90 and Tumor Size < 20 mm\n")

    found = False
    for row in data_list:
        pulse = _to_float(row["Blood_Pressure_Pulse"])
        tumor_size = _to_float(row["Tumor_Size_mm"])
        if pulse is None or tumor_size is None:
            continue

        if pulse > 90 and tumor_size < 20:
            found = True
            print("Patient ID:", row["Patient_ID"])
            print("Age:", row["Age"])
            print("Tumor Size (mm):", row["Tumor_Size_mm"])
            print("Pulse:", row["Blood_Pressure_Pulse"])
            print("White Blood Cell Count:", row["White_Blood_Cell_Count"])
            print("-" * 20)

    if not found:
        print("No patients found matching the custom condition.")


# =====================================================
#                 TASK B — PANDAS ANALYSIS
# =====================================================

def load_dataframe():
    """
    Load the CSV chosen in Task A into a pandas DataFrame.

    Reads the path stored in GLOBAL_FILE_PATH. Prints the banner and a status
    message in every case.

    Returns:
        The loaded DataFrame, or None when no path has been chosen yet or the
        file cannot be read or parsed. Returning None (rather than raising)
        keeps the interactive menu alive so the user can try again.
    """
    print_header()

    if GLOBAL_FILE_PATH is None:
        print("ERROR: Load CSV first using Task A.")
        return None

    try:
        df = pd.read_csv(GLOBAL_FILE_PATH)
    except FileNotFoundError:
        print(f"ERROR: File not found: {GLOBAL_FILE_PATH}. Load CSV again using Task A.")
        return None
    except (OSError, UnicodeDecodeError,
            pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # Unreadable path, wrong encoding, empty file, or malformed CSV.
        print(f"ERROR: Could not load DataFrame from {GLOBAL_FILE_PATH}: {exc}")
        return None

    print("DataFrame successfully loaded.")
    return df


# -------------- B1 --------------

def task_B1(df):
    """
    Task B1: show the three most frequent treatments for one ethnicity.

    Prompts for an ethnicity (case-insensitive match on "Ethnicity"), keeps
    only rows with "Survival_Months" above 100, and prints the three largest
    entries of the "Treatment" value counts. Prints a message and returns
    early when no row qualifies.
    """
    print("COM731\nCOM731\nCOM731")

    ethnicity = input("Enter ethnicity: ").strip()

    # Build the two row masks separately, then pull just the column we count.
    same_ethnicity = df["Ethnicity"].str.lower().eq(ethnicity.lower())
    long_survival = df["Survival_Months"].gt(100)
    treatments = df.loc[same_ethnicity & long_survival, "Treatment"]

    if treatments.empty:
        print("No matching patients found.")
        return

    # value_counts() is already sorted by frequency; keep the first three.
    top_treatments = treatments.value_counts().iloc[:3]

    print(f"\nTop 3 treatments for ethnicity '{ethnicity}' (Survival >100 months):\n")
    print(top_treatments)


# -------------- B2 --------------

def task_B2(df):
    """
    Task B2: average white blood cell count for an ethnicity/treatment pair.

    Prompts for an ethnicity and a treatment, matches both case-insensitively
    against the "Ethnicity" and "Treatment" columns, and prints the mean of
    "White_Blood_Cell_Count" for the matching rows to two decimal places.
    Prints a message and returns early when no row matches.
    """
    print("COM731\nCOM731\nCOM731")

    ethnicity = input("Enter ethnicity: ").strip()
    treatment = input("Enter treatment: ").strip()

    ethnicity_match = df["Ethnicity"].str.lower().eq(ethnicity.lower())
    treatment_match = df["Treatment"].str.lower().eq(treatment.lower())
    wbc_counts = df.loc[ethnicity_match & treatment_match, "White_Blood_Cell_Count"]

    if wbc_counts.empty:
        print("No data found for this ethnicity and treatment.")
        return

    avg_wbc = wbc_counts.mean()
    print(f"\nAverage White Blood Cell Count for {ethnicity} patients on {treatment}: {avg_wbc:.2f}")

# -------------- B3 --------------

def task_B3(df):
    """
    Task B3: average smoking pack years by tumour location for a subgroup.

    Keeps rows with "Blood_Pressure_Pulse" above 90 and "Tumor_Size_mm" below
    15, groups them by "Tumor_Location", and prints the mean of
    "Smoking_Pack_Years" per location. Prints a message and returns early
    when no row satisfies both conditions.
    """
    print("COM731\nCOM731\nCOM731")

    # Row masks for the two criteria.
    high_pulse = df["Blood_Pressure_Pulse"].gt(90)
    small_tumor = df["Tumor_Size_mm"].lt(15)
    subset = df.loc[high_pulse & small_tumor]

    if subset.empty:
        print("No patients match the criteria.")
        return

    # Mean pack years per tumour location within the subgroup.
    pack_years_by_location = (
        subset["Smoking_Pack_Years"].groupby(subset["Tumor_Location"]).mean()
    )

    print("\nAverage Smoking Pack Years by Tumor Location (Pulse>90 & Tumor Size<15mm):\n")
    print(pack_years_by_location)


# -------------- B4 (Unique Analysis) --------------

def task_B4(df):
    """
    Task B4 (unique analysis): mean tumour size for each cancer stage.

    Groups the "Tumor_Size_mm" column by "Stage" and prints the per-stage
    mean.
    """
    print("COM731\nCOM731\nCOM731")
    print("Unique Analysis: Average Tumor Size per Cancer Stage\n")

    tumor_size_by_stage = df["Tumor_Size_mm"].groupby(df["Stage"]).mean()
    print(tumor_size_by_stage)

# =====================================================
#           TASK C — MATPLOTLIB VISUALISATIONS
# =====================================================

# -------------- C1 --------------


def task_C1(df):
    """
    Task C1: bar chart of how many patients of one ethnicity got each treatment.

    Prompts for an ethnicity (case-insensitive match on "Ethnicity"), counts
    the "Treatment" values among the matching rows, and shows the counts as a
    bar chart. Prints a message and returns early when no row matches.
    """
    print("COM731\nCOM731\nCOM731")

    ethnicity = input("Enter ethnicity: ").strip()

    ethnicity_match = df["Ethnicity"].str.lower().eq(ethnicity.lower())
    subset = df.loc[ethnicity_match]

    if subset.empty:
        print("No data found for that ethnicity.")
        return

    # One bar per treatment, height = number of patients.
    counts = subset["Treatment"].value_counts()
    treatment_names, patient_totals = counts.index, counts.values

    plt.bar(treatment_names, patient_totals)
    plt.title(f"Treatment Proportions for {ethnicity}")
    plt.xlabel("Treatment Type")
    plt.ylabel("Number of Patients")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# -------------- C2 --------------


def task_C2(df):
    """
    Task C2: line chart of average smoking pack years per stage, by ethnicity.

    For every distinct "Ethnicity" value, computes the mean of
    "Smoking_Pack_Years" at each "Stage" (stages in sorted order) and plots
    it as one labelled line, then shows the figure.
    """
    print("COM731\nCOM731\nCOM731")

    stages = sorted(df["Stage"].unique())

    for eth in df["Ethnicity"].unique():
        # Narrow to this ethnicity once, then average stage by stage.
        eth_rows = df[df["Ethnicity"] == eth]
        avg_values = [
            eth_rows.loc[eth_rows["Stage"] == stage, "Smoking_Pack_Years"].mean()
            for stage in stages
        ]
        plt.plot(stages, avg_values, marker='o', label=eth)

    plt.title("Trend of Average Smoking Pack Years per Stage (by Ethnicity)")
    plt.xlabel("Cancer Stage")
    plt.ylabel("Average Smoking Pack Years")
    plt.legend()
    plt.tight_layout()
    plt.show()
    

# -------------- C3 --------------

def task_C3(df):
    """
    Task C3: line chart comparing average blood pressure readings per treatment.

    Groups `df` by "Treatment", takes the mean of the systolic, diastolic and
    pulse columns, plots each as a labelled line and shows the figure.

    The final plt.show() was missing: without it this chart was not
    displayed by this function, and its lines stayed on the current figure
    where the next chart drawn would pick them up.
    """
    print("COM731\nCOM731\nCOM731")

    # Column name -> legend label, in plotting order.
    bp_series = {
        "Blood_Pressure_Systolic": "Systolic",
        "Blood_Pressure_Diastolic": "Diastolic",
        "Blood_Pressure_Pulse": "Pulse",
    }

    # Group by treatment and compute mean BP values
    grouped = df.groupby("Treatment")[list(bp_series)].mean()

    for column, label in bp_series.items():
        plt.plot(grouped.index, grouped[column], marker='o', label=label)

    plt.xlabel("Treatment Type")
    plt.ylabel("Blood Pressure")
    plt.title("Average Blood Pressure Measurements per Treatment Type")
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()

# -------------- C4 (Unique Graph) --------------


def task_C4(df):
    """
    Task C4 (unique chart): bar chart of mean survival months per cancer stage.

    Groups "Survival_Months" by "Stage", takes the mean, and shows one bar
    per stage.
    """
    print("COM731\nCOM731\nCOM731")
    print("Unique Visualisation: Average Survival Months per Cancer Stage\n")

    survival_by_stage = df["Survival_Months"].groupby(df["Stage"]).mean()
    stage_names, mean_months = survival_by_stage.index, survival_by_stage.values

    plt.bar(stage_names, mean_months)
    plt.title("Average Survival Duration by Cancer Stage")
    plt.xlabel("Cancer Stage")
    plt.ylabel("Average Survival Months")
    plt.tight_layout()
    plt.show()


# =====================================================
#                       MENU
# =====================================================

def main_menu():
    """
    Run the interactive COM731 menu until the user chooses "0".

    Options 1 and 6 load the data (as a list of dicts and as a DataFrame
    respectively); options 2-5 run the Task A functions on the list and
    options 7-14 run the Task B/C functions on the DataFrame.

    Behaviour notes:
        * The typed choice is stripped, so " 1" or "0 " are accepted.
        * "Loaded" means "is not None": a CSV that loads to an empty list is
          still passed to the tasks (which print their own "not found"
          messages) instead of being reported as never loaded.
    """
    data_list = None
    df = None

    menu_lines = (
        "\n====== COM731 MENU ======",
        "1. Load CSV (Task A start)",
        "2. Task A1 – Demographics",
        "3. Task A2 – Medical History",
        "4. Task A3 – Treatment Survival",
        "5. Task A4 – Custom Condition",
        "6. Load DataFrame (Task B start)",
        "7. Task B1 – Top 3 Treatments",
        "8. Task B2 – Average WBC",
        "9. Task B3 – Avg Smoking Packs",
        "10. Task B4 – Unique Analysis",
        "11. Task C1 – Treatment Chart",
        "12. Task C2 – Smoking Trend Chart",
        "13. Task C3 – BP Comparison Chart",
        "14. Task C4 – Unique Chart",
        "0. Exit",
    )
    list_choices = ("2", "3", "4", "5")
    frame_choices = ("7", "8", "9", "10", "11", "12", "13", "14")

    while True:
        for line in menu_lines:
            print(line)

        choice = input("Select an option: ").strip()

        if choice == "0":
            print("Goodbye!")
            break

        elif choice == "1":
            data_list = load_csv_to_list()

        elif choice == "6":
            df = load_dataframe()

        elif choice in list_choices:
            if data_list is None:
                print("Load CSV first.")
            else:
                # Task A functions all take the list of row dictionaries.
                list_tasks = {
                    "2": task_A1,
                    "3": task_A2,
                    "4": task_A3,
                    "5": task_A4,
                }
                list_tasks[choice](data_list)

        elif choice in frame_choices:
            if df is None:
                print("Load DataFrame first.")
            else:
                # Task B and C functions all take the DataFrame.
                frame_tasks = {
                    "7": task_B1,
                    "8": task_B2,
                    "9": task_B3,
                    "10": task_B4,
                    "11": task_C1,
                    "12": task_C2,
                    "13": task_C3,
                    "14": task_C4,
                }
                frame_tasks[choice](df)

        else:
            print("Invalid choice. Try again.")


# Run program: start the interactive menu loop as soon as this cell/script runs.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this file as a module would also start the menu.
main_menu()


# --- Captured cell output (menu options) ---
# 1. Load CSV (Task A start)
# 2. Task A1 – Demographics
# 3. Task A2 – Medical History
# 4. Task A3 – Treatment Survival
# 5. Task A4 – Custom Condition
# 6. Load DataFrame (Task B start)
# 7. Task B1 – Top 3 Treatments
# 8. Task B2 – Average WBC
# 9. Task B3 – Avg Smoking Packs
# 10. Task B4 – Unique Analysis
# 11. Task C1 – Treatment Chart
# 12. Task C2 – Smoking Trend Chart
# 13. Task C3 – BP Comparison Chart
# 14. Task C4 – Unique Chart
# 0. Exit
