# In [None]:
# Zarin Tasnim Biash

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
from sklearn.metrics import silhouette_score
import scipy.cluster.hierarchy as shc


# Load the raw input tables from /content. The files are read in the order
# listed; each name is bound to the DataFrame parsed from the matching path.
coverage, mortality, prevalence, screening_method = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/coverage.csv',
        '/content/mortality.csv',
        '/content/prevalence.csv',
        '/content/screening_method.csv',
    )
]
(
    disability_adjusted_life_yrs,
    years_lived_with_disability,
    years_lost_with_disability,
    prevalence_of_cancer,
    incidence_of_cancer,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/disability_adjusted_life_years.csv',
        '/content/Years_lived_with_disability.csv',
        '/content/Years_lost_with_disability.csv',
        '/content/prevalence_of_cancer.csv',
        '/content/incidence_of_cancer.csv',
    )
]

# print(coverage.head())
# print(mortality.head())
# print(prevalence.head())
# print(screening_method.head())


# Standardize Location name value
def output(filepath: str, data):
    """Write ``data`` to ``filepath`` as comma-separated text.

    The column header row is written and the index is omitted.

    Args:
        filepath: Destination passed straight to ``data.to_csv``.
        data: Object exposing a pandas-style ``to_csv`` method.
    """
    csv_options = {'sep': ',', 'header': True, 'index': False}
    data.to_csv(filepath, **csv_options)

# Use the mortality table as the reference for location names: for every
# location_id, take the location_name found on the first mortality row that
# carries that id.
location_id_unique = mortality['location_id'].unique()
_first_rows_per_location = (
    mortality.dropna(subset=['location_id'])  # NaN ids can never match a row
    .drop_duplicates(subset='location_id')    # keep the first occurrence only
)
location_name_by_id = dict(
    zip(_first_rows_per_location['location_id'],
        _first_rows_per_location['location_name'])
)
# Kept as a list of [location_id, location_name] pairs, as before; ids with
# no usable name (e.g. NaN) get an empty string.
location_unique = [
    [location_id, location_name_by_id.get(location_id, '')]
    for location_id in location_id_unique
]
# print(location_unique)

# Overwrite location_name in each table wherever its location_id is known to
# the mortality table. Rows with unknown ids are left untouched. This is done
# with one vectorised lookup per table instead of scanning every row once per
# location id.
for _table in (
    prevalence_of_cancer,
    incidence_of_cancer,
    years_lived_with_disability,
    years_lost_with_disability,
    disability_adjusted_life_yrs,
):
    _known_id = _table['location_id'].isin(list(location_name_by_id))
    _table.loc[_known_id, 'location_name'] = (
        _table.loc[_known_id, 'location_id'].map(location_name_by_id)
    )

# Coverage Dataset
# Keep only the 2019 rows, discard columns that are entirely empty or not
# needed downstream, and prefix the value columns with "Coverage".
coverage_columns_to_drop = ['Period', 'IndicatorCode', 'Indicator', 'ValueType', 'ParentLocationCode', 'ParentLocation', 'Location type', 'SpatialDimValueCode', 'Period type', 'IsLatestYear', 'FactValueTranslationID', 'FactComments', 'Language', 'DateModified']
coverage_column_renames = {
    'FactValueNumericLow': 'CoverageFactValueNumericLow',
    'FactValueNumericHigh': 'CoverageFactValueNumericHigh',
    'Value': 'CoverageValue',
}
coverage_cleaned = (
    coverage.dropna(axis=1, how='all')
    .loc[lambda table: table['Period'] == 2019]
    .drop(columns=coverage_columns_to_drop)
    .rename(columns=coverage_column_renames)
)

coverage_cleaned.to_csv('coverage_cleaned.csv', index=False)

# Mortality Dataset
# Keep only rows whose metric is 'Percent', drop the identifier/bound columns,
# and rename the remaining location/value columns for merging.
mortality_columns_to_drop = ['measure_name', 'metric_id', 'measure_id', 'location_id', 'sex_id', 'upper', 'lower', 'sex_name', 'cause_id', 'year', 'metric_name', 'cause_name', 'age_id']
percent_rows = mortality['metric_name'] == 'Percent'
mortality_cleaned = (
    mortality.loc[percent_rows]
    .drop(columns=mortality_columns_to_drop)
    .rename(columns={'location_name': 'Location', 'val': 'Mortality_val'})
)
mortality_cleaned.to_csv('mortality_cleaned.csv', index=False)

# Prevalence Dataset
# Keep the 2019 rows whose Dim3 is 'in lifetime', then drop every column that
# is not needed downstream. ('Period' used to be listed twice in the drop
# list; the duplicate label has been removed.)
prevalence_columns_to_drop = ['Period', 'IndicatorCode', 'Indicator', 'ValueType', 'ParentLocationCode', 'Value', 'FactValueNumericHigh', 'FactValueNumericLow', 'ParentLocation', 'Location type', 'SpatialDimValueCode', 'Period type', 'Language', 'DateModified', 'IsLatestYear', 'Dim1 type', 'Dim1', 'Dim1ValueCode', 'Dim2 type', 'Dim2', 'Dim2ValueCode', 'Dim3 type', 'Dim3', 'Dim3ValueCode']
prevalence_cleaned = (
    prevalence.dropna(axis=1, how='all')
    .loc[lambda table: (table['Period'] == 2019) & (table['Dim3'] == 'in lifetime')]
    .drop(columns=prevalence_columns_to_drop)
    .rename(columns={'FactValueNumeric': 'PrevalenceFactValueNumeric'})
)
prevalence_cleaned.to_csv('prevalence_cleaned.csv', index=False)

# Screening Dataset
# Only the 2019 rows are kept; all-empty columns and the metadata columns
# listed below are removed, and 'Value' becomes 'ScreeningValue'.
screening_method_columns_to_drop = ['Period', 'IndicatorCode', 'Indicator', 'ValueType', 'ParentLocationCode', 'ParentLocation', 'Location type', 'SpatialDimValueCode', 'Period type', 'IsLatestYear', 'FactValueTranslationID', 'FactComments', 'Language', 'DateModified']
screening_method_cleaned = (
    screening_method.dropna(axis=1, how='all')
    .loc[lambda table: table['Period'] == 2019]
    .drop(columns=screening_method_columns_to_drop)
    .rename(columns={'Value': 'ScreeningValue'})
)
screening_method_cleaned.to_csv('screening_method_cleaned.csv', index=False)

def _clean_measure_table(raw_table, columns_to_drop, value_column):
    """Return a trimmed copy of ``raw_table`` ready for merging.

    Columns that are entirely NaN are removed first, then ``columns_to_drop``
    are dropped, and finally ``'val'`` is renamed to ``value_column`` and
    ``'location_name'`` to ``'Location'``.

    Args:
        raw_table: DataFrame holding at least the columns in
            ``columns_to_drop`` after all-NaN columns are removed.
        columns_to_drop: Column labels to remove.
        value_column: New name for the ``'val'`` column.

    Returns:
        A new DataFrame; ``raw_table`` is not modified.

    Raises:
        KeyError: If a label in ``columns_to_drop`` is not present.
    """
    trimmed = raw_table.dropna(axis=1, how='all').drop(columns=columns_to_drop)
    return trimmed.rename(columns={'val': value_column, 'location_name': 'Location'})


# The five tables below go through the same clean-up; they differ only in the
# drop list and in the name given to the value column.

#Disability_adjusted_life_yrs
disability_adjusted_columns_to_drop = ['measure_id', 'measure_name', 'location_id', 'sex_id', 'sex_name', 'age_id', 'cause_id', 'cause_name', 'rei_id', 'rei_name', 'metric_id', 'metric_name', 'year', 'upper', 'lower']
disability_adjusted_life_yrs_cleaned = _clean_measure_table(
    disability_adjusted_life_yrs,
    disability_adjusted_columns_to_drop,
    'Disability_Adjusted_Life_Yrs_Value',
)
disability_adjusted_life_yrs_cleaned.to_csv('disability_adjusted_life_yrs_cleaned.csv', index=False)

#Years_lived_with_disability
yrs_lived_with_disability_columns_to_drop = ['measure_id', 'measure_name', 'location_id', 'sex_id', 'sex_name', 'age_id', 'cause_id', 'cause_name', 'rei_id', 'rei_name', 'metric_id', 'metric_name', 'year', 'upper', 'lower']
yrs_lived_with_disability_cleaned = _clean_measure_table(
    years_lived_with_disability,
    yrs_lived_with_disability_columns_to_drop,
    'Yrs_Lived_With_Disability_Value',
)
yrs_lived_with_disability_cleaned.to_csv('yrs_lived_with_disability_cleaned.csv', index=False)

#Years_lost_with_disability
yrs_lost_with_disability_columns_to_drop = ['measure_id', 'measure_name', 'location_id', 'sex_id', 'sex_name', 'age_id', 'cause_id', 'cause_name', 'rei_id', 'rei_name', 'metric_id', 'metric_name', 'year', 'upper', 'lower']
yrs_lost_with_disability_cleaned = _clean_measure_table(
    years_lost_with_disability,
    yrs_lost_with_disability_columns_to_drop,
    'Yrs_Lost_With_Disability_Value',
)
yrs_lost_with_disability_cleaned.to_csv('yrs_lost_with_disability_cleaned.csv', index=False)

#Prevalence_of_cancer
prevalence_of_cancer_columns_to_drop = ['measure_id', 'measure_name', 'location_id', 'sex_id', 'sex_name', 'age_id', 'cause_id', 'cause_name', 'metric_id', 'metric_name', 'year', 'upper', 'lower']
prevalence_of_cancer_cleaned = _clean_measure_table(
    prevalence_of_cancer,
    prevalence_of_cancer_columns_to_drop,
    'Prevalence_of_cancer_value',
)
prevalence_of_cancer_cleaned.to_csv('prevalence_of_cancer_cleaned.csv', index=False)

#Incidence_of_cancer
incidence_of_cancer_columns_to_drop = ['measure_id', 'measure_name', 'location_id', 'sex_id', 'sex_name', 'age_id', 'cause_id', 'cause_name', 'metric_id', 'metric_name', 'year', 'upper', 'lower']
# Uses the incidence drop list; the prevalence list was previously passed here
# by mistake (harmless only while the two lists stay identical).
incidence_of_cancer_cleaned = _clean_measure_table(
    incidence_of_cancer,
    incidence_of_cancer_columns_to_drop,
    'Incidence_of_cancer_value',
)
incidence_of_cancer_cleaned.to_csv('incidence_of_cancer_cleaned.csv', index=False)

# Merging datasets on the 'Location' column
merged_df1 = coverage_cleaned.merge(prevalence_cleaned, on='Location', how='outer').merge(screening_method_cleaned, on='Location', how='outer')
merged_df1.to_csv('merged_dataset.csv', index=False)

# Outer-join the per-location, per-age measure tables one after another,
# starting from the disability-adjusted life years table.
merged_df2 = disability_adjusted_life_yrs_cleaned
for _measure_table in (
    mortality_cleaned,
    yrs_lived_with_disability_cleaned,
    yrs_lost_with_disability_cleaned,
    prevalence_of_cancer_cleaned,
    incidence_of_cancer_cleaned,
):
    merged_df2 = pd.merge(merged_df2, _measure_table, on=['Location', 'age_name'], how='outer')

# Left join: only locations present in merged_df1 are kept.
final_df = pd.merge(merged_df1, merged_df2, on='Location', how='left')

# Remove rows whose coverage or screening answer is one of the non-answers.
unwanted_values = ["Not applicable", "Don't know", "No response"]
_has_unwanted_answer = (
    final_df['CoverageValue'].isin(unwanted_values)
    | final_df['ScreeningValue'].isin(unwanted_values)
)
final_df = final_df[~_has_unwanted_answer]
final_df = final_df.dropna(axis=1, how='all')
# Take an explicit copy so the .loc assignments below write to an independent
# frame rather than to a filtered derivative, which can trigger
# SettingWithCopyWarning on pandas versions without copy-on-write.
final_df = final_df.dropna(subset=['Mortality_val']).copy()

# Translate the two categorical coverage answers into numeric low/high bounds.
_coverage_bounds = {
    'Less than 10': (0, 10),
    '70 or more': (70, 100),
}
for _answer, (_low, _high) in _coverage_bounds.items():
    _rows_with_answer = final_df['CoverageValue'] == _answer
    final_df.loc[_rows_with_answer, 'CoverageFactValueNumericLow'] = _low
    final_df.loc[_rows_with_answer, 'CoverageFactValueNumericHigh'] = _high

final_df.to_csv('final_clean_dataset.csv', index=False)
# print(final_df.head())

# Selecting the columns we want to scale (excluding categorical variables if needed)
numerical_features = ['CoverageFactValueNumericLow', 'CoverageFactValueNumericHigh', 'PrevalenceFactValueNumeric', 'Disability_Adjusted_Life_Yrs_Value',
                      'Mortality_val', 'Yrs_Lived_With_Disability_Value', 'Yrs_Lost_With_Disability_Value', 'Prevalence_of_cancer_value',
                      'Incidence_of_cancer_value']

# Rescale every selected column to the [0, 1] range with Min-Max scaling.
scaler = MinMaxScaler()
final_df[numerical_features] = scaler.fit_transform(final_df[numerical_features])

final_df.to_csv('final_clean_dataset_standardized.csv', index=False)

# Clustering input: everything except the identifier, categorical, and
# coverage-bound columns.
saved_data = final_df
saved_data = saved_data.drop(['Location', 'CoverageValue', 'age_name', 'CoverageFactValueNumericLow', 'CoverageFactValueNumericHigh', 'ScreeningValue'], axis=1)

# Remember the feature columns before the label column is appended, so the
# dendrogram below is built from the features only.
clustering_feature_columns = list(saved_data.columns)

# n_clusters=None with distance_threshold=0 asks scikit-learn to build the
# full merge tree without cutting it.
agg_cluster = AgglomerativeClustering(n_clusters=None, distance_threshold=0, linkage='ward')
agg_cluster_labels = agg_cluster.fit_predict(saved_data)
saved_data['Agglomerative_Cluster'] = agg_cluster_labels

# Visualize the dendrogram for Agglomerative Clustering.
# The linkage previously used all of saved_data, including the unscaled
# integer 'Agglomerative_Cluster' column just added, which distorted the Ward
# distances; only the feature columns are used now.
plt.figure(figsize=(10, 7))
plt.title("Dendrogram for Agglomerative Clustering")
dend = shc.dendrogram(shc.linkage(saved_data[clustering_feature_columns], method='ward'))

plt.show()

'''
Duc Dung Le
K-Means clustering
'''
def read_file(filepath: str, header: int, sep: str):
    """Parse a delimited text file into a pandas DataFrame.

    Args:
        filepath: Location of the file, forwarded to ``pd.read_csv``.
        header: Row number to use as the column names.
        sep: Field delimiter.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(filepath, sep=sep, header=header)
    return frame

def _plot_kmeans_scatter(frame, y_column, labels, title, y_label, figure_path):
    """Scatter the screening prevalence against ``y_column``, coloured by cluster.

    The figure is written to ``figure_path`` before it is shown, because
    showing a figure first can leave nothing to save.
    """
    plt.scatter(frame['PrevalenceFactValueNumeric'], frame[y_column], c=labels, cmap='viridis')
    plt.title(label=title)
    plt.xlabel(xlabel='Prevalence to screening methods')
    plt.ylabel(ylabel=y_label)
    plt.savefig(figure_path)
    plt.show()


def task_kMeans():
    """Run 4-cluster K-Means on the standardized dataset and plot the result.

    Reads '/content/final_clean_dataset_standardized.csv', prints the unique
    'CoverageValue' and 'ScreeningValue' entries, then writes five figures
    (/content/fig0.png through /content/fig4.png): the unclustered data
    followed by four scatter plots coloured by K-Means label.

    Returns:
        None.
    """
    data = read_file(
        filepath='/content/final_clean_dataset_standardized.csv',
        header=0,
        sep=','
    )
    unique_coverage = data['CoverageValue'].unique()
    print(unique_coverage)
    unique_method = data['ScreeningValue'].unique()
    print(unique_method)

    # Raw overview plot. No colour values are supplied, so no colormap is
    # passed either. Save before show so the PNG is not blank.
    plt.scatter(data['PrevalenceFactValueNumeric'], data['Prevalence_of_cancer_value'])
    plt.title(label='Dataset')
    plt.xlabel(xlabel='Prevalence to screening methods')
    plt.ylabel(ylabel='Prevalence to cancer')
    plt.savefig('/content/fig0.png')
    plt.show()

    # Features used for clustering; each is min-max scaled to [0, 1].
    clustering_columns = [
        'PrevalenceFactValueNumeric',
        'Prevalence_of_cancer_value',
        'Yrs_Lived_With_Disability_Value',
        'Yrs_Lost_With_Disability_Value',
        'Incidence_of_cancer_value',
        'Disability_Adjusted_Life_Yrs_Value',
        'Mortality_val'
    ]
    min_max_data = data.copy()
    min_max_data[clustering_columns] = MinMaxScaler().fit_transform(data[clustering_columns])
    min_max_clustering = min_max_data[clustering_columns]

    # NOTE(review): no random_state is set, so cluster assignments may differ
    # between runs.
    kmeans = KMeans(n_clusters = 4)
    kmeans.fit(min_max_clustering)
    labels = kmeans.labels_

    # (y column, title, y-axis label, output path) for each clustered view.
    cluster_views = [
        (
            'Prevalence_of_cancer_value',
            'Prevalence to screening methods + Prevalence to cancer',
            'Prevalence to cancer',
            '/content/fig1.png',
        ),
        (
            'Incidence_of_cancer_value',
            'Prevalence to screening methods + Incidence to cancer',
            'Incidence to cancer',
            '/content/fig2.png',
        ),
        (
            'Yrs_Lived_With_Disability_Value',
            'Prevalence to screening methods + Years lived with disability',
            'Years lived with disability',
            '/content/fig3.png',
        ),
        (
            'Mortality_val',
            'Prevalence to screening methods + Mortality_val clustering',
            'Mortality percentage',
            '/content/fig4.png',
        ),
    ]
    for y_column, title, y_label, figure_path in cluster_views:
        _plot_kmeans_scatter(min_max_clustering, y_column, labels, title, y_label, figure_path)


task_kMeans()

#Lukas

# DBSCAN clustering on the standardized dataset.
# NOTE(review): this reads from /content/, whereas the standardized file is
# written earlier with a relative path - confirm the working directory is
# /content when this runs.
data = pd.read_csv("/content/final_clean_dataset_standardized.csv", delimiter=",")

# Remove identifier, categorical, and coverage-bound columns so that only the
# remaining columns are passed to DBSCAN.
non_feature_columns = ['Location', 'CoverageValue', 'age_name', 'CoverageFactValueNumericLow','CoverageFactValueNumericHigh', 'ScreeningValue']
data = data.drop(non_feature_columns, axis=1)

dbscan = DBSCAN(eps=0.15, min_samples=5)
dbscan.fit(data)
clusters = dbscan.labels_

# One scatter plot per outcome column, always against the screening
# prevalence and coloured by DBSCAN label.
dbscan_views = (
    ("Prevalence_of_cancer_value", "Prevalence of cancer"),
    ("Incidence_of_cancer_value", "Incidence of cancer"),
    ("Yrs_Lived_With_Disability_Value", "Years lived with disability"),
    ("Mortality_val", "Mortality"),
)
for outcome_column, outcome_label in dbscan_views:
    plt.scatter(data["PrevalenceFactValueNumeric"], data[outcome_column], c=clusters)
    plt.xlabel("Prevalence of screening")
    plt.ylabel(outcome_label)
    plt.show()