In [99]:
import pandas as pd

# Initialize the combined dataframe
df_combined = pd.DataFrame()

def add_cancer_data_from_csv(csv_file_path, cancer_type, df_combined):
    """
    This function reads cancer data from a CSV file, skips irrelevant rows, sets proper headers,
    adds a 'Cancer Type' column, and appends the data to an existing DataFrame.
    
    Parameters:
    csv_file_path (str): The path to the CSV file containing the new cancer data.
    cancer_type (str): The type of cancer for the new data.
    df_combined (DataFrame): The existing combined dataframe to which new data will be appended.
    
    Returns:
    DataFrame: The updated combined dataframe.
    """
    # Read the CSV file into a DataFrame, skipping the first 4 lines of metadata, and load headers from row 5
    df_new_cancer = pd.read_csv(csv_file_path, skiprows=4, header=1)
    
    # Drop rows from index 20 to 30 (which correspond to the metadata rows you want to exclude)
    df_new_cancer.drop(df_new_cancer.index[-11:], inplace=True)
    
    # Ensure proper column names and reset index to avoid duplicated rows
    df_new_cancer.columns = ["Age at Diagnosis", "All Races (Rate per 100,000)", "All Races (Number of Cases)",
                             "Asian / Pacific Islander (Rate per 100,000)", "Asian / Pacific Islander (Number of Cases)",
                             "Non-Hispanic Black (Rate per 100,000)", "Non-Hispanic Black (Number of Cases)",
                             "Hispanic (Rate per 100,000)", "Hispanic (Number of Cases)", 
                             "Non-Hispanic White (Rate per 100,000)", "Non-Hispanic White (Number of Cases)"]

    # Add the cancer type column
    df_new_cancer['Cancer Type'] = cancer_type
    
    # Append the new data to the existing dataframe
    df_combined = pd.concat([df_combined, df_new_cancer], ignore_index=True)
    
    return df_combined

# Example usage:
# df_combined = add_cancer_data_from_csv('path_to_file.csv', 'Brain Cancer', df_combined)



df_combined = add_cancer_data_from_csv('brain.csv', 'Brain Cancer', df_combined)

df_combined

Unnamed: 0,Age at Diagnosis,"All Races (Rate per 100,000)",All Races (Number of Cases),"Asian / Pacific Islander (Rate per 100,000)",Asian / Pacific Islander (Number of Cases),"Non-Hispanic Black (Rate per 100,000)",Non-Hispanic Black (Number of Cases),"Hispanic (Rate per 100,000)",Hispanic (Number of Cases),"Non-Hispanic White (Rate per 100,000)",Non-Hispanic White (Number of Cases),Cancer Type
0,<1,2.9,65,*a,*a,*a,*a,2.4,27.0,3.6,23.0,Brain Cancer
1,1-4,3.7,344,3.1,45,4.2,24,3.3,155.0,4.2,111.0,Brain Cancer
2,5-9,2.8,343,2.4,42,2.4,18,2.6,163.0,3.4,115.0,Brain Cancer
3,10-14,2.4,315,1.7,29,2.4,19,2.1,144.0,3.2,114.0,Brain Cancer
4,15-19,1.7,219,1.8,33,*a,*a,1.3,89.0,2.1,80.0,Brain Cancer
5,20-24,1.8,246,1.5,29,*a,*a,1.7,113.0,2.3,92.0,Brain Cancer
6,25-29,2.3,345,1.4,33,1.6,17,2.1,138.0,3.1,149.0,Brain Cancer
7,30-34,3.0,434,2.3,60,1.6,15,2.1,127.0,4.3,221.0,Brain Cancer
8,35-39,3.3,457,2.1,53,2.8,23,2.6,146.0,4.6,224.0,Brain Cancer
9,40-44,4.1,522,3.3,75,2.8,21,3.2,169.0,5.8,251.0,Brain Cancer


In [100]:
df_combined = add_cancer_data_from_csv('bone.csv', 'Bone Cancer', df_combined)

In [101]:
df_combined = add_cancer_data_from_csv('breast.csv', 'Breast Cancer', df_combined)

In [102]:
df_combined = add_cancer_data_from_csv('Cervix Uteri.csv', 'Cervical Cancer', df_combined)

In [103]:
df_combined = add_cancer_data_from_csv('colon.csv', 'Colon Cancer', df_combined)

In [104]:
df_combined = add_cancer_data_from_csv('Rectum.csv', 'Rectum Cancer', df_combined)

In [105]:
df_combined = add_cancer_data_from_csv('Esophagus.csv', 'Esophagus Cancer', df_combined)

In [106]:
df_combined = add_cancer_data_from_csv('Gallbladder.csv', 'Gallbladder Cancer', df_combined)

In [107]:
df_combined = add_cancer_data_from_csv('Hodgkin Lymphoma.csv', 'Hodgkin Lymphoma', df_combined)

In [108]:
df_combined = add_cancer_data_from_csv('Kidney.csv', 'Kidney Cancer', df_combined)

In [109]:
df_combined = add_cancer_data_from_csv('Larynx.csv', 'Larynx Cancer', df_combined)

In [110]:
df_combined = add_cancer_data_from_csv('Leukemia.csv', 'Leukemia', df_combined)

In [111]:
df_combined = add_cancer_data_from_csv('Liver.csv', 'Liver', df_combined)

In [112]:
df_combined = add_cancer_data_from_csv('Lung and Bronchus.csv', 'Lung and Bronchus Cancer', df_combined)

In [113]:
df_combined = add_cancer_data_from_csv('Melanoma of the Skin.csv', 'Melanoma of the Skin', df_combined)

In [114]:
df_combined = add_cancer_data_from_csv('Myeloma.csv', 'Myeloma', df_combined)

In [115]:
df_combined = add_cancer_data_from_csv('Oral Cavity and Pharynx.csv', 'Oral Cavity and Pharynx', df_combined)

In [116]:

df_combined = add_cancer_data_from_csv('Pancreas.csv', 'Pancreas Cancer', df_combined)

In [117]:

df_combined = add_cancer_data_from_csv('Prostate.csv', 'Prostate Cancer', df_combined)

In [118]:
df_combined = add_cancer_data_from_csv('Stomach.csv', 'Stomach Cancer', df_combined)

In [119]:

df_combined = add_cancer_data_from_csv('Testis.csv', 'Testis Cancer', df_combined)

In [120]:

df_combined = add_cancer_data_from_csv('Thyroid.csv', 'Thyroid Cancer', df_combined)

In [121]:
df_combined.to_csv('Cancer.csv', index=False)

In [122]:
pd.read_csv("Cancer.csv")


Unnamed: 0,Age at Diagnosis,"All Races (Rate per 100,000)",All Races (Number of Cases),"Asian / Pacific Islander (Rate per 100,000)",Asian / Pacific Islander (Number of Cases),"Non-Hispanic Black (Rate per 100,000)",Non-Hispanic Black (Number of Cases),"Hispanic (Rate per 100,000)",Hispanic (Number of Cases),"Non-Hispanic White (Rate per 100,000)",Non-Hispanic White (Number of Cases),Cancer Type
0,<1,2.9,65,*a,*a,*a,*a,2.4,27.0,3.6,23.0,Brain Cancer
1,1-4,3.7,344,3.1,45,4.2,24,3.3,155.0,4.2,111.0,Brain Cancer
2,5-9,2.8,343,2.4,42,2.4,18,2.6,163.0,3.4,115.0,Brain Cancer
3,10-14,2.4,315,1.7,29,2.4,19,2.1,144.0,3.2,114.0,Brain Cancer
4,15-19,1.7,219,1.8,33,*a,*a,1.3,89.0,2.1,80.0,Brain Cancer
...,...,...,...,...,...,...,...,...,...,...,...,...
413,65-69,30.8,1516,36.5,320,19.0,56,36.5,416,27.0,696,Thyroid Cancer
414,70-74,29.3,1154,31.5,211,26.8,59,33.4,271,26.7,589,Thyroid Cancer
415,75-79,24.1,653,24.9,111,16.4,25,28.0,154,22.5,348,Thyroid Cancer
416,80-84,17.1,326,19.6,64,*a,*a,22.3,86,14.9,160,Thyroid Cancer


In [123]:
list(df_combined["Cancer Type"].unique())

['Brain Cancer',
 'Bone Cancer',
 'Breast Cancer',
 'Cervical Cancer',
 'Colon Cancer',
 'Rectum Cancer',
 'Esophagus Cancer',
 'Gallbladder Cancer',
 'Hodgkin Lymphoma',
 'Kidney Cancer',
 'Larynx Cancer',
 'Leukemia',
 'Liver',
 'Lung and Bronchus Cancer',
 'Melanoma of the Skin',
 'Myeloma',
 'Oral Cavity and Pharynx',
 'Pancreas Cancer',
 'Prostate Cancer',
 'Stomach Cancer',
 'Testis Cancer',
 'Thyroid Cancer']

In [141]:
import pandas as pd

# Load the dataset
cancer_data = pd.read_csv('cancer.csv')

# Define the cancer lists
cancer_list = {
    "Bones": ["Shasta County", "Marin County", "Napa County", "Colusa County/ Glenn County/ Tehama County", "El Dorado County", "Madera County", "Sacramento County", "San Bernardino County", "Stanislaus County", "Fresno County"],
    "Brain": ["Shasta County", "Napa County", "Inyo County/ Mono County", "Lake County", "Sonoma County", "Nevada County", "Marin County", "San Benito County", "Colusa County/ Glenn County/ Tehama County", "Sutter County", "Siskiyou County/ Trinity County"],
    "Breast": ["Marin County", "San Luis Obispo County", "Santa Cruz County", "Placer County", "Santa Barbara County", "San Mateo County", "Shasta County", "Napa County", "Sonoma County", "Ventura County"],
    "Cervix": ["Tulare County", "Lake County", "Kings County", "Sierra County/ Yuba County", "Merced County", "Butte County", "Kern County", "San Bernardino County", "Shasta County", "Colusa County/ Glenn County/ Tehama County"],
    "Colon": ["Colusa County/ Glenn County/ Tehama County", "Butte County", "Shasta County", "San Bernardino County", "Sierra County/ Yuba County", "Stanislaus County", "Sacramento County", "Merced County", "San Joaquin County", "Contra Costa County"],
    "Rectum": ["Butte County", "Merced County", "Mendocino County", "San Benito County", "Colusa County/ Glenn County/ Tehama County", "Lake County", "Madera County", "Tulare County", "San Bernardino County", "Stanislaus County"],
    "Gallbladder": ["Fresno County", "Madera County", "San Mateo County", "Los Angeles County", "Orange County", "San Bernardino County", "Alameda County", "Monterey County", "Santa Cruz County", "Yolo County"],
    "Hodgkin": ["Mariposa County/ Tuolumne County", "Lake County", "Colusa County/ Glenn County/ Tehama County", "Placer County", "Sacramento County", "Solano County", "Del Norte County/ Humboldt County", "Lassen County/ Modoc County/ Plumas County", "El Dorado County", "Ventura County"],
    "Kidney": ["Imperial County", "Kern County", "Solano County", "Stanislaus County", "Fresno County", "Madera County", "San Benito County", "Napa County", "Colusa County/ Glenn County/ Tehama County", "Kings County"],
    "Larynx": ["Butte County", "El Dorado County", "Lake County", "Sierra County/ Yuba County", "Siskiyou County/ Trinity County", "Alpine County/ Amador County/ Calaveras County", "Imperial County", "Shasta County", "Stanislaus County", "Sacramento County"],
    "Leukemia": ["Shasta County", "Butte County", "Santa Barbara County", "Colusa County/ Glenn County/ Tehama County", "San Benito County", "Sutter County", "Lake County", "Sonoma County", "Lassen County/ Modoc County/ Plumas County", "Madera County"],
    "Liver": ["Imperial County", "San Francisco County", "Fresno County", "Kings County", "Sutter County", "Solano County", "San Joaquin County", "Merced County", "Madera County", "Sacramento County"],
    "Lung": ["Lake County", "Sierra County/ Yuba County", "Shasta County", "Colusa County/ Glenn County/ Tehama County", "Butte County", "Del Norte County/ Humboldt County", "Siskiyou County/ Trinity County", "Sutter County", "Solano County", "Alpine County/ Amador County/ Calaveras County"],
    "Melanoma": ["Marin County", "San Luis Obispo County", "Santa Cruz County", "Sonoma County", "Placer County", "El Dorado County", "Shasta County", "Nevada County", "Butte County", "Mariposa County/ Tuolumne County"],
    "Myeloma": ["Imperial County", "San Benito County", "Santa Barbara County", "Madera County", "Merced County", "Solano County", "Placer County", "Fresno County", "San Diego County", "Colusa County/ Glenn County/ Tehama County", "Alameda County"],
    "Oral": ["Shasta County", "Del Norte County/ Humboldt County", "Colusa County/ Glenn County/ Tehama County", "Lake County", "Butte County", "Lassen County/ Modoc County/ Plumas County", "El Dorado County", "Siskiyou County/ Trinity County", "Placer County", "Santa Barbara County"],
    "Pancreas": ["Shasta County", "Butte County", "Merced County", "Napa County", "Solano County", "Sacramento County", "Del Norte County/ Humboldt County", "Fresno County", "Stanislaus County", "El Dorado County"],
    "Prostate": ["Santa Barbara County", "San Luis Obispo County", "Santa Cruz County", "Placer County", "Butte County", "Ventura County", "San Benito County", "San Bernardino County", "Shasta County", "Monterey County"],
    "Stomach": ["Imperial County", "Los Angeles County", "San Francisco County", "Merced County", "Santa Clara County", "Fresno County", "San Bernardino County", "San Joaquin County", "Monterey County", "Alameda County"],
    "Testis": ["Mendocino County", "San Luis Obispo County", "San Benito County", "Marin County", "Imperial County", "Siskiyou County/ Trinity County", "Nevada County", "Santa Cruz County", "Napa County", "Lake County"],
    "Thyroid": ["Ventura County", "Placer County", "Santa Barbara County", "Kings County", "Orange County", "Kern County", "San Mateo County", "El Dorado County", "Stanislaus County", "San Bernardino County"]
}

def rank_cancers_by_location_age_ethnicity(location, age, ethnicity):
    # Determine cancers associated with the given location
    risk_of_cancer = [cancer_name for cancer_name, counties in cancer_list.items() if location in counties]
    
    if not risk_of_cancer:
        print(f"No cancers found associated with {location}.")
        return pd.DataFrame()  # Return an empty DataFrame if no cancers are associated
    
    # Define the age ranges in the dataset
    age_ranges = [
        ('<1', 0, 0), 
        ('1-4', 1, 4),
        ('5-9', 5, 9),
        ('10-14', 10, 14),
        ('15-19', 15, 19),
        ('20-24', 20, 24),
        ('25-29', 25, 29),
        ('30-34', 30, 34),
        ('35-39', 35, 39),
        ('40-44', 40, 44),
        ('45-49', 45, 49),
        ('50-54', 50, 54),
        ('55-59', 55, 59),
        ('60-64', 60, 64),
        ('65-69', 65, 69),
        ('70-74', 70, 74),
        ('75-79', 75, 79),
        ('80-84', 80, 84),
        ('85+', 85, float('inf'))
    ]
    
    # Find the appropriate age range based on the given age
    age_range = None
    for range_name, lower, upper in age_ranges:
        if lower <= age <= upper:
            age_range = range_name
            break
            
    if age_range is None:
        raise ValueError("Age is not within a valid range.")
    
    # Filter the data based on the determined age range and cancer types
    filtered_data = cancer_data[(cancer_data['Age at Diagnosis'] == age_range) &
                                (cancer_data['Cancer Type'].isin(risk_of_cancer))]
    
    # Map ethnicity to the appropriate column for rates
    ethnicity_column_map = {
        'All Races': 'All Races (Rate per 100,000)',
        'Asian': 'Asian / Pacific Islander (Rate per 100,000)',
        'Non-Hispanic Black': 'Non-Hispanic Black (Rate per 100,000)',
        'Hispanic': 'Hispanic (Rate per 100,000)',
        'Non-Hispanic White': 'Non-Hispanic White (Rate per 100,000)'
    }
    
    if ethnicity not in ethnicity_column_map:
        raise ValueError("Ethnicity not recognized. Please choose from: 'All Races', 'Asian', 'Non-Hispanic Black', 'Hispanic', 'Non-Hispanic White'.")
    
    # Select the column based on ethnicity
    rate_column = ethnicity_column_map[ethnicity]
    
    # Convert the rate column to numeric using .loc to avoid SettingWithCopyWarning
    filtered_data.loc[:, rate_column] = pd.to_numeric(filtered_data[rate_column], errors='coerce')
    
    # Sort the filtered data by the rate column in descending order
    ranked_cancers = filtered_data[['Cancer Type', rate_column]].sort_values(by=rate_column, ascending=False)
    
    if ranked_cancers.empty:
        print(f"No data available for cancers associated with {location} and specified parameters.")
        return ranked_cancers  # Return the empty DataFrame if no data available
    else:
        print(f"Ranked cancers for {location}, age {age}, and ethnicity {ethnicity}:")
        return ranked_cancers.reset_index(drop=True)

# Example usage
location = input("Enter Location: ")
age = int(input("Enter Age: "))
ethnicity = input("Enter Ethnicity: ")

ranked_cancers = rank_cancers_by_location_age_ethnicity(location, age, ethnicity)

if not ranked_cancers.empty:
    print(ranked_cancers)


Enter Location: Santa Barbara County
Enter Age: 56
Enter Ethnicity: Non-Hispanic White
Ranked cancers for Santa Barbara County, age 56, and ethnicity Non-Hispanic White:
  Cancer Type Non-Hispanic White (Rate per 100,000)
0    Leukemia                                  12.9
1     Myeloma                                   8.4


In [140]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
cancer_data = pd.read_csv('cancer.csv')

def visualize_and_save_cancer_data(data):
    # Identify the columns that contain cancer rates
    ethnicity_columns = [
        'All Races (Rate per 100,000)',
        'Asian / Pacific Islander (Rate per 100,000)',
        'Non-Hispanic Black (Rate per 100,000)',
        'Hispanic (Rate per 100,000)',
        'Non-Hispanic White (Rate per 100,000)'
    ]
    
    # Convert all rate columns to numeric, forcing errors to NaN
    for column in ethnicity_columns:
        data[column] = pd.to_numeric(data[column], errors='coerce')
    
    # Melt the dataframe for easier plotting with seaborn
    melted_data = pd.melt(
        data,
        id_vars=['Age at Diagnosis', 'Cancer Type'],
        value_vars=ethnicity_columns,
        var_name='Ethnicity',
        value_name='Rate'
    )
    
    # Replace the ethnicity column names for better display
    melted_data['Ethnicity'] = melted_data['Ethnicity'].replace({
        'All Races (Rate per 100,000)': 'All Races',
        'Asian / Pacific Islander (Rate per 100,000)': 'Asian',
        'Non-Hispanic Black (Rate per 100,000)': 'Non-Hispanic Black',
        'Hispanic (Rate per 100,000)': 'Hispanic',
        'Non-Hispanic White (Rate per 100,000)': 'Non-Hispanic White'
    })
    
    # Drop rows with NaN values in the 'Rate' column
    melted_data = melted_data.dropna(subset=['Rate'])
    
    # Define larger age bins
    age_bins = {
        '0-39': ['<1', '1-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39'],
        '40-59': ['40-44', '45-49', '50-54', '55-59'],
        '60-79': ['60-64', '65-69', '70-74', '75-79'],
        '80+': ['80-84', '85+']
    }
    
    # Map the smaller age ranges to the larger bins
    melted_data['Age Group'] = melted_data['Age at Diagnosis'].map(
        lambda x: next((key for key, ranges in age_bins.items() if x in ranges), 'Unknown')
    )
    
    # Filter out any 'Unknown' entries (if any age range does not fit)
    melted_data = melted_data[melted_data['Age Group'] != 'Unknown']
    
    # Calculate the overall average rate per cancer type for sorting the legend
    cancer_type_order = melted_data.groupby('Cancer Type')['Rate'].mean().sort_values(ascending=False).index.tolist()
    
    # Visualization 1: Heatmap for Cancer Type and Ethnicity
    plt.figure(figsize=(12, 8))
    heatmap_data = melted_data.pivot_table(
        index='Cancer Type', columns='Ethnicity', values='Rate', aggfunc='mean'
    )
    sns.heatmap(heatmap_data, cmap='YlGnBu', annot=True, fmt=".1f", linewidths=.5)
    plt.title('Average Cancer Rates per 100,000 by Cancer Type and Ethnicity')
    plt.xlabel('Ethnicity')
    plt.ylabel('Cancer Type')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig('cancer_ethnicity_heatmap.pdf')  # Save as PDF
    plt.close()

    # Visualization 2: Bar Plot for Age Group and Cancer Type, with increased bin width
    plt.figure(figsize=(14, 8))
    age_data = melted_data.groupby(['Age Group', 'Cancer Type'])['Rate'].mean().reset_index()

    # Plotting the bar chart with increased width
    sns.barplot(
        data=age_data,
        x='Age Group', y='Rate', hue='Cancer Type',
        hue_order=cancer_type_order,  # Order the legend based on average rates
        palette='tab20',  # Use the 'tab20' palette for better contrast
        width=1.0  # Increase the width of the bars
    )
    plt.title('Average Cancer Rates per 100,000 by Age Group and Cancer Type (Sorted Legend)')
    plt.xlabel('Age Group')
    plt.ylabel('Average Rate per 100,000')
    plt.xticks(rotation=45)
    plt.legend(title='Cancer Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig('cancer_age_barplot.pdf')  # Save as PDF
    plt.close()

# Call the function to visualize and save the data
visualize_and_save_cancer_data(cancer_data)
