In [None]:
# Mount Google Drive at /content/drive so files stored there (the dataset CSV read below)
# are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Import Data & Rename Columns

In [None]:
# Source dataset (Makeover Monday 2018, week 40), originally downloaded from
# https://data.world/makeovermonday/2018w40-five-year-cancer-survival-rates-in-america/workspace/file?filename=Five+Year+Cancer+Survival+Rates+in+USA.csv

# Load the copy of the CSV kept on the mounted Google Drive.
cancer_csv_path = '/content/drive/MyDrive/CodingDojo/Data/Five Year Cancer Survival Rates in USA.csv'
cancer_df = pd.read_csv(filepath_or_buffer=cancer_csv_path)

# Preview the first five rows.
cancer_df.head(n=5)

Unnamed: 0,Survival Rate,Year,Race,Gender,Cancer Type
0,0.559,1977,All races,females,All cancers
1,0.55,1980,All races,females,All cancers
2,0.551,1983,All races,females,All cancers
3,0.576,1986,All races,females,All cancers
4,0.596,1989,All races,females,All cancers


In [None]:
# Replace the column names that contain spaces with space-free identifiers.
# The renamed frame is reassigned instead of using inplace=True, so the cell produces its
# result explicitly and stays safe to re-run (rename ignores keys that are already gone).
cancer_df = cancer_df.rename(columns={'Survival Rate': 'SurvivalRate', 'Cancer Type': 'CancerType'})

# head(0) returns zero rows, so only the (renamed) column headers are displayed.
cancer_df.head(0)

Unnamed: 0,SurvivalRate,Year,Race,Gender,CancerType


# Preliminary Analysis

In [None]:
# Summarise each column's dtype and non-null count.
# SurvivalRate is the only column with missing values (1666 of 1887 rows are non-null, i.e. 221 missing);
# all other columns have complete values.
cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1887 entries, 0 to 1886
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SurvivalRate  1666 non-null   float64
 1   Year          1887 non-null   int64  
 2   Race          1887 non-null   object 
 3   Gender        1887 non-null   object 
 4   CancerType    1887 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 73.8+ KB


In [None]:
# Mean SurvivalRate for each Race category (missing rates are skipped by mean()).
# NOTE(review): rows with Gender == 'total' are included alongside the per-gender rows — confirm that is intended.
cancer_df[['Race', 'SurvivalRate']].groupby('Race').mean()

Unnamed: 0_level_0,SurvivalRate
Race,Unnamed: 1_level_1
All races,0.467435
Black,0.389642
White,0.454746


In [None]:
# Mean SurvivalRate for each Gender category (missing rates are skipped by mean()).
# NOTE(review): rows with Race == 'All races' are included alongside the per-race rows — confirm that is intended.
cancer_df[['Gender', 'SurvivalRate']].groupby('Gender').mean()

Unnamed: 0_level_0,SurvivalRate
Gender,Unnamed: 1_level_1
females,0.438612
males,0.411025
total,0.455907


In [None]:
# Mean SurvivalRate for each Year present in the data (missing rates are skipped by mean()).
# NOTE(review): the 'All races' and 'total' gender rows are included in these means — confirm that is intended.
cancer_df[['Year', 'SurvivalRate']].groupby('Year').mean()

Unnamed: 0_level_0,SurvivalRate
Year,Unnamed: 1_level_1
1963,0.27125
1973,0.318594
1977,0.381043
1980,0.390072
1983,0.402014
1986,0.415842
1989,0.434057
1992,0.447079
1995,0.460607
1998,0.470908


# Visualization

## Race

In [None]:
# Boolean row masks flagging the 'All races' and 'total' gender categories;
# the plotting cells negate them to drop those rows.
allraces_filter = cancer_df['Race'].eq('All races')
totalgender_filter = cancer_df['Gender'].eq('total')

In [None]:
# Keep only the rows for individual races and individual genders:
# - ~allraces_filter drops the 'All races' rows to improve readability, so the plot compares
#   only White and Black people's survival rates.
# - ~totalgender_filter drops the 'total' gender rows, which (per the original analysis note)
#   average males and females and would otherwise repeat the same data subjects.
race_plot_df = cancer_df.loc[~allraces_filter & ~totalgender_filter, :]

# One box per (Year, Race). The title and readable axis label let the figure stand alone
# when the notebook is skimmed.
fig = px.box(
    race_plot_df,
    x='Year',
    y='SurvivalRate',
    color='Race',
    title='Five-Year Cancer Survival Rate by Year and Race',
    labels={'SurvivalRate': 'Five-year survival rate'},
)

fig.show()

Analysis: Black people's survival rate seems to be consistently lower than White people's, possibly caused by racial disparity in access to healthcare. 

Before visualizing the data, I hypothesized that the differences between the races' survival rates would decrease over time as survival rates improved in general. While survival rates have indeed improved over the years, it should be eye-opening that the racial differences in survival rate have only increased.

White people, when compared to Black people in this data set, never have a lower bottom quartile value, and always have a higher upper quartile and median.

## Gender

In [None]:
# Keep only the rows for individual races and individual genders:
# - ~allraces_filter drops the 'All races' rows so they are not plotted alongside the per-race
#   rows (presumably they aggregate the same subjects — confirm against the data source).
# - ~totalgender_filter drops the 'total' gender rows, which (per the original analysis note)
#   average males and females, so that only females and males are compared.
gender_plot_df = cancer_df.loc[~allraces_filter & ~totalgender_filter, :]

# One box per (Year, Gender). The title and readable axis label let the figure stand alone
# when the notebook is skimmed.
fig = px.box(
    gender_plot_df,
    x='Year',
    y='SurvivalRate',
    color='Gender',
    title='Five-Year Cancer Survival Rate by Year and Gender',
    labels={'SurvivalRate': 'Five-year survival rate'},
)

fig.show()

Analysis: Women are known to have a higher life expectancy than men, so it is not surprising that in most years (2006 being the exception) women had a higher median survival rate than men.

It is surprising to me that females' survival rate, although it has increased over the years, has not risen as steeply as males' survival rate has.