## Cleaning Lung Disease Data for County Analysis
In this notebook, we clean county-level cancer data for Illinois, New York, California, and Georgia so that it can be used for analysis by county.

In [1]:
import pandas as pd

In [2]:
# Raw per-county extracts, one CSV per state (IL, NY, CA, GA), read in that order.
raw_county_paths = (
    'data/il_uscs_county_map.csv',
    'data/ny_uscs_county_map.csv',
    'data/ca_uscs_county_map.csv',
    'data/ga_uscs_county_map.csv',
)
il_cancer_county, ny_cancer_county, ca_cancer_county, ga_cancer_county = map(
    pd.read_csv, raw_county_paths
)

In [3]:
# Drop the columns the county analysis does not use. The Illinois extract is
# the only one that also carries 'lci' and 'uci', so its list is longer.
# drop() returns a new frame; rebinding the name (rather than inplace=True)
# avoids mutating an object that another cell may already have displayed.
il_cancer_county = il_cancer_county.drop(columns=['lci', 'uci', 'Sex', 'Area', 'CancerType'])
ny_cancer_county = ny_cancer_county.drop(columns=['Area', 'Sex', 'CancerType'])
ca_cancer_county = ca_cancer_county.drop(columns=['Area', 'Sex', 'CancerType'])
ga_cancer_county = ga_cancer_county.drop(columns=['Area', 'Sex', 'CancerType'])

In [4]:
# Preview the Illinois rows. Every value, numbers included, is still a string
# wrapped in single quotes at this point.
il_cancer_county.head()

Unnamed: 0,County,Year,AgeAdjustedRate,CaseCount,Population
0,'Kane County','2012-2016','51.2','1298','2633345'
1,'DuPage County','2012-2016','51.8','2643','4657275'
2,'Champaign County','2012-2016','52.8','499','1034430'
3,'Woodford County','2012-2016','54.8','135','194822'
4,'Stephenson County','2012-2016','52.4','186','231025'


In [5]:
# Preview the New York rows; values are still quote-wrapped strings here.
ny_cancer_county.head()

Unnamed: 0,County,Year,AgeAdjustedRate,CaseCount,Population
0,'Queens County','2012-2016','43.6','5762','11622232'
1,'Bronx County','2012-2016','46.2','3194','7225933'
2,'Kings County','2012-2016','46.7','6208','13106099'
3,'Westchester County','2012-2016','48.4','2955','4858549'
4,'New York County','2012-2016','47.2','4514','8235025'


In [6]:
# Preview the California rows; values are still quote-wrapped strings here.
ca_cancer_county.head()

Unnamed: 0,County,Year,AgeAdjustedRate,CaseCount,Population
0,'Mono County','2012-2016','25.9','20','70372'
1,'Marin County','2012-2016','35.1','680','1299494'
2,'Monterey County','2012-2016','36.8','763','2154029'
3,'Los Angeles County','2012-2016','36.6','18307','50321257'
4,'Santa Cruz County','2012-2016','37.0','538','1357213'


In [7]:
# Preview the Georgia rows; values are still quote-wrapped strings here.
ga_cancer_county.head()

Unnamed: 0,County,Year,AgeAdjustedRate,CaseCount,Population
0,'Fayette County','2012-2016','45.5','311','546383'
1,'Calhoun County','2012-2016','47.2','18','32626'
2,'Hancock County','2012-2016','48.8','34','43857'
3,'DeKalb County','2012-2016','48.7','1615','3637680'
4,'Gwinnett County','2012-2016','49.0','1599','4368652'


In [8]:
def str_trim(s):
    """Return ``s`` without the pair of single quotes wrapping it.

    The raw values arrive wrapped in single quotes (e.g. ``'Kane County'``).
    Only a string that both starts and ends with ``'`` is trimmed. Anything
    else (an already-cleaned string, a non-string value) is returned
    unchanged, so re-running this cell does not chop real characters off
    values that were trimmed before.
    """
    if isinstance(s, str) and len(s) >= 2 and s[0] == "'" and s[-1] == "'":
        return s[1:-1]
    return s


# Apply the trim to every cell, column by column. Series.map is used instead of
# DataFrame.applymap because newer pandas releases deprecate applymap.
il_cancer_county = il_cancer_county.apply(lambda column: column.map(str_trim))
ny_cancer_county = ny_cancer_county.apply(lambda column: column.map(str_trim))
ca_cancer_county = ca_cancer_county.apply(lambda column: column.map(str_trim))
ga_cancer_county = ga_cancer_county.apply(lambda column: column.map(str_trim))

In [9]:
# Parse the three numeric columns in one pass, then shorten county names
# such as 'Kane County' to 'Kane'.
il_cancer_county = il_cancer_county.astype(
    {'AgeAdjustedRate': float, 'CaseCount': int, 'Population': int}
)
il_cancer_county['County'] = il_cancer_county['County'].apply(
    lambda county_name: county_name.replace(' County', '')
)

In [10]:
# Check the Illinois frame after conversion: quotes gone, numeric columns
# parsed, and the ' County' suffix removed from the names.
il_cancer_county.head()

Unnamed: 0,County,Year,AgeAdjustedRate,CaseCount,Population
0,Kane,2012-2016,51.2,1298,2633345
1,DuPage,2012-2016,51.8,2643,4657275
2,Champaign,2012-2016,52.8,499,1034430
3,Woodford,2012-2016,54.8,135,194822
4,Stephenson,2012-2016,52.4,186,231025


In [11]:
# Parse the three numeric columns in one pass, then shorten county names
# such as 'Queens County' to 'Queens'.
ny_cancer_county = ny_cancer_county.astype(
    {'AgeAdjustedRate': float, 'CaseCount': int, 'Population': int}
)
ny_cancer_county['County'] = ny_cancer_county['County'].apply(
    lambda county_name: county_name.replace(' County', '')
)

In [12]:
# Check the New York frame after conversion: quotes gone, numeric columns
# parsed, and the ' County' suffix removed from the names.
ny_cancer_county.head()

Unnamed: 0,County,Year,AgeAdjustedRate,CaseCount,Population
0,Queens,2012-2016,43.6,5762,11622232
1,Bronx,2012-2016,46.2,3194,7225933
2,Kings,2012-2016,46.7,6208,13106099
3,Westchester,2012-2016,48.4,2955,4858549
4,New York,2012-2016,47.2,4514,8235025


In [13]:
# Some California rows hold the literal 'Data Suppressed' in AgeAdjustedRate
# instead of a number; drop them first so the numeric casts below cannot fail
# on that text.
# The whole clean-up is one chain that produces a new frame. Assigning columns
# on the filtered .loc[...] result would be the chained-assignment pattern that
# triggers pandas' SettingWithCopyWarning. The surviving rows keep their
# original index labels.
ca_cancer_county = (
    ca_cancer_county
    .loc[ca_cancer_county.AgeAdjustedRate != 'Data Suppressed']
    .astype({'AgeAdjustedRate': float, 'CaseCount': int, 'Population': int})
    # Shorten e.g. 'Los Angeles County' to 'Los Angeles' (literal match, not a regex).
    .assign(County=lambda frame: frame.County.str.replace(' County', '', regex=False))
)

In [14]:
# Check the California frame after removing 'Data Suppressed' rows and
# converting the column types.
ca_cancer_county.head()

Unnamed: 0,County,Year,AgeAdjustedRate,CaseCount,Population
0,Mono,2012-2016,25.9,20,70372
1,Marin,2012-2016,35.1,680,1299494
2,Monterey,2012-2016,36.8,763,2154029
3,Los Angeles,2012-2016,36.6,18307,50321257
4,Santa Cruz,2012-2016,37.0,538,1357213


In [15]:
# Some Georgia rows hold the literal 'Data Suppressed' in AgeAdjustedRate
# instead of a number; drop them first so the numeric casts below cannot fail
# on that text.
# The whole clean-up is one chain that produces a new frame. Assigning columns
# on the filtered .loc[...] result would be the chained-assignment pattern that
# triggers pandas' SettingWithCopyWarning. The surviving rows keep their
# original index labels.
ga_cancer_county = (
    ga_cancer_county
    .loc[ga_cancer_county.AgeAdjustedRate != 'Data Suppressed']
    .astype({'AgeAdjustedRate': float, 'CaseCount': int, 'Population': int})
    # Shorten e.g. 'Fayette County' to 'Fayette' (literal match, not a regex).
    .assign(County=lambda frame: frame.County.str.replace(' County', '', regex=False))
)

In [16]:
# Report (rows, columns) for each cleaned frame, in IL, NY, CA, GA order.
for state_frame in (il_cancer_county, ny_cancer_county, ca_cancer_county, ga_cancer_county):
    print(state_frame.shape)

(102, 5)
(62, 5)
(56, 5)
(155, 5)


In [17]:
# Confirm the conversions took effect: the rate should be a float, the two
# counts integers, and County/Year plain strings (object dtype).
il_cancer_county.dtypes

County              object
Year                object
AgeAdjustedRate    float64
CaseCount            int64
Population           int64
dtype: object

In [23]:
# Write each cleaned frame to its own CSV under cleaned_data/.
# to_csv is called with its defaults, so the row index is written as well,
# as the first (unnamed) column of each file.
cleaned_outputs = (
    (il_cancer_county, 'cleaned_data/il_all_counties_cancer_rates.csv'),
    (ny_cancer_county, 'cleaned_data/ny_all_counties_cancer_rates.csv'),
    (ca_cancer_county, 'cleaned_data/ca_all_counties_cancer_rates.csv'),
    (ga_cancer_county, 'cleaned_data/ga_all_counties_cancer_rates.csv'),
)
for state_frame, csv_path in cleaned_outputs:
    state_frame.to_csv(csv_path)