# Moving raw data to processed data

# Data Collection and Processing
In this notebook, we: 
- collect a number of CSV and XLSX files
- clean, simplify, and reshape them
- merge them together by county

In [1]:
import numpy as np
import pandas as pd

## County Basics - All Counties in the US

In [2]:
# Base county table: county names, FIPS code, state, coordinates and population.
# NOTE(review): county_fips is parsed as an integer (e.g. 6037), so any leading
# zero is not preserved — confirm this matches the key used in later merges.
counties = pd.read_csv(filepath_or_buffer="data/raw/uscounties.csv")
counties.head(n=5)

Unnamed: 0,county,county_ascii,county_full,county_fips,state_id,state_name,lat,lng,population
0,Los Angeles,Los Angeles,Los Angeles County,6037,CA,California,34.3209,-118.2247,10040682
1,Cook,Cook,Cook County,17031,IL,Illinois,41.8401,-87.8168,5169517
2,Harris,Harris,Harris County,48201,TX,Texas,29.8578,-95.3936,4680609
3,Maricopa,Maricopa,Maricopa County,4013,AZ,Arizona,33.349,-112.4915,4412779
4,San Diego,San Diego,San Diego County,6073,CA,California,33.0343,-116.735,3323970


## County Census Data - US Census Bureau

In [3]:
# Census "age by educational attainment" workbook, read from its "Data" sheet.
# The workbook must exist at this path relative to the working directory.
# The sheet has no single clean header row, so pandas labels most columns
# "Unnamed: N"; they are renamed in a later cell.
age_education = pd.read_excel(
    io="data/raw/county_age_education_census.xlsx",
    sheet_name="Data",
)
age_education.head(n=11)

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/county_age_education_census.xlsx'

In [26]:
# Inspect the auto-generated column labels of the raw census sheet.
age_education.columns

Index(['Unnamed: 0', 'AGE BY EDUCATIONAL ATTAINMENT', 'Unnamed: 2',
       'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7',
       'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'],
      dtype='object')

In [28]:
# Map the auto-generated sheet labels to readable names for the columns we keep:
#   "Unnamed: 0"  -> row labels, including the geography names
#   "Unnamed: 2"  -> population 25 years and over
#   "Unnamed: 10" -> high school graduate or higher
#   "Unnamed: 11" -> bachelor's degree or higher
column_labels = {
    "Unnamed: 0": "county",
    "Unnamed: 2": "age_25+",
    "Unnamed: 10": "highschool_diploma",
    "Unnamed: 11": "college_degree",
}

# Rebind instead of renaming in place; age_education still ends up with the
# renamed columns for any later cell that reads it.
age_education = age_education.rename(columns=column_labels)

# Independent copy so later edits to `education` never touch age_education.
education = age_education[list(column_labels.values())].copy()

In [29]:
# Preview the selected columns; the first rows still hold the sheet's header
# and label rows rather than clean per-county records.
education.head(10)

Unnamed: 0,county,age_25+,highschool_diploma,college_degree
0,,Population 25 years and over,,
1,Label,,High school graduate or higher,Bachelor's degree or higher
2,"Baldwin County, Alabama",,,
3,Total,,,
4,Estimate,155691,140409,48823
5,Margin of Error,"±1,500","±3,572","±3,887"
6,Percent,,,
7,Estimate,(X),90.2%,31.4%
8,Margin of Error,(X),±1.9,±2.5
9,Male,,,


In [33]:
# In this instance, we are choosing to disregard the margin-of-error metric
# from the US Census Bureau. We believe this is the best education data we can find.
#
# In the raw sheet the geography name sits two rows above the "Estimate" row
# that carries its totals (name -> "Total" -> "Estimate"). Shifting the label
# column down by two rows puts each name on the same row as its numbers.
#
# The shift is computed from the untouched age_education["county"] column
# (renamed from "Unnamed: 0" in the earlier rename cell) rather than from
# education["county"] itself, so re-running this cell always yields the same
# result instead of moving the labels a further two rows each time.
education["county"] = age_education["county"].shift(periods=2)
education.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  education["county"] = education["county"].shift(2)


Unnamed: 0,county,age_25+,highschool_diploma,college_degree
0,,Population 25 years and over,,
1,,,High school graduate or higher,Bachelor's degree or higher
2,,,,
3,Label,,,
4,"Baldwin County, Alabama",155691,140409,48823
5,Total,"±1,500","±3,572","±3,887"
6,Estimate,,,
7,Margin of Error,(X),90.2%,31.4%
8,Percent,(X),±1.9,±2.5
9,Estimate,,,


In [44]:
# Drop the rows that have no label after the shift.
education_notnull = education.loc[education["county"].notna()]

# Show only the rows whose label contains "County"; reset_index() keeps the
# original sheet row number in an "index" column.
# NOTE(review): the raw sheet has 15924 rows and matches appear every 19 rows,
# which suggests roughly 838 geographies, yet this filter shows 792. Labels
# without the word "County" (e.g. parishes, boroughs, independent cities) are
# presumably excluded — confirm that is intended.
education_notnull.loc[
    lambda frame: frame["county"].str.contains("County")
].reset_index()

Unnamed: 0,index,county,age_25+,highschool_diploma,college_degree
0,4,"Baldwin County, Alabama",155691,140409,48823
1,23,"Calhoun County, Alabama",79172,67051,14251
2,42,"Cullman County, Alabama",58319,48091,7059
3,61,"DeKalb County, Alabama",48169,36488,7160
4,80,"Elmore County, Alabama",56793,49303,14341
...,...,...,...,...,...
788,15622,"Waukesha County, Wisconsin",284512,273262,129770
789,15641,"Winnebago County, Wisconsin",115949,106795,29177
790,15660,"Wood County, Wisconsin",52200,48328,11791
791,15679,"Laramie County, Wyoming",68017,63986,18013


In [56]:
# Dimensions (rows, columns) of the raw census sheet as loaded.
age_education.shape

(15924, 12)