# <span style="font-size:1.5em;">Creating Trim Datasets</span>
Author: Angela Kim

In [1]:
# Import libraries
import pandas as pd
import numpy as np

## Overview

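In this notebook, I trim down the merged O3, CO, NO2, and SO2 datasets: I remove rows from outside the US states and DC, remove pollutant standards that do not produce AQI values, drop redundant columns, and rename the remaining columns.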

In [2]:
# Read the four merged pollutant CSV files (O3, CO, NO2, SO2) from the
# working directory, in that order, into their "original" DataFrames.
O3original, COoriginal, NO2original, SO2original = map(
    pd.read_csv,
    ('O3.csv', 'CO.csv', 'NO2.csv', 'SO2.csv'),
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# Function to trim all four datasets

# 'State Name' values that are not US states or DC
_NON_US_LOCATIONS = ['Country Of Mexico', 'Virgin Islands', 'Canada', 'Puerto Rico']

# Pollutant standards that do not produce AQI values, keyed by pollutant
_STANDARDS_WITHOUT_AQI = {'CO': 'CO 1-hour 1971', 'SO2': 'SO2 3-hour 1971'}

# Columns that are redundant or unnecessary in the trimmed output
_UNNEEDED_COLUMNS = ['State Code', 'County Code', 'Site Num', 'Parameter Code', 'POC', 'Latitude', 'Longitude',
                     'Datum', 'Parameter Name', 'Sample Duration', 'Pollutant Standard', 'Units of Measure',
                     'Event Type', 'Observation Count', 'Observation Percent', 'Method Code', 'Method Name',
                     'Local Site Name', 'Address', 'CBSA Name', 'Date of Last Change']

# Final column order, using the input (pre-rename) column names
_OUTPUT_ORDER = ['Date Local', 'State Name', 'County Name', 'City Name',
                 'Arithmetic Mean', '1st Max Value', '1st Max Hour', 'AQI']


def trim_dataset(df, pollutant):
    """
    Trims down pollution datasets.

    The input DataFrame is NOT modified; a new, trimmed DataFrame is returned.
    This keeps the cell that calls this function safe to re-run without
    reloading the CSV files first.

    Parameters:
    ----------
    df: DataFrame
        Pollution data containing at least the 'State Name', 'Date Local',
        and 'Pollutant Standard' columns plus every column listed in
        _UNNEEDED_COLUMNS.
    pollutant: str, name of pollutant
        Used as the prefix of the renamed AQI column ('<pollutant> AQI').
        For 'CO' and 'SO2', rows measured under the standard that has no
        AQI value are removed; other pollutants get no standard filtering.

    Returns:
    ----------
    DataFrame
        Columns, in order: 'Date', 'State', 'County', 'City', 'Mean',
        '1st Max Value', '1st Max Hour', '<pollutant> AQI'.

    Raises:
    ----------
    KeyError
        If 'State Name', 'Date Local', a column to be dropped, or (for CO/SO2)
        'Pollutant Standard' is missing from df.
    """

    # Keep only rows for US states or DC
    keep_rows = ~df['State Name'].isin(_NON_US_LOCATIONS)

    # Drop pollutant standards that do not produce AQI values
    standard_without_aqi = _STANDARDS_WITHOUT_AQI.get(pollutant)
    if standard_without_aqi is not None:
        keep_rows = keep_rows & (df['Pollutant Standard'] != standard_without_aqi)

    # Boolean indexing and drop() both return new objects, so the caller's
    # DataFrame keeps all of its rows and columns.
    trimmed = df.loc[keep_rows].drop(columns=_UNNEEDED_COLUMNS)

    # Convert dates to datetime format (assign returns a new frame as well)
    trimmed = trimmed.assign(**{'Date Local': pd.to_datetime(trimmed['Date Local'])})

    # Reorder columns for neatness
    trimmed = trimmed.reindex(columns=_OUTPUT_ORDER)

    # Rename columns for neatness
    return trimmed.rename(columns={'Date Local': 'Date',
                                   'Arithmetic Mean': 'Mean',
                                   'AQI': '{} AQI'.format(pollutant),
                                   'State Name': 'State',
                                   'County Name': 'County',
                                   'City Name': 'City'})

In [4]:
# Trim each loaded dataset, tagging its AQI column with the pollutant name.
O3, CO, NO2, SO2 = (
    trim_dataset(frame, label)
    for frame, label in (
        (O3original, 'O3'),
        (COoriginal, 'CO'),
        (NO2original, 'NO2'),
        (SO2original, 'SO2'),
    )
)

In [5]:
# Export as csv
# O3.to_csv('O3trim.csv', index=False)
# CO.to_csv('COtrim.csv', index=False)
# NO2.to_csv('NO2trim.csv', index=False)
# SO2.to_csv('SO2trim.csv', index=False)

## Pollution All

In [None]:
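# NOTE: This merge is disabled. The frames returned by trim_dataset do not
# contain an 'Address' column, and their value columns are named 'Mean',
# '1st Max Value', and '1st Max Hour' without a pollutant prefix, so the join
# keys and the reordered column names below must be updated before this
# block is re-enabled.
# O3df = O3.copy()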
# COdf = CO.copy()
# SO2df = SO2.copy()
# NO2df = NO2.copy()

# df1 = pd.merge(O3df, COdf, how='inner', 
#                left_on=['Date', 'Address', 'State', 'County', 'City'], 
#                right_on=['Date', 'Address', 'State', 'County', 'City'])
# df2 = pd.merge(df1, SO2df, how='inner', 
#                left_on=['Date', 'Address', 'State', 'County', 'City'], 
#                right_on=['Date', 'Address', 'State', 'County', 'City'])
# df3 = pd.merge(df2, NO2df, how='inner', 
#                left_on=['Date', 'Address', 'State', 'County', 'City'], 
#                right_on=['Date', 'Address', 'State', 'County', 'City'])
# df_final = df3.copy()
# df_final['Date'] = pd.to_datetime(df_final['Date'])
# df_final['Day'] = df_final['Date'].dt.day
# df_final['Month'] = df_final['Date'].dt.month
# df_final['Year'] = df_final['Date'].dt.year
# reordered = ['Date', 'Year', 'Month', 'Day', 'Address', 'State', 'County', 'City', 
#              'O3 Mean', 'O3 1st Max Value', 'O3 1st Max Hour', 'O3 AQI', 
#              'CO Mean', 'CO 1st Max Value', 'CO 1st Max Hour', 'CO AQI', 
#              'SO2 Mean', 'SO2 1st Max Value', 'SO2 1st Max Hour', 'SO2 AQI', 
#              'NO2 Mean', 'NO2 1st Max Value', 'NO2 1st Max Hour', 'NO2 AQI']
# df_final = df_final.reindex(columns=reordered)
# df_final.to_csv('pollution_2000_2021.csv', index=False)