In [None]:
import os

import numpy as np
import pandas as pd

def filter_wisconsin_rows(input_csv, output_csv, state_name='Wisconsin', chunk_size=10000):
    """
    Stream a large CSV in chunks and write only the rows of one state to a new CSV.

    Parameters:
    - input_csv (str): Path to the source CSV; it must contain a 'State Name' column.
    - output_csv (str): Path of the CSV to create. An existing file at this path is
      replaced, so re-running the cell does not append duplicate rows or headers.
    - state_name (str): Value of 'State Name' to keep. Defaults to 'Wisconsin'.
    - chunk_size (int): Number of rows read per chunk. Defaults to 10000.

    Raises:
    - ValueError: If input_csv and output_csv refer to the same file.

    If no row matches, no output file is written.
    """
    if os.path.abspath(input_csv) == os.path.abspath(output_csv):
        raise ValueError('input_csv and output_csv must be different files')

    # Start from a clean slate: the chunks below are appended, so a file left over
    # from a previous run would otherwise end up with repeated headers and rows.
    if os.path.exists(output_csv):
        os.remove(output_csv)

    first_chunk = True  # the header is written only with the first non-empty chunk
    for chunk in pd.read_csv(input_csv, chunksize=chunk_size):
        filtered_chunk = chunk[chunk['State Name'] == state_name]
        if filtered_chunk.empty:
            continue
        filtered_chunk.to_csv(output_csv, mode='a', header=first_chunk, index=False)
        first_chunk = False
            

# Usage
# Extract the Wisconsin rows of the hourly 88101 file into 'filtered_wisconsin.csv'.
# NOTE(review): presumably the PM2.5 data, given the 'pm25' export name used in a
# later cell — confirm.
input_file = 'hourly_88101_2020.csv'
output_file = 'filtered_wisconsin.csv'
filter_wisconsin_rows(input_file, output_file)

In [None]:
# Load the Wisconsin-only extract, skipping malformed lines instead of raising.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv('filtered_wisconsin.csv', on_bad_lines='skip')
# Keep only the columns of interest.
# NOTE(review): the original list named 'County Code' twice, which produced a
# duplicated column; if another column was intended in its place, add it here.
df = df[[
    'State Code', 'County Code', 'Parameter Code',
    'Latitude', 'Longitude',
    'Date Local', 'Time Local', 'Date GMT', 'Time GMT',
    'Sample Measurement', 'County Name',
]]

In [None]:
# Write the trimmed table, replacing any previous export. The original used
# mode='a', which appended a second header and a duplicate copy of every row each
# time the cell was re-run.
# NOTE(review): the file name has no '.csv' extension, unlike the other exports in
# this notebook — confirm whether that is intentional.
df.to_csv('cleaned_wi_2020_pm25', index=False)

In [None]:
# Restrict the table to Milwaukee County.
milwaukee_df = df.loc[df['County Name'] == 'Milwaukee']
# Take the first distinct latitude/longitude pair that appears in the county.
unique_lat_lon = milwaukee_df[['Latitude', 'Longitude']].drop_duplicates().iloc[0]
# Keep only the rows recorded at exactly that latitude/longitude pair, so
# filtered_df holds Milwaukee County rows for a single location.
filtered_df = milwaukee_df.loc[
    milwaukee_df['Latitude'].eq(unique_lat_lon['Latitude'])
    & milwaukee_df['Longitude'].eq(unique_lat_lon['Longitude'])
]

# Use this function to split a CSV wherever the `Date Local` format changes

In [None]:
def split_csv_on_date_change(input_csv):
    """
    Split a CSV into several CSV files, starting a new file each time the
    'Date Local' value switches between containing a '-' and not containing one.

    Each output file is written to the current working directory and named
    'split_<first>_<last>.csv', where <first> and <last> are the zero-based
    positions of the first and last rows of that run in the input.

    Parameters:
    - input_csv (str): The path to the input CSV file; it must have a
      'Date Local' column.

    Missing 'Date Local' values are treated as not containing '-'. An input with
    no data rows produces no output files.
    """
    df = pd.read_csv(input_csv)
    if df.empty:
        return

    # Classify every row in one pass instead of iterating with iterrows().
    # Casting to str keeps a missing value (NaN) from raising on the substring test.
    has_dash = df['Date Local'].astype(str).str.contains('-', regex=False)
    flags = has_dash.to_numpy(dtype=bool)

    # A run starts at row 0 and at every position whose flag differs from the
    # previous row's flag.
    change_points = ((flags[1:] != flags[:-1]).nonzero()[0] + 1).tolist()
    run_starts = [0] + change_points
    run_ends = change_points + [len(df)]  # exclusive end of each run

    for start, end in zip(run_starts, run_ends):
        output_file = f'split_{start}_{end - 1}.csv'
        df.iloc[start:end].to_csv(output_file, index=False)

# Usage
# Split the PM10 extract wherever the 'Date Local' format changes.
# NOTE(review): no cell visible above this one writes
# 'filtered_wisconsin_1_pm10.csv'; it must already exist on disk — confirm where
# it comes from.
input_csv = 'filtered_wisconsin_1_pm10.csv'
split_csv_on_date_change(input_csv)

In [None]:
# Extract the Wisconsin rows of the hourly 81102 file.
# NOTE(review): the output is named 'cleaned_MKE_...' although
# filter_wisconsin_rows keeps every Wisconsin row, and the next cell reads
# 'filtered_wisconsin_1_pm10.csv' rather than this file — confirm the intended
# output name.
input_file = 'hourly_81102_2020.csv'
output_file = 'cleaned_MKE_2020_pm10.csv'
filter_wisconsin_rows(input_file, output_file)

In [None]:
# Load the PM10 extract, skipping malformed lines instead of raising.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv('filtered_wisconsin_1_pm10.csv', on_bad_lines='skip')
# Keep only the columns of interest.
# NOTE(review): the original list named 'County Code' twice, which produced a
# duplicated column; if another column was intended in its place, add it here.
df = df[[
    'State Code', 'County Code', 'Parameter Code',
    'Latitude', 'Longitude',
    'Date Local', 'Time Local', 'Date GMT', 'Time GMT',
    'Parameter Name', 'Sample Measurement', 'County Name',
]]

# Use these functions to find the row nearest to a given latitude/longitude (haversine distance)

In [None]:
def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6367):
    """
    Great-circle distance between two points on a sphere (haversine formula).

    Parameters:
    - lat1, lon1: Latitude and longitude of the first point(s), in degrees.
    - lat2, lon2: Latitude and longitude of the second point(s), in degrees.
      Scalars and array-likes (NumPy arrays, pandas Series) are accepted and
      combined with the usual NumPy broadcasting.
    - radius_km (float): Radius of the sphere in kilometres. Defaults to 6367,
      the value this function has always used.

    Returns:
    - The distance in kilometres (a scalar for scalar inputs, otherwise an
      array-like of the broadcast shape).
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt/arcsin into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

def find_nearest(df, lat, lon):
    """
    Return the row of `df` whose ('Latitude', 'Longitude') is closest to a point.

    Parameters:
    - df (pandas.DataFrame): Table with 'Latitude' and 'Longitude' columns in degrees.
    - lat, lon (float): Coordinates of the reference point, in degrees.

    Returns:
    - pandas.DataFrame: The row(s) carrying the index label of the smallest
      haversine distance (one row when the index is unique), or an empty frame
      with the same columns when `df` has no rows.
    """
    if df.empty:
        return df.iloc[0:0]
    # haversine_distance is built from NumPy ufuncs, so the whole columns can be
    # passed in one call instead of evaluating it row by row with df.apply.
    distances = haversine_distance(lat, lon, df['Latitude'], df['Longitude'])
    min_distance_index = distances.idxmin()
    return df.loc[[min_distance_index]]

# Example: check the latitude of the nearest SO2 row

In [None]:
# Extract the Wisconsin rows of the hourly 42401 file into
# 'filtered_wisconsin_1_so2.csv', which the next cell reads back.
# NOTE(review): presumably the SO2 data, per the output file name — confirm.
input_file = 'hourly_42401_2020.csv'
output_file = 'filtered_wisconsin_1_so2.csv'
filter_wisconsin_rows(input_file, output_file)

In [None]:
# Load the SO2 extract, skipping malformed lines instead of raising.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv('filtered_wisconsin_1_so2.csv', on_bad_lines='skip')
# Keep only the columns of interest.
# NOTE(review): the original list named 'County Code' twice, which produced a
# duplicated column; if another column was intended in its place, add it here.
df = df[[
    'State Code', 'County Code', 'Parameter Code',
    'Latitude', 'Longitude',
    'Date Local', 'Time Local', 'Date GMT', 'Time GMT',
    'Parameter Name', 'Sample Measurement', 'County Name',
]]

In [None]:
# Look up the row of `df` closest to the target coordinates and display its latitude.
# NOTE(review): `input_latitude` and `input_longitude` are not assigned in any
# cell visible here; they must be defined before this cell runs.
nearest_df = find_nearest(df, input_latitude, input_longitude)
nearest_df['Latitude']

In [None]:
# Latitude of the rows to keep.
# NOTE(review): presumably the value shown by `nearest_df['Latitude']` — confirm.
target_latitude = 43.060975
# Match with a small tolerance instead of `==`: a float parsed by read_csv's
# default converter is not guaranteed to be bit-identical to the Python literal,
# so an exact comparison can silently select nothing.
df_matched = df[(df['Latitude'] - target_latitude).abs() < 1e-6]
# index=False, as in the other exports of this notebook, keeps the row index
# out of the file.
df_matched.to_csv('cleaned_MKE_2020_so2.csv', index=False)