In [254]:
import pandas as pd

## Load mosquito data

In [256]:
# Set the display option to show all columns
#pd.set_option('display.max_columns', None)
# Read the txt file into a pandas DataFrame
df = pd.read_csv('data/raw_data/West_Nile_Virus__WNV__Mosquito_Test_Results.csv')

# Display the first few rows of the DataFrame
df.head()




Unnamed: 0,SEASON YEAR,WEEK,TEST ID,BLOCK,TRAP,TRAP_TYPE,TEST DATE,NUMBER OF MOSQUITOES,RESULT,SPECIES,LATITUDE,LONGITUDE,LOCATION
0,2021,22,51815,100XX W OHARE,T909,GRAVID,6/3/21 00:06,19,negative,CULEX PIPIENS/RESTUANS,3.0,4.0,
1,2021,22,51816,100XX W OHARE,T909,GRAVID,6/3/21 00:06,5,negative,CULEX RESTUANS,,,
2,2021,23,51918,100XX W OHARE,T909,GRAVID,6/10/21 00:06,50,negative,CULEX PIPIENS/RESTUANS,,,
3,2021,33,52988,100XX W OHARE,T909,GRAVID,8/19/21 00:08,50,negative,CULEX PIPIENS/RESTUANS,,,
4,2022,23,53486,100XX W OHARE,T904,GRAVID,6/10/22 00:06,23,negative,CULEX PIPIENS/RESTUANS,,,


## identify missing data

In [258]:
missing_values_count = data.isnull().sum()
missing_values_count

SEASON YEAR                0
WEEK                       0
TEST ID                    0
BLOCK                      0
TRAP                       0
TRAP_TYPE                  0
TEST DATE                  0
NUMBER OF MOSQUITOES       0
RESULT                     0
SPECIES                    0
LATITUDE                5318
LONGITUDE               5318
LOCATION                5319
dtype: int64

## find the missing lat/long and group them by block

In [260]:
missing_lat_long_data = data[data['LATITUDE'].isnull() | data['LONGITUDE'].isnull()]

# Group by 'BLOCK', count the missing values and list the 'TEST DATE' for each
block_missing_summary = missing_lat_long_data.groupby('BLOCK').agg(
    Missing_Count=pd.NamedAgg(column='LATITUDE', aggfunc='size'), # Count of missing values
    Test_Dates=pd.NamedAgg(column='TEST DATE', aggfunc=lambda x: x.unique().tolist()) # Unique test dates
).reset_index()

# Display the summary
block_missing_summary

Unnamed: 0,BLOCK,Missing_Count,Test_Dates
0,100XX W OHARE,500,"[6/3/21 00:06, 6/10/21 00:06, 8/19/21 00:08, 6..."
1,100XX W OHARE AIRPORT,3426,"[6/16/17 00:06, 6/3/21 00:06, 6/8/18 00:06, 6/..."
2,115XX S AVENUE L,80,"[8/28/07 00:08, 8/1/07 02:08, 10/4/07 00:10, 8..."
3,20XX N DOMINICK ST,142,"[7/19/18 00:07, 7/3/19 00:07, 6/12/20 00:06, 8..."
4,30XX S HOYNE,24,"[9/18/07 00:09, 7/27/07 11:07, 8/21/07 00:08, ..."
5,43XX N ASHLAND,15,"[8/21/07 00:08, 9/18/07 00:09, 6/26/07 04:06, ..."
6,4XX W 127TH,512,"[6/16/17 00:06, 8/3/17 00:08, 6/12/20 00:06, 6..."
7,65XX N OAK PARK AVE,31,"[8/15/07 00:08, 10/4/07 00:10, 6/5/07 00:06, 6..."
8,79XX S CHICAGO,298,"[8/3/17 00:08, 6/10/22 00:06, 6/13/19 00:06, 6..."
9,81XX S ASHLAND,269,"[8/3/17 00:08, 6/19/17 00:06, 6/8/18 00:06, 7/..."


### This code was used to check to see if there were any lat/long listed for at least one of the block records. To check if this code works, I added fake data in two fields. 

In [262]:
# Extract the list of blocks with missing lat/long data
blocks_with_missing_values = block_missing_summary['BLOCK'].unique()

# Filter the original dataset for entries that are in the list of blocks with missing values
# but have valid latitude and longitude data
blocks_with_valid_lat_long = data[
    data['BLOCK'].isin(blocks_with_missing_values) &
    data['LATITUDE'].notnull() &
    data['LONGITUDE'].notnull()
]

# Group by 'BLOCK' and list the unique latitude and longitude values for these entries
block_valid_lat_long_summary = blocks_with_valid_lat_long.groupby('BLOCK').agg({
    'LATITUDE': lambda x: x.unique().tolist(),
    'LONGITUDE': lambda x: x.unique().tolist()
}).reset_index()

# The resulting DataFrame 'block_valid_lat_long_summary' will contain each block along with
# the associated valid latitude and longitude values that exist in the dataset.
block_valid_lat_long_summary


Unnamed: 0,BLOCK,LATITUDE,LONGITUDE
0,100XX W OHARE,[3.0],[4.0]


### Because I am able to get the lat and long for these blocks, I will keep them and dropp all the others.

In [264]:
# Define the blocks to keep even if they have missing lat/long
blocks_to_keep = ['100XX W OHARE', '100XX W OHARE AIRPORT', '4XX W 127TH']

# Filter the data to exclude rows with missing lat/long unless the block contains one of the specified blocks to keep
data_filtered = data[
    (~data['LATITUDE'].isnull() & ~data['LONGITUDE'].isnull()) |  # Keep rows with valid lat/long
    (data['BLOCK'].str.contains('|'.join(blocks_to_keep)))  # Or rows that contain the specified blocks
]

# The resulting DataFrame 'data_filtered' will have the rows with missing values dropped,
# except for the specified blocks.

Adding Lat and long for the 3 blocks and saving new dataset to the processed data folder

In [273]:
# Assuming your dataframe is named data_filtered
# Update the lat/long values for the specified locations
data_filtered.loc[data_filtered['BLOCK'] == '100XX W OHARE', ['LATITUDE', 'LONGITUDE']] = 41.978611, -87.904724
data_filtered.loc[data_filtered['BLOCK'] == '100XX W OHARE AIRPORT', ['LATITUDE', 'LONGITUDE']] = 41.978611, -87.904724
data_filtered.loc[data_filtered['BLOCK'] == '4XX W 127TH', ['LATITUDE', 'LONGITUDE']] = 41.66318849, -87.63267836

# Save the updated dataframe to a new CSV file in the same directory
data_filtered.to_csv('data/processed_data/wnv_cleaned.csv', index=False)

## Load weather data

In [277]:
import pandas as pd

# Set the display option to show all columns
#pd.set_option('display.max_columns', None)
# Read the txt file into a pandas DataFrame
wx = pd.read_csv('data/raw_data/ORD_weather.csv')

# Display the first few rows of the DataFrame
wx.head()

Unnamed: 0,STATION,NAME,DATE,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN
0,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",12/1/17,4.92,0.0,0.0,0.0,41,53,28
1,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",12/2/17,4.25,0.0,0.0,0.0,42,54,33
2,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",12/3/17,6.26,0.0,0.0,0.0,43,58,29
3,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",12/4/17,21.92,0.16,0.0,0.0,55,65,44
4,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",12/5/17,21.92,0.0,0.0,0.0,41,44,27


### identify missing data

There are no missing values so we can proceed with the next step. date ranges


In [279]:
#date range for the mosquito dataset
year_min_mos = df['SEASON YEAR'].min()
year_max_mos = df['SEASON YEAR'].max()

#date range for the weather datasetab
year_min_wx = wx['DATE'].min()
year_max_wx = wx['DATE'].max()

print(f"The range of the mosquito dataset is {year_min_mos} to {year_max_mos}")
print(f"The range of the mosquito dataset is {year_min_wx} to {year_max_wx}")



The range of the mosquito dataset is 2007 to 2023
The range of the mosquito dataset is 1/1/18 to 9/9/23


In [281]:
# Corrected file path
file_path = 'data/processed_data/wnv_cleaned.csv'

# Read the CSV file
mos_clean = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
mos_clean.head()


Unnamed: 0,SEASON YEAR,WEEK,TEST ID,BLOCK,TRAP,TRAP_TYPE,TEST DATE,NUMBER OF MOSQUITOES,RESULT,SPECIES,LATITUDE,LONGITUDE,LOCATION
0,2021,22,51815,100XX W OHARE,T909,GRAVID,6/3/21 00:06,19,negative,CULEX PIPIENS/RESTUANS,41.978611,-87.904724,
1,2021,22,51816,100XX W OHARE,T909,GRAVID,6/3/21 00:06,5,negative,CULEX RESTUANS,41.978611,-87.904724,
2,2021,23,51918,100XX W OHARE,T909,GRAVID,6/10/21 00:06,50,negative,CULEX PIPIENS/RESTUANS,41.978611,-87.904724,
3,2021,33,52988,100XX W OHARE,T909,GRAVID,8/19/21 00:08,50,negative,CULEX PIPIENS/RESTUANS,41.978611,-87.904724,
4,2022,23,53486,100XX W OHARE,T904,GRAVID,6/10/22 00:06,23,negative,CULEX PIPIENS/RESTUANS,41.978611,-87.904724,


In [283]:
# Filter the DataFrame to keep only the records where 'SEASON YEAR' is >= 2018
mos_trim = mos_clean[mos_clean['SEASON YEAR'] >= 2018]

# Find the minimum value of 'SEASON YEAR' in the filtered DataFrame
min_season_year = mos_trim['SEASON YEAR'].min()

min_season_year

mos_trim.to_csv('data/processed_data/wnv_trim.csv', index=False)