In [3]:
import pandas as pd
import os

# Folder that holds one CSV of weather observations per national park
folder_path = '/Users/himanshunimonkar/Downloads/STA_220/weather-data'

# Columns retained from each park file; everything else is discarded
columns_to_keep = ['Year', 'Month', 'Temp Avg', 'Dew Point Avg', 'Humidity Avg', 'Wind Speed Avg', 'Pressure Avg', 'Precipitation Total']


def load_park_frames(folder, columns):
    """Read every ``.csv`` file in *folder* into its own DataFrame.

    Each frame is trimmed to *columns*, gets 'Precipitation Total' converted
    to numeric, and gains a 'National Park' column holding the file name
    without its extension.

    Args:
        folder: Directory to scan (not recursive).
        columns: Column labels to keep; must include 'Precipitation Total'.

    Returns:
        A list of DataFrames, one per CSV file, ordered by file name. The
        list is empty when the folder contains no ``.csv`` files.

    Raises:
        KeyError: If a file lacks one of the requested columns.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the combined data reproducible across machines.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.csv'):
            continue

        # The park name is the file name with the ".csv" extension removed
        park_name = os.path.splitext(filename)[0]

        frame = pd.read_csv(os.path.join(folder, filename))[list(columns)]

        # Entries that cannot be parsed as numbers become NaN, which the
        # later monthly sum skips instead of failing on.
        frame['Precipitation Total'] = pd.to_numeric(frame['Precipitation Total'], errors='coerce')
        frame['National Park'] = park_name
        frames.append(frame)
    return frames


def combine_park_frames(frames, source=None):
    """Stack the per-park frames into one DataFrame with a fresh index.

    Args:
        frames: DataFrames as produced by ``load_park_frames``.
        source: Optional description of where the frames came from; only
            used to make the error message more helpful.

    Raises:
        ValueError: If *frames* is empty (no CSV files were found).
    """
    if not frames:
        # pd.concat would raise a bare "No objects to concatenate" here;
        # fail with the same exception type but say what actually went wrong.
        where = '' if source is None else ' in {!r}'.format(source)
        raise ValueError('No .csv weather files found{}'.format(where))
    return pd.concat(frames, ignore_index=True)


def aggregate_monthly(combined):
    """Collapse observations to one row per (Year, Month, National Park).

    'Precipitation Total' is summed over the month; every other measurement
    column is averaged.
    """
    return combined.groupby(['Year', 'Month', 'National Park']).agg(
        {
            'Temp Avg': 'mean',
            'Dew Point Avg': 'mean',
            'Humidity Avg': 'mean',
            'Wind Speed Avg': 'mean',
            'Pressure Avg': 'mean',
            'Precipitation Total': 'sum',  # Sum the total precipitation for the month
        }
    ).reset_index()


if __name__ == '__main__':
    # Load one dataframe per park file
    all_data = load_park_frames(folder_path, columns_to_keep)

    # Combine all the dataframes into one
    combined_df = combine_park_frames(all_data, source=folder_path)

    # Group by Year, Month, and National Park
    combined_monthly = aggregate_monthly(combined_df)

    # Sort by National Park, Year, and Month
    sorted_combined_monthly = combined_monthly.sort_values(by=['National Park', 'Year', 'Month'])

    # Save the result to a new CSV file
    sorted_combined_monthly.to_csv('combined_monthly_totals_sorted.csv', index=False)

    # Display the first few rows of the processed data
    print(sorted_combined_monthly.head())


    Year  Month    National Park   Temp Avg  Dew Point Avg  Humidity Avg  \
0   2015      1  Channel_Islands  55.367742      45.864516     73.909677   
9   2015      2  Channel_Islands  57.810714      49.464286     77.132143   
18  2015      3  Channel_Islands  61.190323      49.703226     70.480645   
27  2015      4  Channel_Islands  60.483333      46.133333     63.280000   
36  2015      5  Channel_Islands  59.293548      49.864516     72.274194   

    Wind Speed Avg  Pressure Avg  Precipitation Total  
0         2.767742     30.080645                 1.33  
9         4.146429     30.007143                 0.32  
18        4.629032     29.993548                 0.30  
27        6.313333     29.923333                 0.18  
36        6.303226     29.922581                 0.32  
