In [1]:
import pandas as pd

def read_csv_files(variables):
    """Read multiple CSV files and return a dictionary of dataframes"""
    dfs = {}
    for var in variables:
        df = pd.read_csv(f'processed_data/{var}.csv')
        # Round coordinates to ensure exact matches
        df['latitude'] = df['latitude'].round(2)
        df['longitude'] = df['longitude'].round(2)
        dfs[var] = df
    return dfs

def merge_dataframes(dfs, merge_cols):
    """Merge multiple dataframes on specified columns"""
    combined_df = None
    for df in dfs.values():
        if combined_df is None:
            combined_df = df
        else:
            # Use inner join to only keep rows where time/lat/lon match exactly
            combined_df = pd.merge(combined_df, df, 
                                 on=merge_cols,
                                 how='inner',
                                 validate='1:1')
    return combined_df.reset_index(drop=True)

In [2]:
# List of variables to process  
variables = ['thetao', 'so', 'uo', 'vo', 'wo', 'kd', 'ph', 'spco2', 'o2', 'no3', 'po4', 'si', 'fe', 'chl']

# Read all CSV files
dfs = read_csv_files(variables)

# Merge all dataframes ensuring exact matches on time/lat/lon
merge_cols = ['time', 'latitude', 'longitude'] 
combined_df = merge_dataframes(dfs, merge_cols)

# Sort by time, latitude, longitude
combined_df = combined_df.sort_values(['time', 'latitude', 'longitude'])

In [3]:
combined_df

Unnamed: 0,time,latitude,longitude,thetao,so,uo,vo,wo,kd,ph,spco2,o2,no3,po4,si,fe,chl
98,2023-11-01,20.00,87.00,28.125040,26.587210,-0.082749,-0.107938,4.573316e-07,0.068661,8.116567,30.903740,213.01695,2.469345,0.005517,1.738706,0.001210,0.345622
80,2023-11-01,20.00,87.25,28.172344,27.636711,-0.167275,-0.112871,-7.577646e-07,0.061908,8.107141,32.123050,211.49420,1.688633,0.015755,1.622466,0.000954,0.290206
114,2023-11-01,20.00,87.50,28.253498,28.553354,-0.183687,-0.107585,1.977740e-06,0.057834,8.094933,33.270065,209.78447,1.299212,0.027425,1.681856,0.000874,0.276692
25,2023-11-01,20.00,87.75,28.354998,31.075155,-0.148304,0.002403,-2.325415e-06,0.048688,8.075578,34.821860,207.91861,0.991888,0.038570,1.743745,0.000818,0.233765
61,2023-11-01,20.00,88.00,28.431435,31.768059,-0.125736,0.073998,-4.150868e-07,0.047435,8.060791,36.053833,206.99332,0.328877,0.042215,1.698190,0.000720,0.232472
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1549,2024-11-01,21.75,91.00,26.728876,2.862218,0.021892,-0.007349,-8.396950e-08,0.391286,8.515196,14.302748,323.76422,83.318730,0.009243,57.331910,0.007397,7.266008
1610,2024-11-01,22.00,90.50,26.683010,4.505866,0.024913,0.008431,6.270751e-07,0.303172,8.402604,11.703587,312.79343,41.128704,0.008304,21.115942,0.008846,5.100220
1576,2024-11-01,22.00,90.75,26.686163,3.455010,0.028493,-0.008834,-2.889241e-07,0.388468,8.462801,12.565397,342.90585,61.300835,0.009588,37.602215,0.008885,7.239828
1670,2024-11-01,22.25,91.00,27.002928,0.795333,0.013149,-0.009046,-2.373275e-07,0.397539,8.459275,24.983982,420.97906,94.054550,0.298521,39.225770,0.010660,7.854579


In [4]:
# Display first few rows and basic information
print("Combined DataFrame Shape:", combined_df.shape)
print("\nDataFrame Info:")
display(combined_df.info())

Combined DataFrame Shape: (1677, 17)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 1677 entries, 98 to 1662
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   time       1677 non-null   object 
 1   latitude   1677 non-null   float64
 2   longitude  1677 non-null   float64
 3   thetao     1677 non-null   float64
 4   so         1677 non-null   float64
 5   uo         1677 non-null   float64
 6   vo         1677 non-null   float64
 7   wo         1677 non-null   float64
 8   kd         1677 non-null   float64
 9   ph         1677 non-null   float64
 10  spco2      1677 non-null   float64
 11  o2         1677 non-null   float64
 12  no3        1677 non-null   float64
 13  po4        1677 non-null   float64
 14  si         1677 non-null   float64
 15  fe         1677 non-null   float64
 16  chl        1677 non-null   float64
dtypes: float64(16), object(1)
memory usage: 235.8+ KB


None

In [5]:
# Save the combined dataframe to CSV
combined_df.to_csv('combined_data.csv', index=False)

In [6]:
# Import numpy
import numpy as np

# Function to remove outliers from all features
def remove_outliers(data):
    clean_data = data.copy()
    
    # Get all numeric columns except time, latitude, longitude
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    numeric_cols = [col for col in numeric_cols if col not in ['time', 'latitude', 'longitude']]
    
    for column in numeric_cols:
        # Calculate Q1, Q3 and IQR
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        
        # Create mask for non-outlier values
        mask = (data[column] >= Q1 - 1.5 * IQR) & (data[column] <= Q3 + 1.5 * IQR)
        
        # Update clean_data to keep only non-outlier values
        clean_data = clean_data[mask]
    
    # Reset index after removing outliers
    clean_data = clean_data.reset_index(drop=True)
    
    return clean_data

# Remove outliers from the combined dataframe
data = remove_outliers(combined_df)

print(f"Shape after removing outliers: {data.shape}")

Shape after removing outliers: (919, 17)


  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]


In [7]:
data

Unnamed: 0,time,latitude,longitude,thetao,so,uo,vo,wo,kd,ph,spco2,o2,no3,po4,si,fe,chl
0,2023-11-01,20.0,87.00,28.125040,26.587210,-0.082749,-0.107938,4.573316e-07,0.068661,8.116567,30.903740,213.01695,2.469345,0.005517,1.738706,0.001210,0.345622
1,2023-11-01,20.0,87.25,28.172344,27.636711,-0.167275,-0.112871,-7.577646e-07,0.061908,8.107141,32.123050,211.49420,1.688633,0.015755,1.622466,0.000954,0.290206
2,2023-11-01,20.0,88.00,28.431435,31.768059,-0.125736,0.073998,-4.150868e-07,0.047435,8.060791,36.053833,206.99332,0.328877,0.042215,1.698190,0.000720,0.232472
3,2023-11-01,20.0,88.25,28.467304,32.048350,-0.085959,0.088375,-2.903354e-07,0.047017,8.054842,36.541910,206.21194,0.090459,0.046380,1.801227,0.000730,0.208067
4,2023-11-01,20.0,88.50,28.484333,32.265830,-0.025816,0.105643,-1.168651e-07,0.048022,8.053015,36.686733,205.87086,0.051006,0.045998,1.916590,0.000748,0.213012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,2024-11-01,21.5,87.50,27.286037,14.570991,0.020126,-0.002796,4.891955e-07,0.078083,8.199978,16.991192,228.93271,7.028217,0.002554,2.184181,0.005639,0.546488
915,2024-11-01,21.5,88.75,28.427752,24.647715,0.001146,-0.027511,8.167145e-07,0.058208,8.107636,26.236935,214.49101,2.639683,0.001582,2.883434,0.004193,0.326283
916,2024-11-01,21.5,89.00,28.465527,24.899548,0.030324,-0.034723,6.482846e-07,0.056803,8.120710,27.024380,214.12971,2.945366,0.002107,3.069671,0.003823,0.315234
917,2024-11-01,21.5,89.25,27.610617,22.019636,-0.000207,-0.012457,-9.810797e-08,0.059467,8.148086,23.841318,215.81914,4.101088,0.002853,3.241976,0.003865,0.333051


In [8]:
# Display first few rows and basic information
print("Cleaned DataFrame Shape:", data.shape)
print("\nDataFrame Info:")
display(data.info())

Cleaned DataFrame Shape: (919, 17)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 919 entries, 0 to 918
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   time       919 non-null    object 
 1   latitude   919 non-null    float64
 2   longitude  919 non-null    float64
 3   thetao     919 non-null    float64
 4   so         919 non-null    float64
 5   uo         919 non-null    float64
 6   vo         919 non-null    float64
 7   wo         919 non-null    float64
 8   kd         919 non-null    float64
 9   ph         919 non-null    float64
 10  spco2      919 non-null    float64
 11  o2         919 non-null    float64
 12  no3        919 non-null    float64
 13  po4        919 non-null    float64
 14  si         919 non-null    float64
 15  fe         919 non-null    float64
 16  chl        919 non-null    float64
dtypes: float64(16), object(1)
memory usage: 122.2+ KB


None

In [9]:
# Save the cleaned data to a new CSV file
data.to_csv('cleaned_data.csv', index=False)

In [10]:
# Add new imports at the top of your notebook
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def create_all_timeseries(data):
    """Create time series plots for all parameters"""
    # Get numeric columns except lat/lon
    numeric_cols = data.select_dtypes(include=['float64']).columns
    parameters = [col for col in numeric_cols if col not in ['latitude', 'longitude']]
    
    # Create subplots, 3 columns and enough rows to fit all parameters
    n_cols = 2
    n_rows = (len(parameters) + n_cols - 1) // n_cols
    
    fig = make_subplots(rows=n_rows, cols=n_cols, 
                       subplot_titles=parameters)
    
    # Add each parameter's time series
    for idx, param in enumerate(parameters):
        row = idx // n_cols + 1
        col = idx % n_cols + 1
        
        fig.add_trace(
            go.Scatter(
                x=data['time'], 
                y=data[param], 
                name=param,
                mode='markers',  # Changed from default 'lines' to 'markers'
                marker=dict(
                    size=4,      # Adjust point size as needed
                    opacity=0.6  # Add some transparency to handle overlapping
                )
            ),
            row=row, col=col
        )
        
    # Update layout
    fig.update_layout(height=300*n_rows, 
                     showlegend=False,
                     title_text="Time Series of All Parameters")
    fig.show()

# Generate both visualizations
print("Time Series Plots:")
create_all_timeseries(data)

Time Series Plots:


In [11]:
# Add these imports at the top of your notebook
import folium
from ipywidgets import interact, widgets

def create_interactive_map(data):
    """Create an interactive map with parameter and time selection"""
    # Process data to get monthly averages
    data['time'] = pd.to_datetime(data['time'])
    data['month'] = data['time'].dt.to_period('M')
    monthly_data = data.groupby(['month', 'latitude', 'longitude']).mean().reset_index()
    monthly_data['month'] = monthly_data['month'].astype(str)
    
    def update_map(parameter, selected_month):
        # Create base map with scrollWheelZoom disabled
        center_lat = data['latitude'].mean()
        center_lon = data['longitude'].mean()
        m = folium.Map(location=[center_lat, center_lon], zoom_start=8, scrollWheelZoom=False)  # Disable scroll wheel zoom
        
        # Filter data for selected month
        month_data = monthly_data[monthly_data['month'] == selected_month]
        
        # Create colormap
        colormap = folium.LinearColormap(
            colors=['blue', 'yellow', 'red'],
            vmin=month_data[parameter].min(),
            vmax=month_data[parameter].max(),
            caption=f'{parameter} values'
        )
        
        # Add points with color based on parameter value
        for idx, row in month_data.iterrows():
            folium.CircleMarker(
                location=[row['latitude'], row['longitude']],
                radius=8,
                color=None,
                fill=True,
                fill_color=colormap(row[parameter]),
                fill_opacity=0.7,
                popup=f"{parameter}: {row[parameter]:.2f}"
            ).add_to(m)
        
        # Add colormap to map
        colormap.add_to(m)
        
        # Display map
        display(m)
    
    # Get available months for dropdown
    available_months = sorted(monthly_data['month'].unique())
    
    # Create dropdown options
    numeric_cols = data.select_dtypes(include=['float64']).columns
    parameters = [col for col in numeric_cols if col not in ['latitude', 'longitude']]
    
    # Create interactive widgets
    interact(
        update_map,
        parameter=widgets.Dropdown(
            options=parameters,
            description='Parameter:',
            style={'description_width': 'initial'}
        ),
        selected_month=widgets.Dropdown(
            options=available_months,
            description='Month:',
            style={'description_width': 'initial'}
        )
    )

print("\nInteractive Map:")
create_interactive_map(data)


Interactive Map:


interactive(children=(Dropdown(description='Parameter:', options=('thetao', 'so', 'uo', 'vo', 'wo', 'kd', 'ph'…

In [12]:
# # Add these imports
# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
# import time
# import os

# def save_map_screenshots(data):
#     """Save screenshots of maps for all parameters and months"""
#     # Process data to get monthly averages (same as before)
#     data['time'] = pd.to_datetime(data['time'])
#     data['month'] = data['time'].dt.to_period('M')
#     monthly_data = data.groupby(['month', 'latitude', 'longitude']).mean().reset_index()
#     monthly_data['month'] = monthly_data['month'].astype(str)
    
#     # Create screenshots directory if it doesn't exist
#     os.makedirs('map_screenshots', exist_ok=True)
    
#     # Setup Chrome options for headless browsing
#     chrome_options = Options()
#     chrome_options.add_argument("--headless")  # Run in headless mode
#     chrome_options.add_argument("--window-size=1920,1080")
    
#     # Get parameters and months
#     numeric_cols = data.select_dtypes(include=['float64']).columns
#     parameters = [col for col in numeric_cols if col not in ['latitude', 'longitude']]
#     available_months = sorted(monthly_data['month'].unique())
    
#     # Initialize webdriver
#     driver = webdriver.Chrome(options=chrome_options)
    
#     try:
#         for parameter in parameters:
#             for month in available_months:
#                 # Create map
#                 center_lat = data['latitude'].mean()
#                 center_lon = data['longitude'].mean()
#                 m = folium.Map(location=[center_lat, center_lon], zoom_start=8)
                
#                 # Filter data for selected month
#                 month_data = monthly_data[monthly_data['month'] == month]
                
#                 # Create colormap
#                 colormap = folium.LinearColormap(
#                     colors=['blue', 'yellow', 'red'],
#                     vmin=month_data[parameter].min(),
#                     vmax=month_data[parameter].max(),
#                     caption=f'{parameter} values'
#                 )
                
#                 # Add points
#                 for idx, row in month_data.iterrows():
#                     folium.CircleMarker(
#                         location=[row['latitude'], row['longitude']],
#                         radius=8,
#                         color=None,
#                         fill=True,
#                         fill_color=colormap(row[parameter]),
#                         fill_opacity=0.7,
#                         popup=f"{parameter}: {row[parameter]:.2f}"
#                     ).add_to(m)
                
#                 colormap.add_to(m)
                
#                 # Save map to temporary HTML file
#                 temp_html = f'temp_map.html'
#                 m.save(temp_html)
                
#                 # Load the HTML in selenium
#                 driver.get(f'file://{os.path.abspath(temp_html)}')
#                 time.sleep(2)  # Wait for map to load
                
#                 # Save screenshot
#                 filename = f'map_screenshots/{parameter}_{month}.png'
#                 driver.save_screenshot(filename)
#                 print(f"Saved {filename}")
                
#                 # Clean up temporary file
#                 os.remove(temp_html)
    
#     finally:
#         driver.quit()

# # Run the function to save all screenshots
# save_map_screenshots(data)

Saved map_screenshots/thetao_2023-11.png
Saved map_screenshots/thetao_2023-12.png
Saved map_screenshots/thetao_2024-01.png
Saved map_screenshots/thetao_2024-02.png
Saved map_screenshots/thetao_2024-03.png
Saved map_screenshots/thetao_2024-04.png
Saved map_screenshots/thetao_2024-05.png
Saved map_screenshots/thetao_2024-06.png
Saved map_screenshots/thetao_2024-07.png
Saved map_screenshots/thetao_2024-08.png
Saved map_screenshots/thetao_2024-09.png
Saved map_screenshots/thetao_2024-10.png
Saved map_screenshots/thetao_2024-11.png
Saved map_screenshots/so_2023-11.png
Saved map_screenshots/so_2023-12.png
Saved map_screenshots/so_2024-01.png
Saved map_screenshots/so_2024-02.png
Saved map_screenshots/so_2024-03.png
Saved map_screenshots/so_2024-04.png
Saved map_screenshots/so_2024-05.png
Saved map_screenshots/so_2024-06.png
Saved map_screenshots/so_2024-07.png
Saved map_screenshots/so_2024-08.png
Saved map_screenshots/so_2024-09.png
Saved map_screenshots/so_2024-10.png
Saved map_screenshots/s