In [1]:
import pandas as pd
import numpy as np

def read_csv_files(variables):
    """Read multiple CSV files and return a dictionary of dataframes"""
    dfs = {}
    for var in variables:
        df = pd.read_csv(f'processed_data/{var}.csv')
        # Round coordinates to ensure exact matches
        df['latitude'] = df['latitude'].round(2)
        df['longitude'] = df['longitude'].round(2)
        dfs[var] = df
    return dfs

def merge_dataframes(dfs, merge_cols):
    """Merge multiple dataframes on specified columns"""
    combined_df = None
    for df in dfs.values():
        if combined_df is None:
            combined_df = df
        else:
            # Use inner join to only keep rows where time/lat/lon match exactly
            combined_df = pd.merge(combined_df, df, 
                                 on=merge_cols,
                                 how='inner',
                                 validate='1:1')
    return combined_df.reset_index(drop=True)

In [2]:
# List of variables to process  
variables = ['thetao', 'so', 'wo', 'ph', 'spco2', 'o2', 'no3', 'po4', 'si', 'fe', 'chl']

# Read all CSV files
dfs = read_csv_files(variables)

# Merge all dataframes ensuring exact matches on time/lat/lon
merge_cols = ['time', 'latitude', 'longitude'] 
combined_df = merge_dataframes(dfs, merge_cols)

# Sort by time, latitude, longitude
combined_df = combined_df.sort_values(['time', 'latitude', 'longitude'])

In [3]:
combined_df

Unnamed: 0,time,longitude,latitude,thetao,so,wo,ph,spco2,o2,no3,po4,si,fe,chl
31,2025-01-01,87.26,19.97,24.463710,30.846056,2.472016e-08,8.081079,33.983177,214.18227,0.028434,0.035814,1.484352,0.001691,0.823264
37,2025-01-01,87.70,19.97,25.259663,32.383970,4.294948e-08,8.060937,35.911453,208.17493,0.033679,0.057536,1.690931,0.001672,1.242999
52,2025-01-01,88.14,19.98,25.749617,33.093174,5.065795e-08,8.048275,37.390057,208.50151,0.070526,0.071205,1.864909,0.001671,1.806067
46,2025-01-01,88.58,19.99,25.936096,33.355045,4.507891e-08,8.043919,38.872276,209.56064,0.031080,0.073519,1.766548,0.001691,2.038893
26,2025-01-01,89.02,19.99,26.054684,33.496220,4.102340e-08,8.042045,38.505657,210.43271,0.028767,0.070911,1.812253,0.001720,2.051734
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3194,2029-12-01,89.42,21.60,26.440117,32.955300,4.825921e-08,8.040186,39.385223,204.36882,0.036979,0.007698,1.272989,0.002298,0.430887
3188,2029-12-01,89.86,21.61,26.751585,32.814823,5.387772e-08,8.036771,39.618610,203.75328,0.068134,0.005435,1.293711,0.002333,0.409029
3199,2029-12-01,90.30,21.62,26.684328,32.612840,5.198349e-08,8.039713,39.213287,203.62840,0.234469,0.002679,1.342976,0.002462,0.416281
3206,2029-12-01,90.74,21.64,27.294374,32.486294,3.779215e-08,8.031207,40.054028,202.53156,0.364145,0.002586,1.372194,0.002528,0.378560


In [4]:
# Display first few rows and basic information
print("Combined DataFrame Shape:", combined_df.shape)
print("\nDataFrame Info:")
display(combined_df.info())

Combined DataFrame Shape: (3240, 14)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 3240 entries, 31 to 3205
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   time       3240 non-null   object 
 1   longitude  3240 non-null   float64
 2   latitude   3240 non-null   float64
 3   thetao     3240 non-null   float64
 4   so         3240 non-null   float64
 5   wo         3240 non-null   float64
 6   ph         3240 non-null   float64
 7   spco2      3240 non-null   float64
 8   o2         3240 non-null   float64
 9   no3        3240 non-null   float64
 10  po4        3240 non-null   float64
 11  si         3240 non-null   float64
 12  fe         3240 non-null   float64
 13  chl        3240 non-null   float64
dtypes: float64(13), object(1)
memory usage: 379.7+ KB


None

In [5]:
# Function to find nearest point and get value
def get_nearest_point_value(target_lat, target_lon, df, timestamp):
    # Filter df for the specific timestamp
    time_df = df[df['time'] == timestamp]
    
    if time_df.empty:
        return np.nan
        
    # Calculate distances to all points
    distances = np.sqrt(
        (time_df['latitude'] - target_lat)**2 + 
        (time_df['longitude'] - target_lon)**2
    )
    
    # Get index of minimum distance
    nearest_idx = distances.idxmin()
    
    # Return the value from nearest point
    return time_df.loc[nearest_idx]

# Read uo, vo from processed folder and kd from imported folder
uo_df = pd.read_csv('/Users/arup/Documents/ISRO-Project/prediction/processed_data/uo.csv')
vo_df = pd.read_csv('/Users/arup/Documents/ISRO-Project/prediction/processed_data/vo.csv')
kd_df = pd.read_csv('/Users/arup/Documents/ISRO-Project/prediction/imported_data/kd.csv')

# Convert time columns to datetime
uo_df['time'] = pd.to_datetime(uo_df['time'])
vo_df['time'] = pd.to_datetime(vo_df['time'])
kd_df['time'] = pd.to_datetime(kd_df['time'])

# Create new columns for the nearest point values
combined_df['uo'] = np.nan
combined_df['vo'] = np.nan
combined_df['kd'] = np.nan

# Iterate through combined_df rows
for idx, row in combined_df.iterrows():
    # Get nearest uo value
    uo_val = get_nearest_point_value(
        row['latitude'], 
        row['longitude'],
        uo_df,
        row['time']
    )
    if not pd.isna(uo_val).all():
        combined_df.at[idx, 'uo'] = uo_val['uo']
        
    # Get nearest vo value  
    vo_val = get_nearest_point_value(
        row['latitude'],
        row['longitude'], 
        vo_df,
        row['time']
    )
    if not pd.isna(vo_val).all():
        combined_df.at[idx, 'vo'] = vo_val['vo']
        
    # Get nearest kd value
    kd_val = get_nearest_point_value(
        row['latitude'],
        row['longitude'],
        kd_df, 
        row['time']
    )
    if not pd.isna(kd_val).all():
        combined_df.at[idx, 'kd'] = kd_val['kd']

# Drop any rows where we couldn't find matches
combined_df = combined_df.dropna(subset=['uo', 'vo', 'kd'])

In [6]:
# Reorder columns according to specified order
variables = ['thetao', 'so', 'uo', 'vo', 'wo', 'kd', 'ph', 'spco2', 'o2', 'no3', 'po4', 'si', 'fe', 'chl']
combined_df = combined_df[['time', 'longitude', 'latitude'] + variables]

combined_df

Unnamed: 0,time,longitude,latitude,thetao,so,uo,vo,wo,kd,ph,spco2,o2,no3,po4,si,fe,chl
31,2025-01-01,87.26,19.97,24.463710,30.846056,-0.032801,-0.041873,2.472016e-08,0.061784,8.081079,33.983177,214.18227,0.028434,0.035814,1.484352,0.001691,0.823264
37,2025-01-01,87.70,19.97,25.259663,32.383970,-0.032801,-0.066421,4.294948e-08,0.061416,8.060937,35.911453,208.17493,0.033679,0.057536,1.690931,0.001672,1.242999
52,2025-01-01,88.14,19.98,25.749617,33.093174,0.002028,-0.064781,5.065795e-08,0.050328,8.048275,37.390057,208.50151,0.070526,0.071205,1.864909,0.001671,1.806067
46,2025-01-01,88.58,19.99,25.936096,33.355045,-0.086984,-0.010332,4.507891e-08,0.049946,8.043919,38.872276,209.56064,0.031080,0.073519,1.766548,0.001691,2.038893
26,2025-01-01,89.02,19.99,26.054684,33.496220,-0.086984,0.012987,4.102340e-08,0.048111,8.042045,38.505657,210.43271,0.028767,0.070911,1.812253,0.001720,2.051734
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3194,2029-12-01,89.42,21.60,26.440117,32.955300,-0.046968,0.002839,4.825921e-08,0.069539,8.040186,39.385223,204.36882,0.036979,0.007698,1.272989,0.002298,0.430887
3188,2029-12-01,89.86,21.61,26.751585,32.814823,-0.048761,0.007282,5.387772e-08,0.072206,8.036771,39.618610,203.75328,0.068134,0.005435,1.293711,0.002333,0.409029
3199,2029-12-01,90.30,21.62,26.684328,32.612840,-0.052639,0.012583,5.198349e-08,0.129823,8.039713,39.213287,203.62840,0.234469,0.002679,1.342976,0.002462,0.416281
3206,2029-12-01,90.74,21.64,27.294374,32.486294,-0.030491,0.005861,3.779215e-08,0.299895,8.031207,40.054028,202.53156,0.364145,0.002586,1.372194,0.002528,0.378560


In [7]:
# Display first few rows and basic information
print("Combined DataFrame Shape:", combined_df.shape)
print("\nDataFrame Info:")
display(combined_df.info())

Combined DataFrame Shape: (3240, 17)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 3240 entries, 31 to 3205
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   time       3240 non-null   object 
 1   longitude  3240 non-null   float64
 2   latitude   3240 non-null   float64
 3   thetao     3240 non-null   float64
 4   so         3240 non-null   float64
 5   uo         3240 non-null   float64
 6   vo         3240 non-null   float64
 7   wo         3240 non-null   float64
 8   kd         3240 non-null   float64
 9   ph         3240 non-null   float64
 10  spco2      3240 non-null   float64
 11  o2         3240 non-null   float64
 12  no3        3240 non-null   float64
 13  po4        3240 non-null   float64
 14  si         3240 non-null   float64
 15  fe         3240 non-null   float64
 16  chl        3240 non-null   float64
dtypes: float64(16), object(1)
memory usage: 584.7+ KB


None

In [8]:
# Save the combined dataframe to CSV
combined_df.to_csv('combined_data.csv', index=False)

In [9]:
# Import numpy
import numpy as np

# Function to remove outliers from all features
def remove_outliers(data):
    clean_data = data.copy()
    
    # Get all numeric columns except time, latitude, longitude
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    numeric_cols = [col for col in numeric_cols if col not in ['time', 'latitude', 'longitude']]
    
    for column in numeric_cols:
        # Calculate Q1, Q3 and IQR
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        
        # Create mask for non-outlier values
        mask = (data[column] >= Q1 - 1.5 * IQR) & (data[column] <= Q3 + 1.5 * IQR)
        
        # Update clean_data to keep only non-outlier values
        clean_data = clean_data[mask]
    
    # Reset index after removing outliers
    clean_data = clean_data.reset_index(drop=True)
    
    return clean_data

# Remove outliers from the combined dataframe
data = remove_outliers(combined_df)

print(f"Shape after removing outliers: {data.shape}")

Shape after removing outliers: (1708, 17)


  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]
  clean_data = clean_data[mask]


In [10]:
data

Unnamed: 0,time,longitude,latitude,thetao,so,uo,vo,wo,kd,ph,spco2,o2,no3,po4,si,fe,chl
0,2025-01-01,87.26,19.97,24.463710,30.846056,-0.032801,-0.041873,2.472016e-08,0.061784,8.081079,33.983177,214.18227,0.028434,0.035814,1.484352,0.001691,0.823264
1,2025-01-01,87.70,19.97,25.259663,32.383970,-0.032801,-0.066421,4.294948e-08,0.061416,8.060937,35.911453,208.17493,0.033679,0.057536,1.690931,0.001672,1.242999
2,2025-01-01,88.58,19.99,25.936096,33.355045,-0.086984,-0.010332,4.507891e-08,0.049946,8.043919,38.872276,209.56064,0.031080,0.073519,1.766548,0.001691,2.038893
3,2025-01-01,89.02,19.99,26.054684,33.496220,-0.086984,0.012987,4.102340e-08,0.048111,8.042045,38.505657,210.43271,0.028767,0.070911,1.812253,0.001720,2.051734
4,2025-01-01,89.90,20.01,26.124374,33.565876,-0.100962,0.016858,3.595334e-08,0.048510,8.044349,38.366405,210.07358,0.057479,0.060628,1.828091,0.001752,1.549865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1703,2029-12-01,90.75,21.32,27.775320,32.848130,-0.039491,0.024635,5.268714e-08,0.085374,8.019429,41.528930,203.16068,0.057365,0.019050,1.295856,0.002203,0.433064
1704,2029-12-01,87.67,21.57,26.230286,27.032500,0.033251,-0.014300,-1.124159e-07,0.113466,8.079239,30.076532,206.14552,0.029071,0.029042,1.317138,0.001594,0.359576
1705,2029-12-01,88.55,21.58,26.366900,30.978502,0.013266,-0.053600,-8.973735e-08,0.085398,8.053456,36.479973,205.01813,0.030604,0.018437,1.235428,0.001945,0.426599
1706,2029-12-01,88.98,21.59,26.139822,32.877464,-0.032583,-0.008526,4.555405e-08,0.069958,8.045031,38.763214,204.97984,0.036267,0.007088,1.255701,0.002267,0.437492


In [11]:
# Display first few rows and basic information
print("Cleaned DataFrame Shape:", data.shape)
print("\nDataFrame Info:")
display(data.info())

Cleaned DataFrame Shape: (1708, 17)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1708 entries, 0 to 1707
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   time       1708 non-null   object 
 1   longitude  1708 non-null   float64
 2   latitude   1708 non-null   float64
 3   thetao     1708 non-null   float64
 4   so         1708 non-null   float64
 5   uo         1708 non-null   float64
 6   vo         1708 non-null   float64
 7   wo         1708 non-null   float64
 8   kd         1708 non-null   float64
 9   ph         1708 non-null   float64
 10  spco2      1708 non-null   float64
 11  o2         1708 non-null   float64
 12  no3        1708 non-null   float64
 13  po4        1708 non-null   float64
 14  si         1708 non-null   float64
 15  fe         1708 non-null   float64
 16  chl        1708 non-null   float64
dtypes: float64(16), object(1)
memory usage: 227.0+ KB


None

In [12]:
# Save the cleaned data to a new CSV file
data.to_csv('cleaned_data.csv', index=False)

In [13]:
# Add new imports at the top of your notebook
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def create_all_timeseries(data):
    """Create time series plots for all parameters"""
    # Get numeric columns except lat/lon
    numeric_cols = data.select_dtypes(include=['float64']).columns
    parameters = [col for col in numeric_cols if col not in ['latitude', 'longitude']]
    
    # Create subplots with one column
    n_rows = len(parameters)
    
    fig = make_subplots(rows=n_rows, cols=1,
                       subplot_titles=parameters)
    
    # Add each parameter's time series
    for idx, param in enumerate(parameters):
        row = idx + 1
        
        fig.add_trace(
            go.Scatter(
                x=data['time'],
                y=data[param],
                name=param,
                mode='markers',
                marker=dict(
                    size=4,
                    opacity=0.6
                )
            ),
            row=row, col=1
        )
        
    # Update layout
    fig.update_layout(height=300*n_rows,
                     showlegend=False, 
                     title_text="Time Series of All Parameters")
    fig.show()

# Generate visualization
print("Time Series Plots:")
create_all_timeseries(data)

Time Series Plots:


In [14]:
# Add these imports at the top of your notebook
import folium
from ipywidgets import interact, widgets

def create_interactive_map(data):
    """Create an interactive map with parameter and time selection"""
    # Process data to get monthly averages
    data['time'] = pd.to_datetime(data['time'])
    data['month'] = data['time'].dt.to_period('M')
    monthly_data = data.groupby(['month', 'latitude', 'longitude']).mean().reset_index()
    monthly_data['month'] = monthly_data['month'].astype(str)
    
    def update_map(parameter, selected_month):
        # Create base map with scrollWheelZoom disabled
        center_lat = data['latitude'].mean()
        center_lon = data['longitude'].mean()
        m = folium.Map(location=[center_lat, center_lon], zoom_start=8, scrollWheelZoom=False)  # Disable scroll wheel zoom
        
        # Filter data for selected month
        month_data = monthly_data[monthly_data['month'] == selected_month]
        
        # Create colormap
        colormap = folium.LinearColormap(
            colors=['blue', 'yellow', 'red'],
            vmin=month_data[parameter].min(),
            vmax=month_data[parameter].max(),
            caption=f'{parameter} values'
        )
        
        # Add points with color based on parameter value
        for idx, row in month_data.iterrows():
            folium.CircleMarker(
                location=[row['latitude'], row['longitude']],
                radius=8,
                color=None,
                fill=True,
                fill_color=colormap(row[parameter]),
                fill_opacity=0.7,
                popup=f"{parameter}: {row[parameter]:.2f}"
            ).add_to(m)
        
        # Add colormap to map
        colormap.add_to(m)
        
        # Display map
        display(m)
    
    # Get available months for dropdown
    available_months = sorted(monthly_data['month'].unique())
    
    # Create dropdown options
    numeric_cols = data.select_dtypes(include=['float64']).columns
    parameters = [col for col in numeric_cols if col not in ['latitude', 'longitude']]
    
    # Create interactive widgets
    interact(
        update_map,
        parameter=widgets.Dropdown(
            options=parameters,
            description='Parameter:',
            style={'description_width': 'initial'}
        ),
        selected_month=widgets.Dropdown(
            options=available_months,
            description='Month:',
            style={'description_width': 'initial'}
        )
    )

print("\nInteractive Map:")
create_interactive_map(data)


Interactive Map:


interactive(children=(Dropdown(description='Parameter:', options=('thetao', 'so', 'uo', 'vo', 'wo', 'kd', 'ph'…

In [15]:
# # Add these imports
# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
# import time
# import os

# def save_map_screenshots(data):
#     """Save screenshots of maps for all parameters and months"""
#     # Process data to get monthly averages (same as before)
#     data['time'] = pd.to_datetime(data['time'])
#     data['month'] = data['time'].dt.to_period('M')
#     monthly_data = data.groupby(['month', 'latitude', 'longitude']).mean().reset_index()
#     monthly_data['month'] = monthly_data['month'].astype(str)
    
#     # Create screenshots directory if it doesn't exist
#     os.makedirs('map_screenshots', exist_ok=True)
    
#     # Setup Chrome options for headless browsing
#     chrome_options = Options()
#     chrome_options.add_argument("--headless")  # Run in headless mode
#     chrome_options.add_argument("--window-size=1920,1080")
    
#     # Get parameters and months
#     numeric_cols = data.select_dtypes(include=['float64']).columns
#     parameters = [col for col in numeric_cols if col not in ['latitude', 'longitude']]
#     available_months = sorted(monthly_data['month'].unique())
    
#     # Initialize webdriver
#     driver = webdriver.Chrome(options=chrome_options)
    
#     try:
#         for parameter in parameters:
#             for month in available_months:
#                 # Create map
#                 center_lat = data['latitude'].mean()
#                 center_lon = data['longitude'].mean()
#                 m = folium.Map(location=[center_lat, center_lon], zoom_start=8)
                
#                 # Filter data for selected month
#                 month_data = monthly_data[monthly_data['month'] == month]
                
#                 # Create colormap
#                 colormap = folium.LinearColormap(
#                     colors=['blue', 'yellow', 'red'],
#                     vmin=month_data[parameter].min(),
#                     vmax=month_data[parameter].max(),
#                     caption=f'{parameter} values'
#                 )
                
#                 # Add points
#                 for idx, row in month_data.iterrows():
#                     folium.CircleMarker(
#                         location=[row['latitude'], row['longitude']],
#                         radius=8,
#                         color=None,
#                         fill=True,
#                         fill_color=colormap(row[parameter]),
#                         fill_opacity=0.7,
#                         popup=f"{parameter}: {row[parameter]:.2f}"
#                     ).add_to(m)
                
#                 colormap.add_to(m)
                
#                 # Save map to temporary HTML file
#                 temp_html = f'temp_map.html'
#                 m.save(temp_html)
                
#                 # Load the HTML in selenium
#                 driver.get(f'file://{os.path.abspath(temp_html)}')
#                 time.sleep(2)  # Wait for map to load
                
#                 # Save screenshot
#                 filename = f'map_screenshots/{parameter}_{month}.png'
#                 driver.save_screenshot(filename)
#                 print(f"Saved {filename}")
                
#                 # Clean up temporary file
#                 os.remove(temp_html)
    
#     finally:
#         driver.quit()

# # Run the function to save all screenshots
# save_map_screenshots(data)

Saved map_screenshots/thetao_2025-01.png
Saved map_screenshots/thetao_2025-02.png
Saved map_screenshots/thetao_2025-03.png
Saved map_screenshots/thetao_2025-04.png
Saved map_screenshots/thetao_2025-05.png
Saved map_screenshots/thetao_2025-06.png
Saved map_screenshots/thetao_2025-07.png
Saved map_screenshots/thetao_2025-08.png
Saved map_screenshots/thetao_2025-09.png
Saved map_screenshots/thetao_2025-10.png
Saved map_screenshots/thetao_2025-11.png
Saved map_screenshots/thetao_2025-12.png
Saved map_screenshots/thetao_2026-01.png
Saved map_screenshots/thetao_2026-02.png
Saved map_screenshots/thetao_2026-03.png
Saved map_screenshots/thetao_2026-04.png
Saved map_screenshots/thetao_2026-05.png
Saved map_screenshots/thetao_2026-06.png
Saved map_screenshots/thetao_2026-07.png
Saved map_screenshots/thetao_2026-08.png
Saved map_screenshots/thetao_2026-09.png
Saved map_screenshots/thetao_2026-10.png
Saved map_screenshots/thetao_2026-11.png
Saved map_screenshots/thetao_2026-12.png
Saved map_screen