In [2]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [3]:

"""
Preprocess rainfall and station data for rainfall trend analysis in Eastern Nepal.
This script loads raw data, handles missing values and duplicates, performs feature engineering,
splits data into training and testing sets, and saves preprocessed data.
"""


# Input and output directories. Both are relative paths, so they resolve
# against the current working directory of the process running this code.
RAW_DATA_PATH = '../Data/Raw'
PREPROCESSED_PATH = '../Data/Preprocessed'

# Create the output directory (and any missing parents) up front;
# exist_ok=True makes this a no-op when it is already there.
os.makedirs(PREPROCESSED_PATH, exist_ok=True)

def load_data():
    """Read the station and rainfall CSV files found under RAW_DATA_PATH.

    Returns:
        Tuple ``(stations, rainfall)`` of DataFrames, in that order.

    Raises:
        FileNotFoundError: If either CSV file is missing; the message names
            the directory that was searched.
    """
    station_file = os.path.join(RAW_DATA_PATH, 'Eastern Data.csv')
    rainfall_file = os.path.join(RAW_DATA_PATH, 'rainfall_data.csv')

    try:
        stations = pd.read_csv(station_file)
        rainfall = pd.read_csv(rainfall_file)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Error: {e}. Check if the files exist in {RAW_DATA_PATH}")
    else:
        # Report what was loaded so column-name problems are visible early.
        print("Data loaded successfully.")
        print("Stations columns:", stations.columns.tolist())
        print("Rainfall columns:", rainfall.columns.tolist())
        return stations, rainfall

def handle_missing_values(stations, rainfall):
    """Fill missing rainfall totals with 0 and drop stations without an index number.

    Neither input DataFrame is modified; new frames are returned.

    Args:
        stations: Station metadata; must contain an 'Index No.' column.
        rainfall: Rainfall records; must contain a 'rainfall_sum' column.

    Returns:
        Tuple ``(stations, rainfall)`` of cleaned DataFrames.

    Raises:
        ValueError: If either required column is absent.
    """
    # Validate both inputs before doing any work so that a bad stations frame
    # cannot leave the rainfall data half-processed.
    if 'rainfall_sum' not in rainfall.columns:
        raise ValueError("Column 'rainfall_sum' not found in rainfall data")
    if 'Index No.' not in stations.columns:
        raise ValueError("Column 'Index No.' not found in stations data")

    # Rainfall: a missing 'rainfall_sum' is treated as 0 (assuming no rain).
    # assign() builds a new frame instead of writing into the caller's object.
    rainfall = rainfall.assign(rainfall_sum=rainfall['rainfall_sum'].fillna(0))

    # Stations: rows without the critical 'Index No.' field are dropped.
    stations = stations.dropna(subset=['Index No.'])

    print("Missing values handled.")
    return stations, rainfall

def handle_duplicates(stations, rainfall):
    """Drop fully duplicated rows from both frames and report the remaining counts.

    Args:
        stations: Station metadata DataFrame.
        rainfall: Rainfall records DataFrame.

    Returns:
        Tuple ``(stations, rainfall)`` with exact duplicate rows removed.
    """
    # Both frames get the same treatment, so de-duplicate them in one pass.
    stations, rainfall = (frame.drop_duplicates() for frame in (stations, rainfall))
    print(f"Duplicates removed. Stations: {len(stations)}, Rainfall: {len(rainfall)} rows.")
    return stations, rainfall

def _standardize_columns(frame, replace_dots=False):
    """Return a renamed copy of *frame* with stripped, lower-cased, underscored labels."""
    def clean(label):
        label = label.strip().lower().replace(' ', '_')
        # Plain str.replace is always literal, so '.' can never be interpreted
        # as a regex wildcard, whatever the installed pandas version defaults to.
        return label.replace('.', '_') if replace_dots else label

    # rename() returns a new frame, leaving the caller's column labels intact.
    return frame.rename(columns=clean)


def preprocess_and_merge(stations, rainfall):
    """Standardize column names, merge rainfall with station metadata, add date features.

    The input frames are not modified.

    Args:
        stations: Station metadata. After standardization it must expose the
            station index as 'index_no_' (e.g. a raw 'Index No.' header).
        rainfall: Rainfall records with 'index_no', 'year', 'month' and 'days'
            columns (case and surrounding whitespace are normalized).

    Returns:
        DataFrame with one row per rainfall record that has a valid calendar
        date, left-joined to its station metadata on 'station_id', plus a
        'date' column and 'month'/'year' columns derived from it.

    Raises:
        KeyError: If 'station_id' or one of the date columns is missing after
            the renaming step.
    """
    stations = _standardize_columns(stations, replace_dots=True)
    rainfall = _standardize_columns(rainfall)

    # Rename columns for consistency. The unit-suffixed keys cover headers
    # such as 'Lat(deg)' / 'Lon(deg)' / 'Ele(meter)', which the bare
    # 'lat' / 'lon' / 'ele' keys would silently fail to match.
    stations = stations.rename(columns={
        'index_no_': 'station_id',
        'ele': 'elevation',
        'lat': 'latitude',
        'lon': 'longitude',
        'ele(meter)': 'elevation',
        'lat(deg)': 'latitude',
        'lon(deg)': 'longitude',
    })
    rainfall = rainfall.rename(columns={
        'index_no': 'station_id',
        'station': 'station_name',
    })

    # Left join keeps every rainfall record even when its station is unknown.
    merged_data = pd.merge(rainfall, stations, on='station_id', how='left')

    # Assemble a timestamp from the year/month/days columns; impossible dates
    # (e.g. 31 February) become NaT and are dropped below.
    merged_data['date'] = pd.to_datetime(merged_data[['year', 'month', 'days']], errors='coerce')
    merged_data = merged_data.dropna(subset=['date'])

    # Re-derive month and year from the validated date. assign() returns a new
    # frame, so nothing is written into the filtered result of dropna().
    merged_data = merged_data.assign(
        month=merged_data['date'].dt.month,
        year=merged_data['date'].dt.year,
    )

    print("Data merged and features engineered.")
    return merged_data

def split_data(merged_data):
    """Split the merged data chronologically into training and testing sets.

    Rows are ordered by 'date' and split without shuffling: the first 80%
    become the training set and the final 20% the test set.

    Args:
        merged_data: DataFrame containing a 'date' column.

    Returns:
        Tuple ``(train_data, test_data)``.
    """
    chronological = merged_data.sort_values(by='date')
    # shuffle=False keeps the temporal order so the test set follows the training set.
    train_data, test_data = train_test_split(
        chronological,
        test_size=0.2,
        shuffle=False,
    )
    print(f"Data split: Training ({len(train_data)} rows), Testing ({len(test_data)} rows)")
    return train_data, test_data

def save_data(train_data, test_data):
    """Write the training and testing sets as CSV files under PREPROCESSED_PATH.

    Args:
        train_data: DataFrame written to 'train_data.csv'.
        test_data: DataFrame written to 'test_data.csv'.
    """
    outputs = (
        ('train_data.csv', train_data),
        ('test_data.csv', test_data),
    )
    for filename, frame in outputs:
        # index=False keeps the row index out of the saved files.
        frame.to_csv(os.path.join(PREPROCESSED_PATH, filename), index=False)
    print(f"Preprocessed data saved to {PREPROCESSED_PATH}")

def main():
    """Run the preprocessing pipeline end to end: load, clean, merge, split, save."""
    # Each cleaning stage takes and returns the (stations, rainfall) pair.
    frames = load_data()
    frames = handle_missing_values(*frames)
    frames = handle_duplicates(*frames)

    merged_data = preprocess_and_merge(*frames)
    print("Feature scaling skipped as rainfall data is already in consistent units (mm).")

    splits = split_data(merged_data)
    save_data(*splits)
    print("Data preprocessing completed successfully.")

# Run the pipeline only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Data loaded successfully.
Stations columns: ['S.N.', 'Station Name', 'Index No.', 'Basin Office', 'Types of Station', 'District', 'Lat(deg)', 'Lon(deg)', 'Ele(meter)', 'Unnamed: 9']
Rainfall columns: ['gsid', 'index_no', 'station', 'district', 'year', 'month', 'days', 'rainfall_sum', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10']
Missing values handled.
Duplicates removed. Stations: 18, Rainfall: 269380 rows.
Data merged and features engineered.
Feature scaling skipped as rainfall data is already in consistent units (mm).
Data split: Training (215504 rows), Testing (53876 rows)
Preprocessed data saved to ../Data/Preprocessed
Data preprocessing completed successfully.
