In [None]:
import pandas as pd
import numpy as np
import re

def preprocessData(input_file, output_file):
    """Clean one raw sensor-log CSV and write the cleaned copy to disk.

    Processing steps, in order:
      1. For every sensor column that is present, drop everything up to and
         including the first ``:``, strip surrounding whitespace and trailing
         commas, and pad values such as ``"12."`` to ``"12.00"``.
      2. Replace ``Timestamp`` values that contain a space with their second
         space-separated token.
      3. Coerce the sensor columns to numbers; unparseable text becomes NaN.
      4. Drop every row that has a missing value in any column.
      5. Render the sensor columns as two-decimal strings and save the CSV.

    Args:
        input_file: Path of the raw CSV to read.
        output_file: Path of the CSV to write. Missing parent directories are
            created so a fresh checkout does not fail at the save step.

    Returns:
        The cleaned DataFrame (sensor columns hold two-decimal strings).
    """
    import os  # local import: the cell defining this function does not import os

    # Single source of truth for the sensor columns (cleaned, coerced, formatted).
    sensor_columns = ['Temperature', 'Humidity', 'Acc_X', 'Acc_Y', 'Acc_Z','Gyro_X', 'Gyro_Y', 'Gyro_Z', 'LDR']

    def clean_value(value):
        """Return the bare numeric text of one raw cell, or NaN if it is missing."""
        if pd.isna(value):
            return np.nan

        text = str(value)
        if ':' in text:
            # Keep only what follows the first colon (drops a leading label).
            text = text.split(':', 1)[1]
        text = text.strip().rstrip(',')

        # "12." -> "12.00": complete a dangling decimal point.
        if re.match(r'^\d+\.$', text):
            text = text + '00'

        return text

    df = pd.read_csv(input_file)
    print(f"Original data shape: {df.shape}")
    print("Original columns:", df.columns.tolist())
    print("\nFirst few rows before preprocessing:")
    print(df.head())

    present_columns = [col for col in sensor_columns if col in df.columns]
    for col in present_columns:
        df[col] = df[col].apply(clean_value)

    if 'Timestamp' in df.columns:
        df['Timestamp'] = df['Timestamp'].apply(lambda x: x.split(' ')[1] if ' ' in str(x) else x)

    print("\nAfter cleaning prefixes and formatting:")
    print(df.head())

    for col in present_columns:
        # Anything that is still not a number (including empty strings) becomes NaN here,
        # so no separate empty-string check is needed afterwards.
        df[col] = pd.to_numeric(df[col], errors='coerce')

    print(f"\nData shape before dropna: {df.shape}")
    print("Missing values per column:")
    print(df.isnull().sum())

    # Rows with a missing value in ANY column (not only sensor columns) are removed.
    df_cleaned = df.dropna().copy()

    print(f"\nData shape after dropna: {df_cleaned.shape}")
    print(f"Removed {df.shape[0] - df_cleaned.shape[0]} rows with missing values")

    for col in present_columns:
        # No NaN can remain after dropna, so every value can be formatted directly.
        df_cleaned[col] = df_cleaned[col].map(lambda v: f"{v:.2f}")

    print("\nFinal processed data:")
    print(df_cleaned.head(10))
    print(f"\nFinal data shape: {df_cleaned.shape}")

    # Create the destination folder when a filesystem path was given; file-like
    # targets are passed to to_csv untouched.
    if isinstance(output_file, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(output_file))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df_cleaned.to_csv(output_file, index=False)
    print(f"\nProcessed data saved to: {output_file}")

    return df_cleaned


if __name__ == "__main__":
    # Raw log for one weather condition and the destination of its cleaned copy.
    input_file = "./data/NaadVriksha (Responses) - Rainy.csv"
    output_file = "./preprocessed_data/NaadVriksha_Rainy_Processed.csv"
    try:
        processed_df = preprocessData(input_file, output_file)
    except FileNotFoundError as e:
        # Report the path the OS actually complained about; fall back to the
        # input path when the exception carries no filename.
        missing_path = e.filename if e.filename is not None else input_file
        print(f"Error: Could not find the file '{missing_path}'")
        print("Please make sure this path exists relative to the current working directory.")
    except Exception as e:
        # Include the exception type: str(KeyError('x')) alone is just "'x'".
        print(f"An error occurred: {type(e).__name__}: {e}")
    else:
        # Success summary runs only when preprocessing finished without error.
        print("\n" + "="*50)
        print("PREPROCESSING COMPLETED SUCCESSFULLY!")
        print("="*50)
        print(f"Input file: {input_file}")
        print(f"Output file: {output_file}")
        print(f"Rows processed: {len(processed_df)}")

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Cleaned per-weather CSVs, one file per weather condition.
file_paths = [
    "./preprocessed_data/NaadVriksha_Cloudy_Processed.csv",
    "./preprocessed_data/NaadVriksha_Rainy_Processed.csv",
    "./preprocessed_data/NaadVriksha_Stormy_Processed.csv",
    "./preprocessed_data/NaadVriksha_Sunny_Processed.csv",
    "./preprocessed_data/NaadVriksha_Windy_Processed.csv",
]

# Stack every file into one frame with a fresh 0..n-1 index.
dfs = list(map(pd.read_csv, file_paths))
combined_df = pd.concat(dfs, ignore_index=True)

# Min-max scale the sensor columns; the scaler is fitted on the full combined frame.
numeric_cols = [
    'Temperature', 'Humidity',
    'Acc_X', 'Acc_Y', 'Acc_Z',
    'Gyro_X', 'Gyro_Y', 'Gyro_Z',
    'LDR',
]
scaler = MinMaxScaler()
scaler.fit(combined_df[numeric_cols])
combined_df[numeric_cols] = scaler.transform(combined_df[numeric_cols])

final_output_path = "./finalData.csv"
combined_df.to_csv(final_output_path, index=False)
# Bare expression so the notebook displays where the file was written.
final_output_path

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os

def directoryStruct(base_dir):
    """Create the output folder tree under ``base_dir``.

    For each of the five weather conditions a ``<Weather>Split`` folder is
    created with ``train``, ``val`` and ``test`` sub-folders, followed by one
    shared ``Combined`` folder. Existing folders are left untouched, and every
    path is echoed to stdout.

    Args:
        base_dir: Root directory under which the tree is created.
    """
    weather_conditions = ['Sunny', 'Rainy', 'Cloudy', 'Stormy', 'Windy']
    splits = ['train', 'val', 'test']

    # Weather-major order, with the shared folder last.
    targets = [
        os.path.join(base_dir, f"{weather}Split", split)
        for weather in weather_conditions
        for split in splits
    ]
    targets.append(os.path.join(base_dir, 'Combined'))

    for target in targets:
        os.makedirs(target, exist_ok=True)
        print(f"Created directory: {target}")

def splitData(file_path, weather_name, output_base_dir, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    """Split one CSV into contiguous train/val/test parts and save each part.

    Rows are taken in file order (no shuffling): the first
    ``int(n * train_ratio)`` rows form the train part, the next
    ``int(n * val_ratio)`` rows the validation part, and all remaining rows
    the test part. Each part is written to
    ``<output_base_dir>/<weather_name>Split/<split>/<weather_name>_<split>.csv``;
    the folders are created when they do not exist.

    Args:
        file_path: CSV file to split.
        weather_name: Label used in the folder and file names.
        output_base_dir: Root directory of the split tree.
        train_ratio: Fraction of rows for the train part.
        val_ratio: Fraction of rows for the validation part.
        test_ratio: Expected fraction for the test part. The test part is
            always the remainder, so this value is only used to warn when the
            three ratios do not add up to 1.

    Returns:
        Dict with keys ``'train'``, ``'val'`` and ``'test'`` mapping to the
        corresponding DataFrames.
    """
    import warnings  # local import: not among the notebook's cell-level imports

    df = pd.read_csv(file_path)
    print(f"\nProcessing {weather_name} data...")
    print(f"Original shape: {df.shape}")

    # test_ratio does not drive the slicing; flag inconsistent input instead of
    # silently producing a test part of a different size than requested.
    if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-9:
        warnings.warn(
            "train_ratio + val_ratio + test_ratio != 1; "
            "the test split receives all rows left after train and val.",
            stacklevel=2,
        )

    total_samples = len(df)
    train_size = int(total_samples * train_ratio)
    val_end = train_size + int(total_samples * val_ratio)

    partitions = {
        'train': df.iloc[:train_size].copy(),
        'val': df.iloc[train_size:val_end].copy(),
        'test': df.iloc[val_end:].copy(),
    }
    print(f"Split sizes - Train: {len(partitions['train'])}, Val: {len(partitions['val'])}, Test: {len(partitions['test'])}")

    # Wording used in the "Saved ..." log line for each split.
    log_labels = {'train': 'train', 'val': 'validation', 'test': 'test'}
    weather_dir = f"{weather_name}Split"
    for split_name, part in partitions.items():
        split_dir = os.path.join(output_base_dir, weather_dir, split_name)
        # Do not rely on the directory tree having been prepared beforehand.
        os.makedirs(split_dir, exist_ok=True)
        out_path = os.path.join(split_dir, f'{weather_name}_{split_name}.csv')
        part.to_csv(out_path, index=False)
        print(f"Saved {log_labels[split_name]} data to: {out_path}")

    return partitions

def create_combined_datasets_with_normalization(output_base_dir):
    """Merge the per-weather splits and write raw and min-max-scaled copies.

    For each of ``train``, ``val`` and ``test`` the existing
    ``<Weather>Split/<split>/<Weather>_<split>.csv`` files are concatenated.
    When a combined train set exists, a ``MinMaxScaler`` is fitted on its
    sensor columns only and applied to every combined split. Both the scaled
    and the unscaled frames are saved in ``<output_base_dir>/Combined``.

    Args:
        output_base_dir: Root directory holding the split tree.

    Returns:
        Dict mapping split name to the combined, un-normalized DataFrame.
        Splits for which no file was found are absent.
    """
    weather_conditions = ['Sunny', 'Rainy', 'Cloudy', 'Stormy', 'Windy']
    splits = ['train', 'val', 'test']
    numerical_cols = ['Temperature', 'Humidity', 'Acc_X', 'Acc_Y', 'Acc_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z', 'LDR']
    combined_dir = os.path.join(output_base_dir, 'Combined')
    combined_datasets = {}

    # Stage 1: gather whatever per-weather files exist for each split.
    for split in splits:
        print(f"\nCombining {split} data from all weather conditions...")
        frames = []
        for weather in weather_conditions:
            file_path = os.path.join(output_base_dir, f"{weather}Split", split, f'{weather}_{split}.csv')
            if not os.path.exists(file_path):
                print(f"Warning: {file_path} not found")
                continue
            frame = pd.read_csv(file_path)
            frames.append(frame)
            print(f"Added {weather} {split} data: {len(frame)} samples")

        if not frames:
            continue

        merged = pd.concat(frames, ignore_index=True)
        combined_datasets[split] = merged
        print(f"Combined {split} dataset: {len(merged)} samples")
        print(f"Weather distribution in {split}:")
        print(merged['Weather'].value_counts())

    banner = '=' * 50
    print(f"\n{banner}")
    print("APPLYING NORMALIZATION TO COMBINED DATASETS")
    print(f"{banner}")

    # Stage 2: without a train set there is nothing to fit the scaler on.
    if 'train' not in combined_datasets:
        return combined_datasets

    train_df = combined_datasets['train']
    existing_numerical_cols = [col for col in numerical_cols if col in train_df.columns]
    print(f"Normalizing columns: {existing_numerical_cols}")

    # Scaling statistics come from the train split only and are reused for val/test.
    scaler = MinMaxScaler()
    scaler.fit(train_df[existing_numerical_cols])

    for split_name, raw_df in combined_datasets.items():
        print(f"\nNormalizing {split_name} data...")
        normalized_df = raw_df.copy()
        normalized_df[existing_numerical_cols] = scaler.transform(raw_df[existing_numerical_cols])

        print(f"Data ranges after normalization in {split_name}:")
        for col in existing_numerical_cols:
            low = normalized_df[col].min()
            high = normalized_df[col].max()
            print(f"  {col}: {low:.3f} to {high:.3f}")

        normalized_path = os.path.join(combined_dir, f'combined_{split_name}_normalized.csv')
        normalized_df.to_csv(normalized_path, index=False)
        print(f"Saved normalized {split_name} data to: {normalized_path}")

        non_normalized_path = os.path.join(combined_dir, f'combined_{split_name}_raw.csv')
        raw_df.to_csv(non_normalized_path, index=False)
        print(f"Saved raw {split_name} data to: {non_normalized_path}")

    return combined_datasets

def main():
    """Run the split pipeline end to end and print a summary.

    Creates the output folder tree, splits every available per-weather CSV
    from ``preprocessed_data`` into train/val/test, builds the combined
    (raw and normalized) datasets, then prints per-weather and combined
    sample counts followed by a sketch of the resulting directory layout.
    """
    preprocessed_data_dir = 'preprocessed_data'
    output_base_dir = 'split_data'
    files_to_process = {
        'Sunny': 'NaadVriksha_Sunny_Processed.csv',
        'Rainy': 'NaadVriksha_Rainy_Processed.csv',
        'Cloudy': 'NaadVriksha_Cloudy_Processed.csv',
        'Stormy': 'NaadVriksha_Stormy_Processed.csv',
        'Windy': 'NaadVriksha_Windy_Processed.csv',
    }
    banner = '=' * 50

    print("Creating directory structure...")
    directoryStruct(output_base_dir)

    print(f"\n{banner}")
    print("STEP 1: SPLITTING DATA WITHOUT NORMALIZATION")
    print(f"{banner}")
    all_splits = {}
    for weather_name, filename in files_to_process.items():
        file_path = os.path.join(preprocessed_data_dir, filename)
        if not os.path.exists(file_path):
            # A missing input is reported and skipped rather than aborting the run.
            print(f"Warning: File {file_path} not found!")
            continue
        all_splits[weather_name] = splitData(
            file_path=file_path,
            weather_name=weather_name,
            output_base_dir=output_base_dir,
            train_ratio=0.7,
            val_ratio=0.15,
            test_ratio=0.15,
        )

    print(f"\n{banner}")
    print("STEP 2: COMBINING AND NORMALIZING DATA")
    print(f"{banner}")
    combined_datasets = create_combined_datasets_with_normalization(output_base_dir)

    print(f"\n{banner}")
    print("SUMMARY")
    print(f"{banner}")

    print("Individual Weather Splits (Raw Data):")
    for weather_name, parts in all_splits.items():
        n_train = len(parts['train'])
        n_val = len(parts['val'])
        n_test = len(parts['test'])
        print(f"{weather_name}:")
        print(f"  Train: {n_train} samples")
        print(f"  Val: {n_val} samples")
        print(f"  Test: {n_test} samples")
        print(f"  Total: {n_train + n_val + n_test} samples")

    print(f"\nCombined Datasets:")
    for split_name, combined in combined_datasets.items():
        print(f"Combined {split_name}: {len(combined)} samples")
        print(f"  Weather distribution: {dict(combined['Weather'].value_counts())}")

    print(f"\nAll data saved in: {output_base_dir}/")
    print("Final Directory Structure:")
    # Static sketch of the layout produced above, printed line by line.
    tree_lines = (
        "├─ split_data/",
        "│  ├─ SunnySplit/",
        "│  │  ├─ train/ (Sunny_train.csv)",
        "│  │  ├─ val/ (Sunny_val.csv)",
        "│  │  └─ test/ (Sunny_test.csv)",
        "│  ├─ RainySplit/ ...",
        "│  ├─ CloudySplit/ ...",
        "│  ├─ StormySplit/ ...",
        "│  ├─ WindySplit/ ...",
        "│  └─ Combined/",
        "│     ├─ combined_train_raw.csv",
        "│     ├─ combined_train_normalized.csv",
        "│     ├─ combined_val_raw.csv",
        "│     ├─ combined_val_normalized.csv",
        "│     ├─ combined_test_raw.csv",
        "│     └─ combined_test_normalized.csv",
    )
    for tree_line in tree_lines:
        print(tree_line)

# Entry point: run the split/combine/normalize pipeline when this code is
# executed directly (the guard keeps it from running on import).
if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
# Integer code assigned to each weather label in the 'Weather' column.
weather_encoding = {"Sunny": 0,"Rainy": 1,"Stormy": 2,"Windy": 3,"Cloudy": 4}
folder_path = "split_data/Combined"
files = {"train": "combined_train_normalized.csv","val": "combined_val_normalized.csv","test": "combined_test_normalized.csv"}

for split, filename in files.items():
    file_path = f"{folder_path}/{filename}"
    df = pd.read_csv(file_path)

    # The files are rewritten in place, so this cell must be safe to re-run:
    # mapping already-encoded integers through the name->code dict would turn
    # every value into NaN and destroy the column.
    known_codes = list(weather_encoding.values())
    if df["Weather"].dropna().isin(known_codes).all():
        print(f"{split.capitalize()} file already encoded, skipping: {file_path}")
        continue

    # Series.map turns labels missing from the dict into NaN; fail loudly
    # instead of silently writing an incomplete label column.
    unknown_labels = set(df["Weather"].dropna().unique()) - set(weather_encoding)
    if unknown_labels:
        raise ValueError(f"Unknown weather labels in {file_path}: {sorted(map(str, unknown_labels))}")

    df["Weather"] = df["Weather"].map(weather_encoding)
    df.to_csv(file_path, index=False)
    print(f"{split.capitalize()} file encoded and saved: {file_path}")

In [None]:
# Print the label -> code legend. Relies on `weather_encoding` defined in the
# preceding cell. The status line is printed once (it was previously repeated
# verbatim after the legend).
print("\nEncoding 'Weather' column in combined datasets...")
print("The weathers are encoded as follows:")
for weather, code in weather_encoding.items():
    print(f"{weather}: {code}")