In [None]:
import kagglehub

# Download the rainfall dataset through kagglehub and keep whatever location
# the call hands back.
path = kagglehub.dataset_download("sajilck/rain-data-master-8k")

# NOTE(review): the cells below read from hard-coded /kaggle/input/... paths and
# never use `path` - confirm both refer to the same location.
print("Path to dataset files:", path)

In [None]:
import pandas as pd
import numpy as np
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import datetime

# Fixed locations of the rainfall table and the folder of recordings.
CSV_FILE_PATH = '/kaggle/input/rain-data-master-8k/rain_data_mechanical_master.csv'
AUDIO_FOLDER_PATH = '/kaggle/input/rain-data-master-8k/rainfall_sound_8k'

# Read the rainfall table; a missing file is reported and leaves df_rain as None
# so that later cells can test for it.
try:
    df_rain = pd.read_csv(CSV_FILE_PATH)
except FileNotFoundError:
    df_rain = None
    print(f"Error: The file {CSV_FILE_PATH} was not found.")
else:
    print("Successfully loaded rain_data_mechanical_master.csv")
    print("Shape of the dataframe:", df_rain.shape)

# Collect the .wav names in sorted order; a missing folder is reported and
# leaves an empty list.
try:
    audio_files = sorted(
        entry for entry in os.listdir(AUDIO_FOLDER_PATH) if entry.endswith('.wav')
    )
except FileNotFoundError:
    audio_files = []
    print(f"Error: The directory {AUDIO_FOLDER_PATH} was not found.")
else:
    print(f"Found {len(audio_files)} audio files in {AUDIO_FOLDER_PATH}")

In [None]:
# Probe the first recording to report its sampling rate and length.
#
# Both results are bound before the probe so that they exist on every path.
# Later cells pass `sampling_rate` to librosa.load(sr=...); with None they load
# each file at its own rate instead of failing with a NameError when this cell
# found no file or could not read it.
sampling_rate = None
audio_data = None

if audio_files:
    sample_audio_path = os.path.join(AUDIO_FOLDER_PATH, audio_files[0])
    try:
        # sr=None asks librosa not to resample, so the returned rate is the
        # one stored in the file.
        audio_data, sampling_rate = librosa.load(sample_audio_path, sr=None)
    except Exception as e:
        # Best effort: report the problem and carry on with the None defaults.
        print(f"An error occurred while reading the audio file: {e}")
    else:
        print(f"1.a. The sampling rate of the audio files is: {sampling_rate} Hz")

        length_in_samples = len(audio_data)
        print(f"1.b. The length of an individual audio file is: {length_in_samples} samples")
else:
    print("Cannot determine sampling rate and length because no audio files were found.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

if df_rain is not None:
    def convert_rainfall_to_mm(value):
        """Convert one raw rainfall reading to millimetres.

        ``value`` is turned into text first, so plain numbers and strings are
        both accepted. A micrometre suffix divides the number by 1000, an
        ``mm`` suffix is stripped, and anything else is parsed as a bare
        number. The micrometre suffix is matched with either the micro sign
        (U+00B5) or the Greek small mu (U+03BC); the two look identical but
        are different characters, and matching only one would silently turn
        the other spelling into a missing value.

        Returns:
            The reading as a float in millimetres, or None when the text
            cannot be parsed as a number.
        """
        try:
            value_str = str(value)
            for micro_suffix in ('\u00b5m', '\u03bcm'):
                if micro_suffix in value_str:
                    numeric_part = value_str.replace(micro_suffix, '').strip()
                    return float(numeric_part) / 1000
            if 'mm' in value_str:
                numeric_part = value_str.replace('mm', '').strip()
                return float(numeric_part)
            return float(value_str)
        except (ValueError, TypeError):
            return None

    # Normalised rainfall column (millimetres) used by the plot and by the
    # max/min summary in the next cell.
    df_rain['rainfall_mm'] = df_rain['device_frmpayload_data_rainfall'].apply(convert_rainfall_to_mm)

    # Parse the timestamps so the x axis is a real time axis.
    df_rain['Time'] = pd.to_datetime(df_rain['Time'])

    plt.style.use('seaborn-v0_8-whitegrid')
    plt.figure(figsize=(16, 6))

    sns.lineplot(data=df_rain, x='Time', y='rainfall_mm')

    plt.title('Rainfall Recorded Over Time', fontsize=16)
    plt.xlabel('Time', fontsize=12)
    plt.ylabel('Rainfall (in mm)', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

else:
    print("Cannot create plot because the dataframe is not loaded.")

In [None]:
if df_rain is not None:
    # Readings that could not be parsed are missing values. Drop them first so
    # the idxmax/idxmin lookups below always resolve to a real row, and so the
    # "nothing usable" case can be reported instead of raising.
    valid_rain_df = df_rain.dropna(subset=['rainfall_mm'])

    if valid_rain_df.empty:
        print("1.d. No valid rainfall readings are available, so the maximum and minimum cannot be reported.")
    else:
        max_rain_row = valid_rain_df.loc[valid_rain_df['rainfall_mm'].idxmax()]
        max_rain_date = max_rain_row['Time'].date()
        max_rain_value = max_rain_row['rainfall_mm']
        print(f"1.d. Maximum rainfall of {max_rain_value:.2f} mm was observed on: {max_rain_date}")

        # The minimum is taken over strictly positive readings only; that
        # subset can be empty even when valid readings exist.
        non_zero_rain_df = valid_rain_df[valid_rain_df['rainfall_mm'] > 0]
        if non_zero_rain_df.empty:
            print("1.d. No non-zero rainfall readings were found, so a minimum cannot be reported.")
        else:
            min_rain_row = non_zero_rain_df.loc[non_zero_rain_df['rainfall_mm'].idxmin()]
            min_rain_date = min_rain_row['Time'].date()
            min_rain_value = min_rain_row['rainfall_mm']
            print(f"1.d. Minimum (non-zero) rainfall of {min_rain_value:.2f} mm was observed on: {min_rain_date}")

In [None]:
# Fresh copy of the rainfall table with 'Time' parsed into datetimes.
df_rain = pd.read_csv(CSV_FILE_PATH).assign(
    Time=lambda frame: pd.to_datetime(frame['Time'])
)

# Sorted names of every .wav entry in the audio folder.
audio_files = sorted(
    entry for entry in os.listdir(AUDIO_FOLDER_PATH) if entry.endswith('.wav')
)

def get_timestamp_from_filename(filename):
    """Read the timestamp that an audio file name encodes.

    Every '.wav' occurrence is removed from ``filename`` and the rest is
    parsed as year_month_day_hour_minute_second_microsecond.

    Returns:
        The parsed ``datetime.datetime``. ``strptime`` raises ValueError when
        the name does not follow that layout.
    """
    name_layout = '%Y_%m_%d_%H_%M_%S_%f'
    return datetime.datetime.strptime(filename.replace('.wav', ''), name_layout)

# One row per recording, indexed by the timestamp parsed from its file name.
df_audio = (
    pd.DataFrame({'audio_filename': audio_files})
    .assign(timestamp=lambda frame: frame['audio_filename'].apply(get_timestamp_from_filename))
    .set_index('timestamp')
)

In [None]:
# For every rainfall reading, collect the recordings whose timestamp falls in
# the three minutes leading up to it (both ends of the window are inclusive).
all_relevant_files_info = []
lookback = datetime.timedelta(minutes=3)

# Only the 'Time' column is used, so iterate over it directly instead of
# materialising a full row for every reading with iterrows().
for target_timestamp in tqdm(df_rain['Time'], total=len(df_rain)):
    start_time = target_timestamp - lookback
    end_time = target_timestamp

    in_window = (df_audio.index >= start_time) & (df_audio.index <= end_time)
    relevant_audio_df = df_audio[in_window]

    all_relevant_files_info.append({
        'target_time': target_timestamp,
        'found_files_df': relevant_audio_df,
    })

print(f"Processed all {len(all_relevant_files_info)} timestamps.")

# The example below indexes the first entry, so only show it when there is one.
if all_relevant_files_info:
    print("\nExample result for the first processed timestamp:")
    first_result = all_relevant_files_info[0]
    print(f"For the timestamp {first_result['target_time']}, we found {len(first_result['found_files_df'])} relevant audio files.")
    print(first_result['found_files_df'])
else:
    print("\nNo timestamps were available, so there is no example result to show.")

In [None]:
# Build one concatenated waveform per rainfall reading from the recordings
# stamped within the three minutes before it, keyed by the reading's timestamp.
all_combined_audio = {}
lookback = datetime.timedelta(minutes=3)

for target_timestamp in tqdm(df_rain['Time'], total=len(df_rain)):
    start_time = target_timestamp - lookback
    end_time = target_timestamp

    relevant_audio_df = df_audio[(df_audio.index >= start_time) & (df_audio.index <= end_time)]
    if relevant_audio_df.empty:
        # No recording falls inside this window, so there is nothing to combine.
        continue

    # Concatenate in chronological order.
    files_to_combine = relevant_audio_df.sort_index()['audio_filename'].tolist()
    list_of_audio_arrays = [
        librosa.load(os.path.join(AUDIO_FOLDER_PATH, file_name), sr=sampling_rate)[0]
        for file_name in files_to_combine
    ]

    combined_audio = np.concatenate(list_of_audio_arrays)
    all_combined_audio[target_timestamp] = combined_audio

print(f"Processed all timestamps and created {len(all_combined_audio)} combined audio arrays.")

# Picking "the last key" needs at least one entry; report the empty case
# instead of failing with an IndexError.
if all_combined_audio:
    last_timestamp_processed = list(all_combined_audio)[-1]
    last_combined_array = all_combined_audio[last_timestamp_processed]
    print(f"The combined audio array for timestamp {last_timestamp_processed} has a length of {len(last_combined_array)} samples.")
else:
    print("No combined audio arrays were created, so there is no example to report.")

In [None]:
OUTPUT_NUMPY_FOLDER = '/content/combined_audio_numpy'

# Create the destination folder when it is missing, and say so.
if not os.path.exists(OUTPUT_NUMPY_FOLDER):
    os.makedirs(OUTPUT_NUMPY_FOLDER)
    print(f"Created directory: {OUTPUT_NUMPY_FOLDER}")

# For each rainfall reading, join the recordings from the preceding three
# minutes and write the result as one .npy file named after the reading's time.
saved_files_count = 0
for index, row in tqdm(df_rain.iterrows(), total=df_rain.shape[0]):
    target_timestamp = row['Time']
    end_time = target_timestamp
    start_time = end_time - datetime.timedelta(minutes=3)

    after_start = df_audio.index >= start_time
    before_end = df_audio.index <= end_time
    relevant_audio_df = df_audio[after_start & before_end]
    if relevant_audio_df.empty:
        continue

    # Chronological order for the concatenation.
    files_to_combine = relevant_audio_df.sort_index()['audio_filename'].tolist()
    list_of_audio_arrays = [
        librosa.load(os.path.join(AUDIO_FOLDER_PATH, file_name), sr=sampling_rate)[0]
        for file_name in files_to_combine
    ]
    if not list_of_audio_arrays:
        continue

    combined_audio = np.concatenate(list_of_audio_arrays)

    numpy_filename = f"audio_{target_timestamp.strftime('%Y%m%d_%H%M%S')}.npy"
    np.save(os.path.join(OUTPUT_NUMPY_FOLDER, numpy_filename), combined_audio)
    saved_files_count += 1

print(f"\nProcess complete. Saved {saved_files_count} .npy files to the '{OUTPUT_NUMPY_FOLDER}' directory.")

In [None]:
import pandas as pd
import numpy as np
import os
import librosa
import datetime
from tqdm.notebook import tqdm

# Input locations for this self-contained pipeline cell. The audio folder is
# spelled with a trailing slash here, unlike the earlier definition.
CSV_FILE_PATH = '/kaggle/input/rain-data-master-8k/rain_data_mechanical_master.csv'
AUDIO_FOLDER_PATH = '/kaggle/input/rain-data-master-8k/rainfall_sound_8k/'
# NOTE(review): this is an absolute path at the filesystem root - confirm that
# is the intended destination for the feature CSV.
FINAL_CSV_OUTPUT_PATH = '/preprocessed_rainfall_features.csv'

# Reload the rainfall table and parse 'Time' into datetimes.
df_rain = pd.read_csv(CSV_FILE_PATH)
df_rain['Time'] = pd.to_datetime(df_rain['Time'])

def convert_rainfall_to_mm(value):
    """Convert one raw rainfall reading to millimetres.

    ``value`` is turned into text first, so plain numbers and strings are both
    accepted. A micrometre suffix divides the number by 1000, an ``mm`` suffix
    is stripped, and anything else is parsed as a bare number. The micrometre
    suffix is matched with either the micro sign (U+00B5) or the Greek small
    mu (U+03BC); the two look identical but are different characters, and
    matching only one would silently turn the other spelling into a missing
    value.

    Args:
        value: The raw reading, e.g. ``'2.5 mm'``, a micrometre string, or a
            number.

    Returns:
        The reading as a float in millimetres, or None when the text cannot be
        parsed as a number.
    """
    try:
        value_str = str(value)
        for micro_suffix in ('\u00b5m', '\u03bcm'):
            if micro_suffix in value_str:
                numeric_part = value_str.replace(micro_suffix, '').strip()
                return float(numeric_part) / 1000
        if 'mm' in value_str:
            numeric_part = value_str.replace('mm', '').strip()
            return float(numeric_part)
        return float(value_str)
    except (ValueError, TypeError):
        return None

# Normalise every raw reading through convert_rainfall_to_mm; this column is the
# target value stored with each feature row further down.
df_rain['rainfall_cleaned'] = df_rain['device_frmpayload_data_rainfall'].apply(convert_rainfall_to_mm)

# Sorted names of every .wav entry in the audio folder.
audio_files = sorted([f for f in os.listdir(AUDIO_FOLDER_PATH) if f.endswith('.wav')])

def get_timestamp_from_filename(filename):
    """Turn an audio file name into the datetime it encodes.

    All '.wav' occurrences are dropped from ``filename`` and what is left is
    parsed with the pattern year_month_day_hour_minute_second_microsecond.

    Returns:
        The parsed ``datetime.datetime``. ``strptime`` raises ValueError for a
        name that does not follow the pattern.
    """
    # Splitting on '.wav' and re-joining removes every occurrence of it.
    stamp_text = ''.join(filename.split('.wav'))
    return datetime.datetime.strptime(stamp_text, '%Y_%m_%d_%H_%M_%S_%f')

# One row per recording, indexed by the timestamp parsed from its file name.
df_audio = (
    pd.DataFrame({'audio_filename': audio_files})
    .assign(timestamp=lambda frame: frame['audio_filename'].apply(get_timestamp_from_filename))
    .set_index('timestamp')
)

# Take the sampling rate from the first recording; sr=None asks librosa not to
# resample, so this is the rate stored in the file.
first_audio_path = os.path.join(AUDIO_FOLDER_PATH, audio_files[0])
_, sampling_rate = librosa.load(first_audio_path, sr=None)

# One feature row per rainfall reading that has at least one recording in the
# three minutes before it.
all_features_list = []
three_minutes = datetime.timedelta(minutes=3)

for index, row in tqdm(df_rain.iterrows(), total=df_rain.shape[0]):
    target_timestamp, target_rainfall = row['Time'], row['rainfall_cleaned']
    start_time, end_time = target_timestamp - three_minutes, target_timestamp

    inside_window = (df_audio.index >= start_time) & (df_audio.index <= end_time)
    relevant_audio_df = df_audio[inside_window]
    if relevant_audio_df.empty:
        continue

    # Load the window's recordings in chronological order.
    files_to_combine = relevant_audio_df.sort_index()['audio_filename'].tolist()
    list_of_audio_arrays = [
        librosa.load(os.path.join(AUDIO_FOLDER_PATH, file_name), sr=sampling_rate)[0]
        for file_name in files_to_combine
    ]
    if not list_of_audio_arrays:
        continue

    # Left bound at module level on purpose: the following cell reads the
    # waveform of the last processed window.
    combined_audio = np.concatenate(list_of_audio_arrays)

    # Each feature is reduced to one number by averaging over everything that
    # librosa returns for the whole window.
    mfccs = librosa.feature.mfcc(y=combined_audio, sr=sampling_rate, n_mfcc=13)
    chroma = librosa.feature.chroma_stft(y=combined_audio, sr=sampling_rate)
    feature_dict = {
        'zcr_mean': np.mean(librosa.feature.zero_crossing_rate(y=combined_audio)),
        'rms_mean': np.mean(librosa.feature.rms(y=combined_audio)),
        'spectral_centroid_mean': np.mean(librosa.feature.spectral_centroid(y=combined_audio, sr=sampling_rate)),
        'mfcc_mean': np.mean(mfccs),
        'chroma_mean': np.mean(chroma),
        'target_rainfall': target_rainfall,
    }
    all_features_list.append(feature_dict)

# Build the table once from the collected rows and write it out.
df_preprocessed = pd.DataFrame(all_features_list)
df_preprocessed.to_csv(FINAL_CSV_OUTPUT_PATH, index=False)

print("\nPreprocessing complete.")
print(f"Preprocessed data file saved to: {FINAL_CSV_OUTPUT_PATH}")
print("\n--- Displaying the first 5 rows of the new file ---")
print(df_preprocessed.head())

In [None]:
# Print the five summary features for a single waveform.
# `combined_audio` and `sampling_rate` are not defined in this cell; they must
# already be bound by an earlier cell.

zcr_frames = librosa.feature.zero_crossing_rate(y=combined_audio)
zcr_mean = np.mean(zcr_frames)
print(f"Zero-Crossing Rate Mean: {zcr_mean}")

rms_frames = librosa.feature.rms(y=combined_audio)
rms_mean = np.mean(rms_frames)
print(f"RMS Energy Mean: {rms_mean}")

centroid_frames = librosa.feature.spectral_centroid(y=combined_audio, sr=sampling_rate)
spectral_centroid_mean = np.mean(centroid_frames)
print(f"Spectral Centroid Mean: {spectral_centroid_mean}")

# The mean is taken over the whole MFCC result, so all 13 coefficients
# collapse into one number.
mfccs = librosa.feature.mfcc(y=combined_audio, sr=sampling_rate, n_mfcc=13)
mfcc_mean = np.mean(mfccs)
print(f"MFCC Mean: {mfcc_mean}")

chroma = librosa.feature.chroma_stft(y=combined_audio, sr=sampling_rate)
chroma_mean = np.mean(chroma)
print(f"Chroma Mean: {chroma_mean}")

In [None]:
import pandas as pd
import numpy as np
import os
import librosa
import datetime
from tqdm.notebook import tqdm

# Input locations for the final pipeline cell.
CSV_FILE_PATH = '/kaggle/input/rain-data-master-8k/rain_data_mechanical_master.csv'
AUDIO_FOLDER_PATH = '/kaggle/input/rain-data-master-8k/rainfall_sound_8k/'

# Output locations: one .npy file per rainfall reading plus a single feature CSV.
OUTPUT_NUMPY_FOLDER = '/kaggle/working/combined_audio_numpy/'
FINAL_CSV_OUTPUT_PATH = '/kaggle/working/preprocessed_rainfall_features.csv'

# exist_ok=True creates the folder when it is missing and is a no-op when it is
# already there, without a separate existence check that could go stale
# between the check and the creation.
os.makedirs(OUTPUT_NUMPY_FOLDER, exist_ok=True)

# Reload the rainfall table and parse 'Time' into datetimes.
df_rain = pd.read_csv(CSV_FILE_PATH)
df_rain['Time'] = pd.to_datetime(df_rain['Time'])

def convert_rainfall_to_mm(value):
    """Convert one raw rainfall reading to millimetres.

    ``value`` is turned into text first, so plain numbers and strings are both
    accepted. A micrometre suffix divides the number by 1000, an ``mm`` suffix
    is stripped, and anything else is parsed as a bare number.

    The micrometre suffix may be written with the micro sign (U+00B5) or with
    the Greek small mu (U+03BC). The two look identical but are different
    characters, so the Greek form is folded onto the micro sign before
    matching; otherwise that spelling would silently become a missing value.

    Args:
        value: The raw reading, e.g. ``'2.5 mm'``, a micrometre string, or a
            number.

    Returns:
        The reading as a float in millimetres, or None when the text cannot be
        parsed as a number.
    """
    try:
        value_str = str(value).replace('\u03bc', '\u00b5')
        if '\u00b5m' in value_str:
            return float(value_str.replace('\u00b5m', '').strip()) / 1000
        elif 'mm' in value_str:
            return float(value_str.replace('mm', '').strip())
        else:
            return float(value_str)
    except (ValueError, TypeError):
        return None
# Normalise every raw reading through convert_rainfall_to_mm; this column is the
# target value stored with each feature row further down.
df_rain['rainfall_cleaned'] = df_rain['device_frmpayload_data_rainfall'].apply(convert_rainfall_to_mm)

# Sorted names of every .wav entry in the audio folder.
audio_files = sorted([f for f in os.listdir(AUDIO_FOLDER_PATH) if f.endswith('.wav')])
def get_timestamp_from_filename(filename):
    """Parse the datetime encoded in an audio file name.

    Removes every '.wav' occurrence from ``filename`` and reads the remainder
    as year_month_day_hour_minute_second_microsecond.

    Returns:
        The parsed ``datetime.datetime``. ``strptime`` raises ValueError when
        the name does not match that layout.
    """
    stem = filename.replace('.wav', '')
    parsed = datetime.datetime.strptime(stem, '%Y_%m_%d_%H_%M_%S_%f')
    return parsed

# One row per recording, indexed by the timestamp parsed from its file name.
df_audio = (
    pd.DataFrame({'audio_filename': audio_files})
    .assign(timestamp=lambda frame: frame['audio_filename'].apply(get_timestamp_from_filename))
    .set_index('timestamp')
)

# Take the sampling rate from the first recording; sr=None asks librosa not to
# resample, so this is the rate stored in the file.
reference_audio_path = os.path.join(AUDIO_FOLDER_PATH, audio_files[0])
_, sampling_rate = librosa.load(reference_audio_path, sr=None)


# For each rainfall reading with recordings in the preceding three minutes:
# join those recordings, save the waveform as .npy, and collect one row of
# summary features together with the reading's rainfall value.
all_features_list = []
for index, row in tqdm(df_rain.iterrows(), total=df_rain.shape[0]):
    target_timestamp = row['Time']
    target_rainfall = row['rainfall_cleaned']
    end_time = target_timestamp
    start_time = end_time - datetime.timedelta(minutes=3)

    not_too_old = df_audio.index >= start_time
    not_too_new = df_audio.index <= end_time
    relevant_audio_df = df_audio[not_too_old & not_too_new]
    if relevant_audio_df.empty:
        continue

    # Load the window's recordings in chronological order.
    files_to_combine = relevant_audio_df.sort_index()['audio_filename'].tolist()
    list_of_audio_arrays = []
    for file_name in files_to_combine:
        audio_segment = librosa.load(os.path.join(AUDIO_FOLDER_PATH, file_name), sr=sampling_rate)[0]
        list_of_audio_arrays.append(audio_segment)
    if not list_of_audio_arrays:
        continue

    combined_audio = np.concatenate(list_of_audio_arrays)

    # The file name carries the reading's time down to the second.
    numpy_filename = f"audio_{target_timestamp.strftime('%Y%m%d_%H%M%S')}.npy"
    output_path = os.path.join(OUTPUT_NUMPY_FOLDER, numpy_filename)
    np.save(output_path, combined_audio)

    # Each feature is reduced to one number by averaging over everything that
    # librosa returns for the whole window.
    feature_dict = {}
    feature_dict['zcr_mean'] = np.mean(librosa.feature.zero_crossing_rate(y=combined_audio))
    feature_dict['rms_mean'] = np.mean(librosa.feature.rms(y=combined_audio))
    feature_dict['spectral_centroid_mean'] = np.mean(librosa.feature.spectral_centroid(y=combined_audio, sr=sampling_rate))
    feature_dict['mfcc_mean'] = np.mean(librosa.feature.mfcc(y=combined_audio, sr=sampling_rate, n_mfcc=13))
    feature_dict['chroma_mean'] = np.mean(librosa.feature.chroma_stft(y=combined_audio, sr=sampling_rate))
    feature_dict['target_rainfall'] = target_rainfall
    all_features_list.append(feature_dict)

# Build the table once from the collected rows and write it out.
df_preprocessed = pd.DataFrame(all_features_list)
df_preprocessed.to_csv(FINAL_CSV_OUTPUT_PATH, index=False)

print("\nPreprocessing and file generation complete.")
print(f"All .npy files saved to: {OUTPUT_NUMPY_FOLDER}")
print(f"Feature CSV file saved to: {FINAL_CSV_OUTPUT_PATH}")