In [30]:
import pandas as pd
import numpy as np 
from scipy import stats
import warnings
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, GlobalAveragePooling1D, Dense, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns



In [24]:
# Read the pre-split train/test sensor CSVs (relative paths) in one pass.
train, test = map(
    pd.read_csv,
    (
        "../../data/combined/multi_class_kaggle/train.csv",
        "../../data/combined/multi_class_kaggle/test.csv",
    ),
)

In [25]:
# Preview the first rows to confirm which columns were loaded
# (sensor channels, GPS fields, the roadQuality label and source_folder).
train.head()

Unnamed: 0,timestamp,ax,ay,az,wx,wy,wz,latitude,longitude,speed,roadQuality,source_folder
0,1577219000.0,0.340007,0.17756,9.828766,0.000761,-0.001328,0.002563,-27.717841,-51.098865,0.009128,0,PVS 1
1,1577219000.0,0.323846,0.156611,9.771904,0.002492,0.000136,0.001365,-27.717841,-51.098865,0.009128,0,PVS 1
2,1577219000.0,0.342999,0.206889,9.848519,0.000628,0.001468,0.000566,-27.717841,-51.098865,0.009128,0,PVS 1
3,1577219000.0,0.344196,0.145837,9.815,-0.002036,0.003865,0.001631,-27.717841,-51.098865,0.009128,0,PVS 1
4,1577219000.0,0.313072,0.142246,9.84373,-0.003367,-0.001728,-0.001831,-27.717841,-51.098865,0.009128,0,PVS 1


In [26]:
# Class balance of the target: absolute row counts next to their share in percent.
road_quality_labels = train['roadQuality']
counts = road_quality_labels.value_counts()
percentages = road_quality_labels.value_counts(normalize=True) * 100

imbalance_df = pd.DataFrame({'Count': counts, 'Percentage': percentages})

print(imbalance_df)


             Count  Percentage
roadQuality                   
0            28089   40.822288
3            19802   28.778630
2            19721   28.660912
1             1196    1.738170


We can see that the number of samples labelled 1 is very low compared to the rest, so we will have to use class weights, i.e., penalise the model more heavily for misclassifying a class-1 sample than for misclassifying a sample of any other class.

We will use the features ax, ay, az, wx, wy, wz and speed to predict roadQuality.<br>
The model structure I would use is:<br>
Create windows of 20 rows with a step size of 10 (50% overlap),<br>
so the final shape of the data will be (samples, 20, 7).<br>
The convolution used here is 1D: the 7 features act as the channels (like RGB in an image), and the filter slides along the window's single (time) axis, so a 1D CNN is used.<br>

Converting the data into the desired windows:

In [27]:
def _window_stats(window):
    """Return the flattened per-channel statistics of one window.

    Args:
        window: 2-D float array of shape (window_size, n_channels).

    Returns:
        1-D array of length n_channels * 6, ordered channel by channel as
        [mean, std, rms, peak-to-peak, skewness, kurtosis] for channel 0,
        then the same six values for channel 1, and so on.
    """
    with warnings.catch_warnings():
        # SciPy raises a RuntimeWarning (precision loss) whenever a channel is
        # (almost) constant inside the window; it floods the output and the
        # affected values are sanitised right below, so it is silenced here.
        warnings.simplefilter("ignore", RuntimeWarning)
        # Skewness: differentiates between a dip and a bump.
        skewness = stats.skew(window, axis=0, bias=False)
        # Kurtosis: separates a smooth road with one big pothole from a
        # constantly bumpy road.
        kurt = stats.kurtosis(window, axis=0, fisher=True, bias=False)

    # A constant channel (e.g. speed while the vehicle is stationary) has no
    # defined skewness/kurtosis and SciPy reports NaN for it. NaNs would
    # silently poison any model fed with these features, so they are replaced
    # with the neutral value 0.
    skewness = np.nan_to_num(np.asarray(skewness, dtype=float), nan=0.0, posinf=0.0, neginf=0.0)
    kurt = np.nan_to_num(np.asarray(kurt, dtype=float), nan=0.0, posinf=0.0, neginf=0.0)

    per_stat = np.stack([
        window.mean(axis=0),                      # 1. Mean
        window.std(axis=0),                       # 2. Standard deviation (population, ddof=0)
        np.sqrt(np.mean(window ** 2, axis=0)),    # 3. RMS - measure of energy/intensity
        window.max(axis=0) - window.min(axis=0),  # 4. Peak-to-peak range - measure of impact
        skewness,                                 # 5. Skewness
        kurt,                                     # 6. Excess (Fisher) kurtosis
    ])
    # (6, n_channels) -> (n_channels, 6) -> flat, so each channel's six
    # statistics stay adjacent in the output row.
    return per_stat.T.ravel()


def create_windows(df, window_size=20, step_size=10):
    """
    Converts a dataframe into sliding windows for the Hybrid Model.

    Windows are taken over the rows of ``df`` in their existing order. Each
    window is labelled with the most frequent 'roadQuality' value inside it
    (ties resolve to the smallest label); windows whose majority label is -1
    (unlabeled data) are skipped.

    NOTE(review): windows are cut over the whole frame, so a window can span
    rows coming from different recordings (e.g. different 'source_folder'
    values) - confirm whether that is intended.

    Args:
        df: The dataframe containing the sensor columns
            ('ax', 'ay', 'az', 'wx', 'wy', 'wz', 'speed') and the
            'roadQuality' label. It should not contain nulls in those columns.
        window_size: Number of samples (rows) per window. The original notes
            assumed 20 samples @ 10Hz = 2 seconds - TODO confirm the rate.
        step_size: Stride between window starts (10 with window_size=20
            gives 50% overlap).

    Returns:
        X_raw: The raw sensor data for the CNN, shape (N, window_size, 7).
        X_stats: The statistical features for the Dense layer, shape (N, 42)
            -> 7 columns * 6 statistics; always finite.
        y: The labels, shape (N,).

    Raises:
        ValueError: If window_size or step_size is not a positive number.
    """
    if window_size < 1 or step_size < 1:
        raise ValueError("window_size and step_size must be positive integers")

    feature_cols = [
        'ax', 'ay', 'az',  # Accelerometer
        'wx', 'wy', 'wz',  # Gyroscope
        'speed',           # GPS speed
    ]
    label_col = 'roadQuality'
    # Mean, Std Dev, RMS, Peak-to-Peak, skewness, kurtosis.
    # Min, Max, energy, jerk and zero-crossing count (from the synopsis) are
    # deliberately left out; richer forms of them are expected to be learned
    # by the raw-data CNN branch.
    n_stats = 6

    # Leave pandas once up front: slicing NumPy arrays inside the loop is far
    # cheaper than building a DataFrame slice for every window.
    features = df[feature_cols].to_numpy(dtype=float)
    labels = df[label_col].to_numpy()

    X_raw_list = []
    X_stats_list = []
    y_list = []

    # Slide over the rows; the last window starts where a full window still fits.
    for start in range(0, len(df) - window_size + 1, step_size):
        stop = start + window_size

        # --- PART A: LABEL ---
        # Majority vote, e.g. 60% "Bad" / 40% "Good" -> "Bad". np.unique
        # returns sorted values, so argmax picks the smallest label on a tie
        # and always yields a scalar, independent of the SciPy version.
        values, value_counts = np.unique(labels[start:stop], return_counts=True)
        mode_label = values[np.argmax(value_counts)]

        # Skip windows that are majority "-1" (unlabeled data).
        if mode_label == -1:
            continue

        # --- PART B: RAW DATA (for the CNN) ---
        window = features[start:stop]
        X_raw_list.append(window)

        # --- PART C: STATISTICAL FEATURES (for the feature-engineering head) ---
        X_stats_list.append(_window_stats(window))
        y_list.append(mode_label)

    if not y_list:
        # Keep the documented ranks even when no window was produced, so
        # downstream shape checks and model inputs do not break.
        return (
            np.empty((0, window_size, len(feature_cols))),
            np.empty((0, n_stats * len(feature_cols))),
            np.empty((0,), dtype=labels.dtype),
        )

    # Convert to NumPy arrays (this copies the window views out of `features`).
    X_raw = np.array(X_raw_list)
    X_stats = np.array(X_stats_list)
    y = np.array(y_list)

    return X_raw, X_stats, y

# =============================================================================
# USAGE
# =============================================================================

def _print_window_summary(split_name, frame, X_raw, X_stats, y):
    """Print row/window counts and the array shapes produced for one data split."""
    divider = "-" * 30
    print(divider)
    print(split_name)
    print(f"Original Rows: {len(frame)}")
    print(f"Generated Windows: {len(y)}")
    print(divider)
    print(f"X_raw Shape (CNN Input):   {X_raw.shape}")    # Should be (N, 20, 7)
    print(f"X_stats Shape (MLP Input): {X_stats.shape}")  # Should be (N, 42) -> 7 cols * 6 stats
    print(f"y Shape (Labels):          {y.shape}")
    print("YAYYYY WINDOWING IS SUCCESSFULLLLLL !!!!!!!!!!")


# 1. Window the training split (null rows are dropped first), then
# 2. sanity-check the resulting shapes.
train = train.dropna()
X_train_raw, X_train_stats, y_train = create_windows(train, window_size=20, step_size=10)
_print_window_summary("train", train, X_train_raw, X_train_stats, y_train)

# Same two steps for the test split.
test = test.dropna()
X_test_raw, X_test_stats, y_test = create_windows(test, window_size=20, step_size=10)
_print_window_summary("test", test, X_test_raw, X_test_stats, y_test)




------------------------------
train
Original Rows: 68808
Generated Windows: 6879
------------------------------
X_raw Shape (CNN Input):   (6879, 20, 7)
X_stats Shape (MLP Input): (6879, 42)
y Shape (Labels):          (6879,)
YAYYYY WINDOWING IS SUCCESSFULLLLLL !!!!!!!!!!
------------------------------
test
Original Rows: 32082
Generated Windows: 3207
------------------------------
X_raw Shape (CNN Input):   (3207, 20, 7)
X_stats Shape (MLP Input): (3207, 42)
y Shape (Labels):          (3207,)
YAYYYY WINDOWING IS SUCCESSFULLLLLL !!!!!!!!!!
