In [10]:
import scipy.io
import pandas as pd
from scipy.io import loadmat

# Read the MATLAB file; loadmat returns a dict keyed by variable name
data = loadmat(
    '/Users/ananyayadav/phase_1_kafka_setup/received_traffic_dataset.mat'
)

# Pull the two variables of interest out of the loaded dict
tra_X_tr, tra_Y_tr = (data[key] for key in ('tra_X_tr', 'tra_Y_tr'))

# Convert to DataFrames
def convert_to_dataframe(mat_data, var_name):
    """Convert a variable loaded from the .mat file into a DataFrame.

    Args:
        mat_data: The loaded variable. For ``'tra_X_tr'`` the elements of
            ``mat_data[0]`` are iterated and each one is flattened into a
            single DataFrame row; elements exposing ``toarray`` (e.g. sparse
            matrices) are densified first. For ``'tra_Y_tr'`` the data is
            handed to ``pd.DataFrame`` unchanged.
        var_name: Name of the variable, which selects the conversion above.

    Returns:
        A ``pandas.DataFrame`` built from ``mat_data``.

    Raises:
        ValueError: If ``var_name`` is not one of the supported names.
            (Previously this case failed with an ``UnboundLocalError``.)
    """
    if var_name == 'tra_X_tr':
        rows = [
            cell.toarray().flatten() if hasattr(cell, 'toarray') else cell.flatten()
            for cell in mat_data[0]
        ]
        return pd.DataFrame(rows)
    if var_name == 'tra_Y_tr':
        return pd.DataFrame(mat_data)
    raise ValueError(
        f"Unsupported var_name {var_name!r}; expected 'tra_X_tr' or 'tra_Y_tr'"
    )

df_tra_X_tr = convert_to_dataframe(tra_X_tr, 'tra_X_tr')
df_tra_Y_tr = convert_to_dataframe(tra_Y_tr, 'tra_Y_tr')

# The data is assumed to be sampled in 15-minute intervals, so the calendar
# features below are derived purely from each sample's position.
INTERVALS_PER_HOUR = 4
INTERVALS_PER_DAY = 96

# Create a simple positional index: one entry per column of df_tra_Y_tr
time_index_tr = pd.RangeIndex(start=0, stop=df_tra_Y_tr.shape[1], step=1)

# Transpose so that each row is a time step and each column a location
df_tra_Y_tr_time = pd.DataFrame(
    df_tra_Y_tr.T.values,
    index=time_index_tr,
    columns=[f'Location_{i}' for i in range(df_tra_Y_tr.shape[0])],
)

# Create time-based features. Position 0 is treated as hour 0 of day 0; the
# quarter/month/year lengths are fixed approximations (91/30/365 days).
df_tra_Y_tr_time['hour'] = time_index_tr % INTERVALS_PER_DAY // INTERVALS_PER_HOUR
df_tra_Y_tr_time['day_of_week'] = time_index_tr // INTERVALS_PER_DAY % 7
df_tra_Y_tr_time['quarter'] = time_index_tr // (INTERVALS_PER_DAY * 91) % 4 + 1
df_tra_Y_tr_time['month'] = time_index_tr // (INTERVALS_PER_DAY * 30) % 12 + 1
df_tra_Y_tr_time['year'] = time_index_tr // (INTERVALS_PER_DAY * 365) + 1

# Create more granular time periods. Left-closed bins give four equal
# six-hour periods: Night 0-5, Morning 6-11, Afternoon 12-17, Evening 18-23.
# (Right-closed bins starting at -1 would put hour 6 in Night, hour 12 in
# Morning and hour 18 in Afternoon.)
df_tra_Y_tr_time['time_of_day'] = pd.cut(
    df_tra_Y_tr_time['hour'],
    bins=[0, 6, 12, 18, 24],
    right=False,
    labels=['Night', 'Morning', 'Afternoon', 'Evening'],
)

# Create a binary feature for weekdays and weekends (days 5 and 6 -> 1)
df_tra_Y_tr_time['is_weekend'] = (df_tra_Y_tr_time['day_of_week'] >= 5).astype(int)

# Create rolling averages of Location_0 over 3, 6 and 12 consecutive samples
for window in (3, 6, 12):
    df_tra_Y_tr_time[f'rolling_mean_{window}'] = (
        df_tra_Y_tr_time['Location_0'].rolling(window=window).mean()
    )

# Drop the leading rows whose rolling windows are incomplete.
# NOTE: dropna() removes rows with NaN in ANY column, so rows with missing
# location readings are dropped here as well.
df_tra_Y_tr_time.dropna(inplace=True)

# Save the feature-engineered data to a CSV file
df_tra_Y_tr_time.to_csv('feature_engineered_data.csv')

# Display the DataFrame with new features
df_tra_Y_tr_time.head()

Unnamed: 0,Location_0,Location_1,Location_2,Location_3,Location_4,Location_5,Location_6,Location_7,Location_8,Location_9,...,hour,day_of_week,quarter,month,year,time_of_day,is_weekend,rolling_mean_3,rolling_mean_6,rolling_mean_12
11,0.16114,0.171882,0.114433,0.107426,0.085941,0.080336,0.099019,0.077067,0.244278,0.3078,...,2,0,1,1,1,Night,0,0.141523,0.124864,0.089911
12,0.155068,0.157403,0.13405,0.104157,0.114433,0.090612,0.120972,0.104157,0.260626,0.291453,...,3,0,1,1,1,Night,0,0.151487,0.135996,0.098591
13,0.168613,0.168146,0.125642,0.116768,0.099019,0.091079,0.106959,0.088744,0.228865,0.276506,...,3,0,1,1,1,Night,0,0.161607,0.145181,0.108672
14,0.148062,0.191032,0.124708,0.124241,0.106025,0.101822,0.126109,0.104157,0.249416,0.319944,...,3,0,1,1,1,Night,0,0.157247,0.149385,0.117274
15,0.193835,0.200841,0.121439,0.128445,0.136852,0.141523,0.171882,0.11163,0.295656,0.352639,...,3,0,1,1,1,Night,0,0.17017,0.160828,0.128445
