In [37]:
from torch.utils.data import Dataset
import torch
import glob
import pandas as pd
pd.options.mode.chained_assignment = None
from datetime import datetime
import os
import math
from tqdm import tqdm
import numpy as np
import random

from scipy import signal
from scipy import stats
import json
import matplotlib.pyplot as plt
from sklearn.svm import SVR


In [25]:
class SHMDataset(Dataset):

    def __init__(self, data_path, isPreTrain, isFineTuning, isEvaluation):
        if isPreTrain:
            self.start_time, self.end_time = "05/12/2021 00:00", "05/12/2021 23:59" #"05/12/2021 00:00", "05/12/2021 23:59"
            self.datasetSize = 500000
        elif isFineTuning:
            self.start_time, self.end_time = "06/12/2021 00:06", "06/12/2021 00:10" #"06/12/2021 00:00", "06/12/2021 11:59"
            self.datasetSize = 200000
        elif isEvaluation:
            self.start_time, self.end_time = "06/12/2021 00:11", "06/12/2021 00:15" #"06/12/2021 12:00", "06/12/2021 17:59"
            self.datasetSize = 50000
        else:
            self.start_time, self.end_time = "06/12/2021 00:21", "06/12/2021 00:25" #"06/12/2021 17:59", "06/12/2021 23:59"
            self.datasetSize = 50000
        self.path = data_path #'/home/yhbedoya/Repositories/SHM-MAE/traffic/20211205/'
        self.noisySensors = ["C12.1.4", "C17.1.2"]
        self.minDuration = 0.25
        self.data = self._readCSV()
        self.distanceToSensor = self._readDistanceToSensor()
        self.sensorVarDict = self._calculateThresholds(isPreTrain=isPreTrain)
        self.pesaDataDf = self._readLabels()
        self.labelsDf, self.groupsDf = self._labelAssignment()
        self.sampleRate = 100
        self.frameLength = 198
        self.stepLength = 58
        self.windowLength= 6000
        self.windowStep = 1500
        self.data, self.limits, self.totalWindows = self._partitioner()

    def __len__(self):
        return self.totalWindows

    def __getitem__(self, index):
        
        return self.limits[index]

    def _readCSV(self):
        print(f'reading CSV files')
        start = datetime.strptime(self.start_time, '%d/%m/%Y %H:%M')
        end = datetime.strptime(self.end_time, '%d/%m/%Y %H:%M')

        ldf = list()
        for p in tqdm(glob.glob(self.path + "*.csv")):
            name = os.path.split(p)[-1]
            nstr = datetime.strptime(name, 'traffic_%Y%m%dH%H%M%S.csv')
            if start <= nstr < end:
                df_tmp = pd.read_csv(p)
                c_drop = set(df_tmp.columns) - set(["sens_pos", "x", "y", "z", "ts"])
                if len(c_drop) > 0:
                    df_tmp.drop(columns=list(c_drop), inplace=True)
                ldf.append(df_tmp)
        df = pd.concat(ldf).sort_values(by=['sens_pos', 'ts'])
        df.reset_index(inplace=True, drop=True)

        #df = df[df['sens_pos'].isin(self.sensors)]
        df['ts'] = pd.to_datetime(df['ts'], unit='ms')
        df['Time'] = df['ts'].dt.strftime('%Y-%m-%d %H:%M:00')
        df["xN"] = df["x"]-np.mean(df["x"])
        df["yN"] = df["y"]-np.mean(df["y"])
        df["zN"] = df["z"]-np.mean(df["z"])
        df["vars"] = df["zN"].rolling(window=100).var().fillna(0)
        df["vars"] = df["vars"].rolling(window=100).mean().fillna(0)
        print(f'finish reading process')
        return df

    def _readDistanceToSensor(self):
        distanceToSensor = {}
        with open('/home/yhbedoya/Repositories/SHM-MAE/LabelGeneration/distanceToSensor.csv') as f: #/home/yvelez/sacertis/distanceToSensor.csv
            for line in f.readlines():
                sensor, distance = line.replace("'", "").replace("\n","").split(",")
                distanceToSensor[sensor] = float(distance)
        return distanceToSensor

    def _readLabels(self):
        start_time = datetime.strptime(self.start_time, '%d/%m/%Y %H:%M')
        end_time = datetime.strptime(self.end_time, '%d/%m/%Y %H:%M')
        pesaDataDf = pd.read_csv("/home/yhbedoya/Repositories/SHM-MAE/dati_pese_dinamiche/dati 2021-12-04_2021-12-12 pesa km 104,450.csv", sep=";", index_col=0) #/home/yvelez/sacertis/dati_pese_dinamiche/dati 2021-12-04_2021-12-12 pesa km 104,450.csv
        pesaDataDf = pesaDataDf[["Id", "StartTimeStr", "ClassId", "GrossWeight", "Velocity", "VelocityUnit"]]
        pesaDataDf["Time"] = pd.to_datetime(pesaDataDf["StartTimeStr"])
        pesaDataDf["Time"] = pesaDataDf["Time"].dt.strftime('%Y-%d-%m %H:%M:00')
        pesaDataDf["Time"] = pd.to_datetime(pesaDataDf["Time"]) + pd.to_timedelta(-1,'H')
        pesaDataDf.sort_values(by="Id", inplace=True)
        pesaDataDf = pesaDataDf[(pesaDataDf["Time"]>=start_time) & (pesaDataDf["Time"]<=end_time)]
        pesaDataDf.reset_index(drop=True, inplace=True)
        
        return pesaDataDf

    def groupsGenerator(self, sensorData, minTime, maxTime, threshold):
        slice = sensorData[(sensorData["ts"]>= minTime) & (sensorData["ts"]<= maxTime)]
        
        slice["outlier"] = slice["vars"].apply(lambda x: x>=threshold)
        outliers = slice[slice["outlier"] == True].reset_index().to_dict("records")

        if len(outliers) == 0:
            return pd.DataFrame()

        last = minTime
        timeStart = outliers[0]["ts"]
        flag = True
        groups = []
        groupTimes = []
        groupIndexes = []
        groupVars = []
        label = np.nan
        groupId = 0
        for outlier in outliers:
            if ((outlier["ts"] - last).total_seconds() < 2) or flag:
                groupTimes.append(outlier["ts"])
                groupVars.append(outlier["vars"])
                flag = False
                timeEnd = outlier["ts"]
            else:
                start, end = min(groupTimes), max(groupTimes)
                groupSignal = sensorData[(sensorData["ts"]>= start) & (sensorData["ts"]<= end)]["zN"]
                signalPower = np.sqrt(np.mean(np.array(groupSignal)**2))**2 
                pointMaxVar = groupTimes[np.argmax(groupVars)]
                if ((end - start).total_seconds() > self.minDuration):
                    label = {"groupId": groupId,"start": start, "end": end, "signalPower": signalPower, 
                    "pointMaxVar": pointMaxVar}
                    groups.append(label)
                groupId += 1
                groupTimes = [outlier["ts"],]
                groupVars = [outlier["vars"],]
            last = outlier["ts"]

        start, end = min(groupTimes), max(groupTimes)
        groupSignal = sensorData[(sensorData["ts"]>= start) & (sensorData["ts"]<= end)]["zN"]
        signalPower = np.sqrt(np.mean(np.array(groupSignal)**2))**2 
        pointMaxVar = groupTimes[np.argmax(groupVars)]
        if ((end-start).total_seconds() > self.minDuration):
            label = {"groupId": groupId,"start": start, "end": end, "signalPower": signalPower, 
            "pointMaxVar": pointMaxVar}
            groups.append(label)

        if len(groups)>0:
            groupsDf = pd.DataFrame(groups).sort_values("signalPower", ascending=False)
        else:
            groupsDf = pd.DataFrame()

        return groupsDf

    def _labelAssignment(self,):
        sensorLabelsDfList = []
        groupsDfList = []

        sensorsList = self.data["sens_pos"].unique()
        for sensor in sensorsList:
            if (sensor in self.noisySensors) or (sensor not in self.distanceToSensor.keys()) or (sensor not in self.sensorVarDict.keys()):
                continue
            assignedLabels = {}
            assignedLabels2 = {}
            sensorLabelsDf = self.pesaDataDf.copy(deep=True)
            sensorLabelsDf["EstimatedTime"] = sensorLabelsDf["Time"] + pd.to_timedelta((float(self.distanceToSensor[sensor])/(sensorLabelsDf["Velocity"]/3.6))-20,'S')
            sensorLabelsDf["MaxTime"] = sensorLabelsDf["EstimatedTime"] + pd.to_timedelta(120,'S')
            minTime = sensorLabelsDf["EstimatedTime"].min()
            maxTime = sensorLabelsDf["MaxTime"].max()
            sensorLabelsDf.sort_values("GrossWeight", inplace=True, ascending=False)

            sensorData = self.data[self.data["sens_pos"]==sensor]
            threshold = self.sensorVarDict[sensor]["threshold"]

            groupsDf = self.groupsGenerator(sensorData, minTime, maxTime, threshold)
            print(f"Total groups found for sensor {sensor}: {groupsDf.shape[0]}")
            if groupsDf.empty:
                continue

            availableGroupsDf = groupsDf.copy(deep=True)
            for index, row in sensorLabelsDf.iterrows():
                if row["Id"] in assignedLabels:
                    continue
                
                if availableGroupsDf.empty:
                    break

                candidatesDf = availableGroupsDf[(row["EstimatedTime"] <= availableGroupsDf["pointMaxVar"]) & (availableGroupsDf["pointMaxVar"] <= row["MaxTime"])]
                if not candidatesDf.empty:
                    assignedLabels[row["Id"]] = candidatesDf.iloc[0].to_dict()
                    assignedLabels2[candidatesDf.iloc[0]["groupId"]] = row["Id"]
                    availableGroupsDf.drop(candidatesDf.index[0], inplace=True)
            
            sensorLabelsDf["sens_pos"] = sensor
            sensorLabelsDf["labels"] = sensorLabelsDf.apply(lambda row: assignedLabels[row["Id"]] if row["Id"] in assignedLabels else np.nan, axis=1)
            sensorLabelsDf.sort_values("Id", inplace=True)
            groupsDf["sens_pos"] = sensor
            groupsDf["labels"] = groupsDf.apply(lambda row: assignedLabels2[row["groupId"]] if row["groupId"] in assignedLabels2 else np.nan, axis=1)
            groupsDf.sort_values("groupId", inplace=True)
            groupsDf.dropna(inplace=True)

            sensorLabelsDfList.append(sensorLabelsDf)
            groupsDfList.append(groupsDf)

        labelsDf = pd.concat(sensorLabelsDfList)
        groupsDf =  pd.concat(groupsDfList)

        print(f"Total labels: {len(labelsDf)}")
        totnan = labelsDf["labels"].isna().sum()
        print(f"Total nan labels: {totnan}")
        print(f"Proportion of match labels: {1-(totnan/len(labelsDf))}")

        return labelsDf, groupsDf

    def _labelAssigner(self, timeSlice, sensor):
        start, end = timeSlice.min(), timeSlice.max()
        vehiclesInSliceDf = self.groupsDf[(self.groupsDf["pointMaxVar"]>=start) &
        (self.groupsDf["pointMaxVar"]<=end) &
        (self.groupsDf["sens_pos"]==sensor)]
        return vehiclesInSliceDf.shape[0]

    def _partitioner(self):
        sensors = self.data['sens_pos'].unique().tolist()
        print(f'start partitioner')
        partitions = {}
        cumulatedWindows = 0
        limits = dict()
        print(f'Generating windows')
        for sensor in tqdm(sensors):
            if (sensor in self.noisySensors) or (sensor not in self.distanceToSensor.keys()):
                continue
            sensorData = self.data[self.data['sens_pos']==sensor]
            totalFrames = sensorData.shape[0]
            totalWindows = math.ceil((totalFrames-self.windowLength)/self.windowStep)
            start = cumulatedWindows
            cumulatedWindows += totalWindows
            end = cumulatedWindows
            indexStart = sensorData.index[0]
            partitions[sensor]= (start, end, indexStart)

        timeData = torch.tensor(self.data["z"].values, dtype=torch.float64)
        timestamps = self.data["ts"]
        cummulator = -1

        print(f'Defining useful windows limits')
        indexes = list(range(0, cumulatedWindows))
        random.shuffle(indexes)
        
        for index in tqdm(indexes):
            if cummulator >= self.datasetSize:
                break
            for sensor,v in partitions.items():
                if index in range(v[0], v[1]):
                    start = v[2]+(index-v[0])*self.windowStep
                    timeSlice = timestamps[start: start+self.windowLength]
                    label = self._labelAssigner(timeSlice, sensor)
                    signalPower = self.power(timeData[start: start+self.windowLength])

                    if (signalPower>1.25*10**-6) or (label>0):
                        cummulator += 1
                        limits[cummulator] = (start, start+self.windowLength, label, (timestamps[start], timestamps[start+self.windowLength]), sensor)

                    break
        print(f'Total windows in dataset: {cummulator}')
        return timeData, limits, cummulator

    def _transformation(self, slice):
        sliceN = slice-torch.mean(slice)
        frequencies, times, spectrogram = signal.spectrogram(sliceN,self.sampleRate,nfft=self.frameLength,noverlap=(self.frameLength - self.stepLength), nperseg=self.frameLength,mode='psd')

        return frequencies, times, np.log10(spectrogram)
    
    def _normalizer(self, spectrogram):
        spectrogramNorm = (spectrogram - self.min) / (self.max - self.min)
        return spectrogramNorm

    def power(self, slice):
        return np.sqrt(np.mean(np.array(slice)**2))**2

    def interquartileRule(self, data):
        # Calculate the first quartile (Q1)
        Q1 = np.percentile(data, 25)

        # Calculate the third quartile (Q3)
        Q3 = np.percentile(data, 75)

        # Calculate the interquartile range (IQR)
        IQR = Q3 - Q1

        # Define the upper and lower bounds
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        return lower_bound, upper_bound

    def _calculateThresholds(self, isPreTrain):
        if isPreTrain:
            print(f'Start creating thresholds')
            varDf = self.data[["sens_pos", "vars"]]
            sensorsList = self.data["sens_pos"].unique()
            sensorVarDict = {}
            for sensor in tqdm(sensorsList):
                if (sensor in self.noisySensors) or (sensor not in self.distanceToSensor.keys()):
                    continue
                sensorVarDf = varDf[varDf["sens_pos"]==sensor]
                lower_bound, upper_bound = self.interquartileRule(sensorVarDf["vars"])
                sensorVarDf = sensorVarDf[(sensorVarDf["vars"]>lower_bound) & (sensorVarDf["vars"]<upper_bound)]
                mean = sensorVarDf["vars"].mean()
                std = sensorVarDf["vars"].std()
                threshold = mean + 3.5 * std
                sensorVarDict[sensor] = {"mean": mean, "std": std, "threshold": threshold}
                #with open("/content/drive/MyDrive/Data Science and Engineering - PoliTo2/Thesis/models/MAE-SHM/output_dir_TE/sensorVarDict.json", "w") as f:
                #    # Write the dict to the file
                #    json.dump(sensorVarDict, f)
            print(f'Finish thresholds creation')
        else:
            print(f'Start reading thresholds')
            with open("/home/yhbedoya/Repositories/SHM-MAE/util/DataLoadersSacertisLabels/sensorVarDict.json", "r") as f: #/home/yvelez/sacertis/sensorVarDict.json
                # Load the dict from the file
                sensorVarDict = json.load(f)

            print(f'Finish thresholds reading')

        return sensorVarDict


In [26]:
def groupAssigner(row, groupsDict):
    sensor = row["sensor"]
    for k, v in groupsDict.items():
        if sensor in v:
            return int(k)
        
def featureExtraction(window, sensNumber):
    statistics = dict()
    axis = ["xN", "yN", "zN"]
    for ax in axis:
        axSens = ax + sensNumber
        serie = window[ax]

        axStatistics={
            axSens+"mean": np.mean(serie),
            axSens+"std": np.std(serie),
            axSens+"min": np.min(serie),
            axSens+"max": np.max(serie),
            axSens+"med": np.median(serie),
            axSens+"kurt": stats.kurtosis(serie),
            axSens+"skew": stats.skew(serie),
            axSens+"rms": np.sqrt(np.mean(serie**2)),
            axSens+"sabs": np.sum(np.abs(serie)),
            axSens+"eom": serie[serie>np.mean(serie)].sum(),
            axSens+"ener": np.sqrt(np.mean(np.array(serie)**2))**2,
            axSens+"mad": np.median(np.absolute(serie - np.median(serie)))
        }

        statistics = {**statistics, **axStatistics}

    return statistics

def getDataset(training=False, evaluation=False, test=False):

    data_path = "/home/yhbedoya/Repositories/SHM-MAE/traffic/20211206/"
    dataGenerator = SHMDataset(data_path=data_path ,isPreTrain=False, isFineTuning=training, isEvaluation=evaluation)

    windows = {"indexStart":[],
        "indexEnd":[],
        "times": [],
        "label":[],
        "sensor":[]}

    for i in tqdm(range(len(dataGenerator))):
        start, end, label, timeSlice, sensor = dataGenerator[i]
        windows["indexStart"].append(start)
        windows["indexEnd"].append(end)
        windows["label"].append(label)
        windows["times"].append(timeSlice)
        windows["sensor"].append(sensor)

    df = dataGenerator._readCSV()
    df = df[["sens_pos", "ts", "xN", "yN", "zN"]]

    sensors = df["sens_pos"].unique()

    groupsDict = {}
    for sensor in list(sensors):
        section = sensor.split(".")[0]
        group = section[1:]

        if section[0] == "P":
            continue  
        if int(group) in groupsDict.keys():
            groupsDict[int(group)].append(sensor)
        else:
            groupsDict[int(group)] = [sensor]

    toDelete = list()
    for k, v in groupsDict.items():
        if len(v) != 6:
            toDelete.append(k)

    for k in toDelete:
        del groupsDict[k]
            
    windowsDf = pd.DataFrame(windows)
            
    windowsDf["group"] = windowsDf.apply(groupAssigner, groupsDict = groupsDict, axis=1)
    windowsDf.dropna(inplace=True)

    extractedFeaturesList = []
    group = 2
    groupDataDf = df[df["sens_pos"].isin(groupsDict[group])]
    groupWindowsDf = windowsDf[windowsDf["group"]==group]
    for index, row in tqdm(groupWindowsDf.iterrows()):
        statistics = dict()
        section = row["sensor"].split(".")[0]
        group = section[1:]
        
        if group =="8":
            continue
        window = groupDataDf[(groupDataDf["ts"] >= row["times"][0]) & (groupDataDf["ts"] < row["times"][1])]
        for sensor in groupsDict[int(group)]:
            sensorWindow =  window[window["sens_pos"]==row["sensor"]]
            sensorStatistics = featureExtraction(sensorWindow, sensor[-3:])
            statistics = {**statistics, **sensorStatistics}
        statistics["label"] = row["label"]

        extractedFeaturesList.append(statistics)

    dataDf = pd.DataFrame(extractedFeaturesList)
    return dataDf


In [27]:
trainDf = getDataset(training=True, evaluation=False, test=False)
evalDf = getDataset(training=False, evaluation=True, test=False)
testDf = getDataset(training=False, evaluation=False, test=True)

reading CSV files


100%|██████████| 67/67 [00:05<00:00, 12.94it/s]


finish reading process
Start reading thresholds
Finish thresholds reading
Total groups found for sensor C1.1.1: 1
Total groups found for sensor C1.1.2: 8
Total groups found for sensor C1.1.3: 1
Total groups found for sensor C1.2.1: 2
Total groups found for sensor C1.2.2: 1
Total groups found for sensor C1.2.3: 2
Total groups found for sensor C10.1.2: 0
Total groups found for sensor C10.1.3: 8
Total groups found for sensor C10.1.4: 3
Total groups found for sensor C10.2.2: 3
Total groups found for sensor C10.2.3: 2
Total groups found for sensor C10.2.4: 1
Total groups found for sensor C11.1.2: 1
Total groups found for sensor C11.1.3: 1
Total groups found for sensor C11.1.4: 1
Total groups found for sensor C11.2.2: 0
Total groups found for sensor C11.2.3: 2
Total groups found for sensor C11.2.4: 1
Total groups found for sensor C12.1.2: 2
Total groups found for sensor C12.1.3: 3
Total groups found for sensor C12.2.2: 2
Total groups found for sensor C12.2.3: 1
Total groups found for sensor 

100%|██████████| 141/141 [00:30<00:00,  4.70it/s]


Defining useful windows limits


100%|██████████| 1155/1155 [00:02<00:00, 470.06it/s]


Total windows in dataset: 1154


100%|██████████| 1154/1154 [00:00<00:00, 573554.55it/s]


reading CSV files


100%|██████████| 67/67 [00:05<00:00, 13.26it/s]


finish reading process


66it [00:07,  8.42it/s]


reading CSV files


100%|██████████| 67/67 [00:06<00:00, 10.12it/s]


finish reading process
Start reading thresholds
Finish thresholds reading
Total groups found for sensor C1.1.1: 0
Total groups found for sensor C1.1.2: 6
Total groups found for sensor C1.1.3: 0
Total groups found for sensor C1.2.1: 0
Total groups found for sensor C1.2.2: 0
Total groups found for sensor C1.2.3: 1
Total groups found for sensor C10.1.2: 0
Total groups found for sensor C10.1.3: 0
Total groups found for sensor C10.1.4: 0
Total groups found for sensor C10.2.2: 0
Total groups found for sensor C10.2.3: 0
Total groups found for sensor C10.2.4: 0
Total groups found for sensor C11.1.2: 0
Total groups found for sensor C11.1.3: 0
Total groups found for sensor C11.1.4: 0
Total groups found for sensor C11.2.2: 0
Total groups found for sensor C11.2.3: 0
Total groups found for sensor C11.2.4: 0
Total groups found for sensor C12.1.2: 0
Total groups found for sensor C12.1.3: 1
Total groups found for sensor C12.2.2: 3
Total groups found for sensor C12.2.3: 0
Total groups found for sensor 

100%|██████████| 141/141 [00:40<00:00,  3.50it/s]


Defining useful windows limits


100%|██████████| 1575/1575 [00:03<00:00, 456.73it/s]


Total windows in dataset: 1574


100%|██████████| 1574/1574 [00:00<00:00, 450760.24it/s]


reading CSV files


100%|██████████| 67/67 [00:06<00:00, 10.20it/s]


finish reading process


90it [00:10,  8.42it/s]


reading CSV files


100%|██████████| 67/67 [00:05<00:00, 13.27it/s]


finish reading process
Start reading thresholds
Finish thresholds reading
Total groups found for sensor C1.1.1: 1
Total groups found for sensor C1.1.2: 13
Total groups found for sensor C1.1.3: 0
Total groups found for sensor C1.2.1: 1
Total groups found for sensor C1.2.2: 1
Total groups found for sensor C1.2.3: 1
Total groups found for sensor C10.1.2: 2
Total groups found for sensor C10.1.3: 5
Total groups found for sensor C10.1.4: 1
Total groups found for sensor C10.2.2: 2
Total groups found for sensor C10.2.3: 1
Total groups found for sensor C10.2.4: 1
Total groups found for sensor C11.1.2: 1
Total groups found for sensor C11.1.3: 1
Total groups found for sensor C11.1.4: 1
Total groups found for sensor C11.2.2: 0
Total groups found for sensor C11.2.3: 1
Total groups found for sensor C11.2.4: 1
Total groups found for sensor C12.1.2: 1
Total groups found for sensor C12.1.3: 7
Total groups found for sensor C12.2.2: 5
Total groups found for sensor C12.2.3: 1
Total groups found for sensor

100%|██████████| 141/141 [00:30<00:00,  4.68it/s]


Defining useful windows limits


100%|██████████| 1155/1155 [00:02<00:00, 479.87it/s]


Total windows in dataset: 1154


100%|██████████| 1154/1154 [00:00<00:00, 345804.59it/s]


reading CSV files


100%|██████████| 67/67 [00:04<00:00, 13.44it/s]


finish reading process


66it [00:07,  8.32it/s]


In [21]:
trainDf

Unnamed: 0,xN1.2mean,xN1.2std,xN1.2min,xN1.2max,xN1.2med,xN1.2kurt,xN1.2skew,xN1.2rms,xN1.2sabs,xN1.2eom,...,zN2.4max,zN2.4med,zN2.4kurt,zN2.4skew,zN2.4rms,zN2.4sabs,zN2.4eom,zN2.4ener,zN2.4mad,label
0,-0.016306,0.000155,-0.017431,-0.015294,-0.016286,3.691886,-0.057042,0.016307,97.834603,-53.141577,...,-0.000710,-0.002083,-0.146670,-0.056111,0.002114,12.482324,-4.932020,0.000004,0.000229,0
1,0.000869,0.000150,0.000041,0.002177,0.000880,3.337144,0.137020,0.000882,5.216897,3.371172,...,0.017067,0.015541,-0.422639,-0.102635,0.015571,93.384389,47.663628,0.000242,0.000305,0
2,0.035912,0.000137,0.034908,0.036662,0.035900,2.657631,-0.046561,0.035912,215.470654,91.028549,...,-0.013680,-0.014825,-0.029816,0.068818,0.014794,88.748517,-42.504153,0.000219,0.000229,0
3,-0.014408,0.000153,-0.015523,-0.013540,-0.014379,2.856051,-0.195952,0.014408,86.445879,-45.417206,...,-0.002770,-0.004296,0.410406,0.012688,0.004287,25.640483,-11.446887,0.000018,0.000229,1
4,-0.014412,0.000154,-0.015447,-0.013540,-0.014379,2.649339,-0.148002,0.014413,86.474261,-44.045984,...,-0.002770,-0.004296,0.446935,0.013276,0.004312,25.793913,-13.163433,0.000019,0.000229,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,-0.026493,0.000165,-0.027578,-0.025289,-0.026510,1.897039,0.186729,0.026494,158.958887,-70.480968,...,0.013633,0.012107,0.087199,0.042352,0.012081,72.459921,37.346088,0.000146,0.000229,0
62,-0.016300,0.000209,-0.018804,-0.012624,-0.016286,44.261883,1.084675,0.016301,97.797447,-54.325014,...,0.000740,-0.002083,1.158004,0.320081,0.002084,12.239394,-4.996579,0.000004,0.000305,1
63,-0.014364,0.000151,-0.015676,-0.013463,-0.014379,4.192638,-0.349802,0.014365,86.186704,-37.142354,...,-0.002999,-0.004296,-0.012649,-0.022983,0.004304,25.752256,-11.247154,0.000019,0.000229,0
64,-0.014406,0.000152,-0.015447,-0.013540,-0.014379,2.273216,-0.108447,0.014406,86.433214,-45.369034,...,-0.002770,-0.004296,0.435489,-0.003216,0.004316,25.813750,-13.043657,0.000019,0.000229,1


In [34]:
from sklearn.feature_selection import SelectKBest, f_classif

# Separate your features and target variable
X = trainDf.drop('label', axis=1)
y = trainDf['label']

# Select the k best features using the F-test
selector = SelectKBest(score_func=f_classif, k=10) # choose the number of features you want to keep
X_new = selector.fit_transform(X, y)

# Get the indices of the selected features
mask = selector.get_support() # an array of booleans indicating which features are selected
selected_features = list(X.columns[mask]) + ["label"] # a list of the selected feature names


# Print the selected features
print(list(selected_features))

['zN1.2kurt', 'zN1.3kurt', 'xN1.4skew', 'zN1.4kurt', 'xN2.2skew', 'zN2.2kurt', 'xN2.3skew', 'zN2.3kurt', 'xN2.4skew', 'zN2.4kurt', 'label']


In [35]:
trainFDf = trainDf[selected_features]
X = trainFDf.drop('label', axis=1)
y = trainFDf['label']
evalFDf = evalDf[selected_features]
X_ev = evalFDf.drop('label', axis=1)
y_ev = evalFDf['label']
testFDf = testDf[selected_features]
X_test = testFDf.drop('label', axis=1)
y_test = testFDf['label']

In [38]:
svr_rbf = SVR(kernel='rbf', C=10, verbose=2)
svr_rbf.fit(X, y)

SVR(C=100, gamma=0.1)

In [39]:
# Make predictions on test data
y_pred_test = svr_rbf.predict(X_test)

# Calculate mean absolute error on test data
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
mae = mean_absolute_error(y_test, y_pred_test)
print("Mean Absolute Error on Test Data:", mae)

Mean Absolute Error on Test Data: 0.49396064959007535
