In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

# Disable truncation when DataFrames are displayed: show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# BerlinV2X dataset, see https://github.com/fraunhoferhhi/BerlinV2X
sidelink = "/data22/bde/Data/BerlinV2X/sidelink_dataframe.parquet"
cellular = "/data22/bde/Data/BerlinV2X/cellular_dataframe.parquet"

# Only the cellular file is read in this cell; `sidelink` merely holds the other path.
df = pd.read_parquet(path=cellular)
# Print the full (non-abbreviated) per-column summary.
df.info(verbose=True)

In [None]:
# Keep only the downlink rows whose measured QoS is the datarate.
filtered_data = df.query("direction == 'downlink' & measured_qos == 'datarate'")

# Remove incomplete measurements without datarate.
# `subset` is passed as a list: older pandas releases only accept a list-like
# here and fail on a bare string label.
filtered_data = filtered_data.dropna(subset=['datarate'])

# Train and test split along operators: operator 1 -> train, operator 2 -> test.
# Each split is additionally broken down per device (pc1 .. pc4).
train_data = filtered_data.query("operator == 1")
train_data_1 = train_data.query("device == 'pc1'")
train_data_2 = train_data.query("device == 'pc2'")
train_data_3 = train_data.query("device == 'pc3'")
train_data_4 = train_data.query("device == 'pc4'")

test_data = filtered_data.query("operator == 2")
test_data_1 = test_data.query("device == 'pc1'")
test_data_2 = test_data.query("device == 'pc2'")
test_data_3 = test_data.query("device == 'pc3'")
test_data_4 = test_data.query("device == 'pc4'")

In [None]:
# Number of rows in each per-device training subset (pc1 .. pc4), one per line.
print(
    *(len(subset) for subset in (train_data_1, train_data_2, train_data_3, train_data_4)),
    sep="\n",
)

In [None]:
# Number of rows in each per-device test subset (pc1 .. pc4), one per line.
print(
    *(len(subset) for subset in (test_data_1, test_data_2, test_data_3, test_data_4)),
    sep="\n",
)

In [None]:
# Target column (the QoS value selected from the data).
qos_column = 'datarate'

# Input feature columns, in the order they are selected from the data.
feature_columns = [
    # PCell_* columns
    'PCell_RSRP_max',
    'PCell_RSRQ_max',
    'PCell_RSSI_max',
    'PCell_SNR_1',
    'PCell_SNR_2',
    'PCell_Downlink_Num_RBs',
    'PCell_Downlink_TB_Size',
    'PCell_Downlink_Average_MCS',
    'PCell_Downlink_bandwidth_MHz',
    'PCell_Cell_Identity',
    'PCell_freq_MHz',
    # SCell_* columns
    'SCell_RSRP_max',
    'SCell_RSRQ_max',
    'SCell_RSSI_max',
    'SCell_SNR_1',
    'SCell_SNR_2',
    'SCell_Downlink_Num_RBs',
    'SCell_Downlink_TB_Size',
    'SCell_Downlink_Average_MCS',
    'SCell_Downlink_bandwidth_MHz',
    'SCell_Cell_Identity',
    'SCell_freq_MHz',
    # Position and motion columns
    'Latitude',
    'Longitude',
    'Altitude',
    'speed_kmh',
    'COG',
    # Weather columns
    'precipIntensity',
    'precipProbability',
    'temperature',
    'apparentTemperature',
    'dewPoint',
    'humidity',
    'pressure',
    'windSpeed',
    'cloudCover',
    'uvIndex',
    'visibility',
    # Traffic column
    'Traffic Jam Factor',
]

# Target first, then every feature. Derived from the definitions above instead
# of a second hand-maintained copy, so the two lists cannot drift apart.
all_columns = [qos_column, *feature_columns]

# Features / target for the model: operator 1 / pc1 for training,
# operator 2 / pc3 for testing.
# .copy() detaches the feature frames from train_data_1 / test_data_3 so that
# later column assignments on x_train / x_test do not raise
# SettingWithCopyWarning.
x_train, y_train = train_data_1[feature_columns].copy(), train_data_1[qos_column]
x_test, y_test = test_data_3[feature_columns].copy(), test_data_3[qos_column]

# Informer input: target + features of the training subset, missing values set to 0.
# NOTE: x_train / x_test are not imputed in this cell.
all_data_informer = train_data_1[all_columns].fillna(0)
# 'date' is the frame's (datetime) index with the timezone information dropped.
all_data_informer['date'] = all_data_informer.index.tz_localize(None)

# to_csv does not create missing directories, so make sure the target exists.
informer_pc1_csv = Path('data/BERLINV2X/berlin-operator1-pc1.csv')
informer_pc1_csv.parent.mkdir(parents=True, exist_ok=True)
all_data_informer.to_csv(informer_pc1_csv, encoding='utf-8', index=False)

used_features = len(feature_columns)
print(f"The implementation uses {used_features} features")

len_x_train = len(x_train)
print(f"The implementation uses {len_x_train} time steps for training")

len_x_test = len(x_test)
print(f"The implementation uses {len_x_test} time steps for testing")


print(f"The Informer uses {len(all_data_informer)} time steps and {len(all_columns)} features")

In [None]:
# Informer inputs from four per-device subsets, each with missing values set to 0:
# operator 1 -> pc1 and pc4, operator 2 -> pc2 and pc3.
informer_1 = train_data_1[all_columns].fillna(0)
informer_2 = train_data_4[all_columns].fillna(0)
informer_3 = test_data_2[all_columns].fillna(0)
informer_4 = test_data_3[all_columns].fillna(0)

# Stack the four subsets row-wise; each keeps its original index.
informer_all = pd.concat([informer_1, informer_2, informer_3, informer_4])
# 'date' is the frame's (datetime) index with the timezone information dropped.
informer_all['date'] = informer_all.index.tz_localize(None)
# Convert both bandwidth columns to a numeric dtype.
for bandwidth_column in ('PCell_Downlink_bandwidth_MHz', 'SCell_Downlink_bandwidth_MHz'):
    informer_all[bandwidth_column] = pd.to_numeric(informer_all[bandwidth_column])
informer_all.info()

# to_csv does not create missing directories, so make sure the target exists.
berlin_csv = Path('data/BERLINV2X/berlin.csv')
berlin_csv.parent.mkdir(parents=True, exist_ok=True)
informer_all.to_csv(berlin_csv, encoding='utf-8', index=False)

In [None]:
# Convert the two bandwidth columns of x_train to a numeric dtype.
# assign() returns a new frame instead of writing into x_train in place, which
# avoids SettingWithCopyWarning when x_train is a selection of another frame.
# Existing columns are overwritten in place, so the column order is unchanged.
x_train = x_train.assign(
    PCell_Downlink_bandwidth_MHz=pd.to_numeric(x_train['PCell_Downlink_bandwidth_MHz']),
    SCell_Downlink_bandwidth_MHz=pd.to_numeric(x_train['SCell_Downlink_bandwidth_MHz']),
)
x_train.info()

In [None]:
# Preview the first five rows of the training features.
x_train.head(n=5)