In [1]:
import xarray as xr

# Load the first lightpath dataset. The context manager closes the NetCDF file
# handle as soon as both variables have been converted to pandas objects.
with xr.open_dataset("./lightpath_dataset.nc", engine="netcdf4") as lp_ds:
    data = lp_ds.data.to_pandas()      # feature table
    target = lp_ds.target.to_pandas()  # target metrics

This notebook works with a **Lightpath Quality of Transmission (QoT) Dataset** for optical network communications. A detailed breakdown of its contents follows:

### Dataset Overview
This is a machine learning dataset with **1,321,452 samples** containing features and targets for optical lightpath analysis in telecommunications networks.

### Feature Columns (35 total):

**Connection Identifiers:**
- `conn_id`: Connection ID - unique identifier for each connection
- `src_id`: Source node ID - origin point in the network  
- `dst_id`: Destination node ID - endpoint in the network

**Path Characteristics:**
- `path_len`: Total physical path length in km
- `avg_link_len`: Average length of links in the path
- `min_link_len`: Shortest link length in the path
- `max_link_len`: Longest link length in the path
- `num_links`: Number of fiber links in the path
- `num_spans`: Number of amplifier spans

**Optical Parameters:**
- `freq`: Optical frequency (likely in THz)
- `mod_order`: Modulation order (16, 32, 64 - higher means more data per symbol)
- `lp_linerate`: Lightpath line rate (data transmission speed)
- `conn_linerate`: Connection line rate

**Network Topology:**
- `src_degree`: Number of connections at source node
- `dst_degree`: Number of connections at destination node

**Link Occupancy Stats:**
- `sum_link_occ`: Total occupancy across all links
- `min_link_occ`: Minimum link occupancy
- `max_link_occ`: Maximum link occupancy  
- `avg_link_occ`: Average link occupancy
- `std_link_occ`: Standard deviation of link occupancy

**Bit Error Rate (BER) Statistics:**
- `max_ber`: Maximum BER along the path
- `min_ber`: Minimum BER along the path
- `avg_ber`: Average BER along the path

**Neighboring Path Analysis (Left/Right sides):**
- `min/max_mod_order_left/right`: Modulation orders of adjacent channels
- `min/max_lp_linerate_left/right`: Line rates of adjacent lightpaths
- `min/max_ber_left/right`: BER values of adjacent channels

### Target Variables (4 total):
- `class`: **Binary classification target** (0/1) - likely indicates if the lightpath meets quality requirements
- `osnr`: **Optical Signal-to-Noise Ratio** - key quality metric
- `snr`: **Signal-to-Noise Ratio** - another quality metric
- `ber`: **Bit Error Rate** - error rate measurement

This dataset is designed for **predicting optical network performance** and **quality of transmission** in telecommunications systems, allowing machine learning models to predict whether a lightpath will successfully transmit data with acceptable quality.

In [2]:
# Preview the first rows of the feature table.
data.head()

feature,conn_id,src_id,dst_id,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,12344.0,23.0,24.0,152.0,152.0,152.0,152.0,1.0,2.0,192.5375,...,64.0,64.0,112.0,112.0,336.0,336.0,3e-06,3e-06,0.000576,0.000576
1,82906.0,17.0,15.0,133.0,133.0,133.0,133.0,1.0,2.0,192.2375,...,64.0,64.0,112.0,112.0,336.0,336.0,1e-06,1e-06,2e-06,2e-06
2,10869.0,19.0,12.0,500.0,125.0,81.0,194.0,4.0,9.0,193.7,...,0.0,32.0,0.0,168.0,0.0,280.0,0.000402,0.000402,0.000178,0.000178
3,7833.0,11.0,14.0,368.0,184.0,154.0,214.0,2.0,5.0,192.3125,...,0.0,0.0,112.0,112.0,0.0,0.0,0.00098,0.00098,0.0,0.0
4,40521.0,15.0,29.0,1100.0,137.5,89.0,226.0,8.0,17.0,193.5125,...,0.0,16.0,0.0,224.0,0.0,224.0,0.000637,0.000637,0.000415,0.000415


In [3]:
# Preview the first rows of the target table.
target.head()

metric,class,osnr,snr,ber
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.0,29.568406,26.058197,3e-06
1,1.0,29.702076,26.191867,2e-06
2,1.0,23.326712,19.816503,0.000764
3,1.0,26.106274,22.596065,0.00095
4,0.0,20.729836,17.219627,0.00787


In [4]:
# List every feature column name.
data.columns

Index(['conn_id', 'src_id', 'dst_id', 'path_len', 'avg_link_len',
       'min_link_len', 'max_link_len', 'num_links', 'num_spans', 'freq',
       'mod_order', 'lp_linerate', 'conn_linerate', 'src_degree', 'dst_degree',
       'sum_link_occ', 'min_link_occ', 'max_link_occ', 'avg_link_occ',
       'std_link_occ', 'max_ber', 'min_ber', 'avg_ber', 'min_mod_order_left',
       'max_mod_order_left', 'min_mod_order_right', 'max_mod_order_right',
       'min_lp_linerate_left', 'max_lp_linerate_left', 'min_lp_linerate_right',
       'max_lp_linerate_right', 'min_ber_left', 'max_ber_left',
       'min_ber_right', 'max_ber_right'],
      dtype='object', name='feature')

In [5]:
# Sanity check: count rows whose source and destination node are identical.
# These rows are only reported here; nothing is removed by this check.
n_self_loops = int((data['src_id'] == data['dst_id']).sum())
print(f"Rows with src_id == dst_id: {n_self_loops}")

# Boolean mask that keeps the first occurrence of each fully duplicated feature row
mask = ~data.duplicated(keep="first")

# Apply the same mask to features and targets so they stay aligned row by row
data = data.loc[mask].reset_index(drop=True)
target = target.loc[mask].reset_index(drop=True)

# Drop the connection / node identifier columns
data = data.drop(["conn_id", "src_id", "dst_id"], axis = 1)

In [6]:
# Row/column counts of features and targets after de-duplication.
data.shape , target.shape

((835269, 32), (835269, 4))

In [7]:
# Keep only the 'class' label column as the prediction target
target = target.loc[:, 'class']

____

## Apply some validation checks on the original data

### Summary statistics

In [8]:
# Summary statistics for every feature column.
data.describe()

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right
count,835269.0,835269.0,835269.0,835269.0,835269.0,835269.0,835269.0,835269.0,835269.0,835269.0,...,835269.0,835269.0,835269.0,835269.0,835269.0,835269.0,835269.0,835269.0,835269.0,835269.0
mean,517.65567,135.48761,101.616319,177.389811,3.895746,8.281846,193.257375,47.701615,189.997091,251.216755,...,10.906532,30.99129,71.079999,189.688821,48.586178,145.917495,0.00108,0.001425,0.000813,0.001115
std,217.255528,22.174926,25.373692,42.724862,1.714004,3.529185,0.870854,17.165056,74.078158,136.350445,...,19.641922,23.720516,99.381762,76.255839,86.373455,106.954549,0.00107,0.001174,0.001036,0.001207
min,84.0,84.0,52.0,84.0,1.0,2.0,192.2,16.0,112.0,112.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,368.0,119.5,89.0,154.0,3.0,6.0,192.5375,32.0,112.0,112.0,...,0.0,0.0,0.0,112.0,0.0,0.0,0.00024,0.000397,0.0,0.0
50%,505.0,129.4,101.0,156.0,4.0,8.0,193.025,64.0,224.0,224.0,...,0.0,32.0,0.0,224.0,0.0,168.0,0.000571,0.001091,0.000389,0.000553
75%,652.0,144.333,107.0,214.0,5.0,11.0,193.8125,64.0,224.0,448.0,...,16.0,64.0,112.0,224.0,112.0,224.0,0.001763,0.002543,0.001288,0.002172
max,1382.0,209.0,193.0,313.0,9.0,21.0,195.7625,64.0,336.0,448.0,...,64.0,64.0,336.0,336.0,336.0,336.0,0.0038,0.0038,0.0038,0.0038


In [9]:
# Rows whose average link length falls outside [min_link_len, max_link_len];
# an empty result means the three columns are mutually consistent.
avg_below_min = data['avg_link_len'].lt(data['min_link_len'])
avg_above_max = data['avg_link_len'].gt(data['max_link_len'])
data[avg_below_min | avg_above_max]

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [10]:
# Rows where avg_link_len * num_links differs from path_len by more than 0.1 %
len_ratio = (data['avg_link_len'] * data['num_links']) / data['path_len']
data[(len_ratio < 0.999) | (len_ratio > 1.001)]

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [11]:
# Rows where the span count does not exceed the link count
data.query("num_spans <= num_links")

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [12]:
# Number of samples per distinct 'freq' value, most frequent first
data.value_counts(subset=['freq'])

freq    
192.2000    27015
192.2375    25958
192.2750    24769
192.3125    23654
192.3500    22797
            ...  
195.6125     1380
195.6500     1282
195.6875     1232
195.7250     1143
195.7625     1066
Name: count, Length: 96, dtype: int64

In [13]:
# Number of samples per distinct 'mod_order' value, most frequent first
data.value_counts(subset=['mod_order'])

mod_order
64.0         430906
32.0         362243
16.0          42120
Name: count, dtype: int64

In [14]:
# Number of samples per distinct 'lp_linerate' value, most frequent first
data.value_counts(subset=['lp_linerate'])

lp_linerate
112.0          336891
224.0          308360
336.0           77325
280.0           62327
168.0           50366
Name: count, dtype: int64

In [15]:
# Number of samples per distinct 'conn_linerate' value, most frequent first
data.value_counts(subset=['conn_linerate'])

conn_linerate
112.0            293774
224.0            293120
448.0            248375
Name: count, dtype: int64

In [16]:
# Number of samples per distinct 'sum_link_occ' value, most frequent first
data.value_counts(subset=['sum_link_occ'])

sum_link_occ
50.0            3612
48.0            3596
56.0            3561
52.0            3548
54.0            3519
                ... 
538.0              1
530.0              1
540.0              1
543.0              1
534.0              1
Name: count, Length: 542, dtype: int64

In [17]:
# Rows where avg_link_occ * num_links differs from sum_link_occ by more than 0.1 %
occ_ratio = (data['avg_link_occ'] * data['num_links']) / data['sum_link_occ']
data[(occ_ratio < 0.999) | (occ_ratio > 1.001)]

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [18]:
# Rows whose average BER falls outside [min_ber, max_ber]
ber_below_min = data['avg_ber'].lt(data['min_ber'])
ber_above_max = data['avg_ber'].gt(data['max_ber'])
data[ber_below_min | ber_above_max]

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [19]:
# Summary statistics restricted to the three BER aggregate columns
ber_cols = ['avg_ber', 'min_ber', 'max_ber']
data[ber_cols].describe()

feature,avg_ber,min_ber,max_ber
count,835269.0,835269.0,835269.0
mean,0.001339,4.6e-05,0.00353
std,0.00033,9.9e-05,0.000501
min,0.0,0.0,0.0
25%,0.001156,1e-06,0.003542
50%,0.00135,2e-06,0.00369
75%,0.001544,9.4e-05,0.003758
max,0.003521,0.003521,0.0038


In [20]:
# Rows where the left-neighbour minimum modulation order exceeds its maximum
data.query("min_mod_order_left > max_mod_order_left")

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [21]:
# Rows where the right-neighbour minimum modulation order exceeds its maximum
data.query("min_mod_order_right > max_mod_order_right")

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [22]:
# Rows where the left-neighbour minimum line rate exceeds its maximum
data.query("min_lp_linerate_left > max_lp_linerate_left")

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [23]:
# Rows where the right-neighbour minimum line rate exceeds its maximum
data.query("min_lp_linerate_right > max_lp_linerate_right")

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [24]:
# Rows where the left-neighbour minimum BER exceeds its maximum
data.query("min_ber_left > max_ber_left")

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [25]:
# Rows where the right-neighbour minimum BER exceeds its maximum
data.query("min_ber_right > max_ber_right")

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [26]:
# Class balance of the target: number of samples per 'class' label.
target.value_counts()

class
1.0    599970
0.0    235299
Name: count, dtype: int64

____

## Validation checks on data2

In [27]:
import xarray as xr

# Load the second lightpath dataset. The context manager closes the NetCDF file
# handle as soon as both variables have been converted to pandas objects.
with xr.open_dataset("./lightpath_dataset_2.nc", engine="netcdf4") as lp_ds2:
    data2 = lp_ds2.data.to_pandas()      # feature table
    target2 = lp_ds2.target.to_pandas()  # target metrics

In [28]:
# Preview the first rows of the second feature table.
data2.head()

feature,conn_id,src_id,dst_id,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,93303.0,70.0,42.0,1810.741,226.343,126.626,383.669,8.0,26.0,193.4375,...,0.0,8.0,112.0,168.0,0.0,168.0,0.000879,0.001126,0.002722,0.002722
1,80635.0,8.0,58.0,3267.61,544.602,222.458,880.042,6.0,45.0,192.95,...,0.0,8.0,0.0,112.0,0.0,56.0,6.8e-05,6.8e-05,0.001126,0.001126
2,77702.0,62.0,36.0,1930.201,275.743,107.244,473.565,7.0,27.0,193.6625,...,0.0,4.0,0.0,112.0,0.0,112.0,0.001525,0.001525,0.000134,0.000134
3,23407.0,57.0,23.0,2684.759,536.952,222.458,964.453,5.0,37.0,193.7,...,0.0,8.0,0.0,112.0,0.0,112.0,0.001504,0.001504,3.2e-05,0.002657
4,1033.0,46.0,72.0,3077.523,384.69,246.577,554.111,8.0,42.0,194.075,...,0.0,8.0,0.0,168.0,0.0,168.0,5.4e-05,0.003666,0.000274,0.000441


In [29]:
# Sanity check: count rows whose source and destination node are identical.
# These rows are only reported here; nothing is removed by this check.
n_self_loops2 = int((data2['src_id'] == data2['dst_id']).sum())
print(f"Rows with src_id == dst_id: {n_self_loops2}")

# Boolean mask that keeps the first occurrence of each fully duplicated feature row
mask = ~data2.duplicated(keep="first")

# Apply the same mask to features and targets so they stay aligned row by row
data2 = data2.loc[mask].reset_index(drop=True)
target2 = target2.loc[mask].reset_index(drop=True)

# Drop the connection / node identifier columns
data2 = data2.drop(["conn_id", "src_id", "dst_id"], axis = 1)

In [30]:
# Row/column counts of the second dataset after de-duplication.
data2.shape , target2.shape

((963811, 32), (963811, 4))

In [31]:
# Keep only the 'class' label column as the prediction target
target2 = target2.loc[:, 'class']

In [32]:
# Summary statistics for every feature column of the second dataset.
data2.describe()

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right
count,963811.0,963811.0,963811.0,963811.0,963811.0,963811.0,963811.0,963811.0,963811.0,963811.0,...,963811.0,963811.0,963811.0,963811.0,963811.0,963811.0,963811.0,963811.0,963811.0,963811.0
mean,2759.790034,393.320806,162.877396,709.265701,7.217736,37.92487,193.414083,13.637549,145.950789,255.396944,...,1.00887,9.824391,20.532905,143.729349,13.901472,129.332319,0.00066,0.001248,0.000586,0.001133
std,1455.618259,152.208947,129.181948,277.484489,3.430577,19.608798,1.000644,12.935778,55.388069,131.930099,...,3.887065,9.741044,53.584591,57.665677,42.908832,67.821045,0.000846,0.001105,0.000819,0.001113
min,24.214,24.214,24.214,24.214,1.0,1.0,192.2,4.0,56.0,112.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1695.97,284.973,107.244,473.565,5.0,23.0,192.5375,8.0,112.0,112.0,...,0.0,4.0,0.0,112.0,0.0,112.0,0.00011,0.000276,7.6e-05,0.000174
50%,2515.68,374.685,143.553,690.409,7.0,35.0,193.175,8.0,112.0,224.0,...,0.0,8.0,0.0,112.0,0.0,112.0,0.000281,0.000897,0.000225,0.000717
75%,3933.867,474.282,179.195,920.337,9.0,54.0,194.15,16.0,168.0,448.0,...,0.0,8.0,0.0,168.0,0.0,168.0,0.000854,0.002107,0.000715,0.001962
max,7834.746,1221.189,1221.189,1221.189,25.0,106.0,195.7625,64.0,336.0,448.0,...,64.0,64.0,336.0,336.0,336.0,336.0,0.0038,0.0038,0.0038,0.0038


In [33]:
# Rows whose average link length falls outside [min_link_len, max_link_len];
# an empty result means the three columns are mutually consistent.
avg_below_min = data2['avg_link_len'].lt(data2['min_link_len'])
avg_above_max = data2['avg_link_len'].gt(data2['max_link_len'])
data2[avg_below_min | avg_above_max]

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [34]:
# Rows where avg_link_len * num_links differs from path_len by more than 0.1 %
len_ratio = (data2['avg_link_len'] * data2['num_links']) / data2['path_len']
data2[(len_ratio < 0.999) | (len_ratio > 1.001)]

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [35]:
# Shape of the second feature table before the span/link filter below.
data2.shape

(963811, 32)

In [36]:
# Remove rows where 'num_spans' is less than or equal to 'num_links', and
# apply the same row selection to the targets so both stay aligned by label.
data2 = data2[data2['num_spans'] > data2['num_links']]
target2 = target2.loc[data2.index]

In [37]:
# Shape of the second feature table after the span/link filter.
data2.shape

(962008, 32)

In [38]:
# Number of samples per distinct 'mod_order' value, most frequent first
data2.value_counts(subset=['mod_order'])

mod_order
8.0          424513
16.0         227229
4.0          183055
32.0          89902
64.0          37309
Name: count, dtype: int64

In [39]:
# Discard every row whose modulation order is 4 or 8, then keep only the
# targets whose index labels survive in the feature table.
data2 = data2.loc[~data2['mod_order'].isin([4, 8])]
target2 = target2.loc[data2.index]

In [40]:
# List the remaining feature column names of the second dataset.
data2.columns

Index(['path_len', 'avg_link_len', 'min_link_len', 'max_link_len', 'num_links',
       'num_spans', 'freq', 'mod_order', 'lp_linerate', 'conn_linerate',
       'src_degree', 'dst_degree', 'sum_link_occ', 'min_link_occ',
       'max_link_occ', 'avg_link_occ', 'std_link_occ', 'max_ber', 'min_ber',
       'avg_ber', 'min_mod_order_left', 'max_mod_order_left',
       'min_mod_order_right', 'max_mod_order_right', 'min_lp_linerate_left',
       'max_lp_linerate_left', 'min_lp_linerate_right',
       'max_lp_linerate_right', 'min_ber_left', 'max_ber_left',
       'min_ber_right', 'max_ber_right'],
      dtype='object', name='feature')

In [41]:
# Apply the 4/8 exclusion to each neighbouring-channel modulation-order column
# in turn, re-aligning the targets after every step.
for neighbor_col in ('min_mod_order_left', 'max_mod_order_left',
                     'min_mod_order_right', 'max_mod_order_right'):
    data2 = data2.loc[~data2[neighbor_col].isin([4, 8])]
    target2 = target2.loc[data2.index]

In [42]:
# Remove rows where any neighbouring-lightpath line-rate column equals 56.
# Each of the four left/right min/max columns is filtered exactly once, and
# the targets are re-aligned to the surviving index labels after every step.
for linerate_col in ('min_lp_linerate_left', 'max_lp_linerate_left',
                     'min_lp_linerate_right', 'max_lp_linerate_right'):
    data2 = data2[data2[linerate_col] != 56]
    target2 = target2.loc[data2.index]

In [43]:
# Number of samples per distinct 'lp_linerate' value, most frequent first
data2.value_counts(subset=['lp_linerate'])

lp_linerate
224.0          60128
112.0          47387
280.0           7506
336.0           4282
168.0           3900
Name: count, dtype: int64

In [44]:
# Number of samples per distinct 'conn_linerate' value, most frequent first
data2.value_counts(subset=['conn_linerate'])

conn_linerate
224.0            45086
112.0            44877
448.0            33240
Name: count, dtype: int64

In [45]:
# Number of samples per distinct 'sum_link_occ' value, most frequent first
data2.value_counts(subset=['sum_link_occ'])

sum_link_occ
76.0            563
52.0            562
63.0            561
104.0           561
94.0            556
               ... 
579.0             1
590.0             1
581.0             1
587.0             1
679.0             1
Name: count, Length: 617, dtype: int64

In [46]:
# Rows where avg_link_occ * num_links differs from sum_link_occ by more than 0.1 %
occ_ratio = (data2['avg_link_occ'] * data2['num_links']) / data2['sum_link_occ']
data2[(occ_ratio < 0.999) | (occ_ratio > 1.001)]

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [47]:
# Rows whose average BER falls outside [min_ber, max_ber]
ber_below_min = data2['avg_ber'].lt(data2['min_ber'])
ber_above_max = data2['avg_ber'].gt(data2['max_ber'])
data2[ber_below_min | ber_above_max]

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [48]:
# Summary statistics restricted to the three BER aggregate columns
ber_cols = ['avg_ber', 'min_ber', 'max_ber']
data2[ber_cols].describe()

feature,avg_ber,min_ber,max_ber
count,123203.0,123203.0,123203.0
mean,0.000967,4.3e-05,0.003494
std,0.000216,5.9e-05,0.000417
min,0.0,0.0,0.0
25%,0.000832,2.7e-05,0.003424
50%,0.000958,3.5e-05,0.003636
75%,0.001093,4.8e-05,0.003737
max,0.003528,0.003528,0.0038


In [49]:
# Rows where the left-neighbour minimum modulation order exceeds its maximum
data2.query("min_mod_order_left > max_mod_order_left")

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [50]:
# Rows where the right-neighbour minimum modulation order exceeds its maximum
data2.query("min_mod_order_right > max_mod_order_right")

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [51]:
# Rows where the left-neighbour minimum line rate exceeds its maximum
data2.query("min_lp_linerate_left > max_lp_linerate_left")

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [52]:
# Rows where the right-neighbour minimum line rate exceeds its maximum
data2.query("min_lp_linerate_right > max_lp_linerate_right")

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [53]:
# Rows where the left-neighbour minimum BER exceeds its maximum
data2.query("min_ber_left > max_ber_left")

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [54]:
# Rows where the right-neighbour minimum BER exceeds its maximum
data2.query("min_ber_right > max_ber_right")

feature,path_len,avg_link_len,min_link_len,max_link_len,num_links,num_spans,freq,mod_order,lp_linerate,conn_linerate,...,min_mod_order_right,max_mod_order_right,min_lp_linerate_left,max_lp_linerate_left,min_lp_linerate_right,max_lp_linerate_right,min_ber_left,max_ber_left,min_ber_right,max_ber_right


In [55]:
# Class balance of the filtered second target: number of samples per 'class' label.
target2.value_counts()

class
1.0    83221
0.0    39982
Name: count, dtype: int64

In [56]:
# Final shapes of the second dataset; the row counts of features and targets should match.
data2.shape, target2.shape

((123203, 32), (123203,))

In [57]:
# Final shapes of the first dataset; the row counts of features and targets should match.
data.shape, target.shape

((835269, 32), (835269,))

In [58]:
# NOTE(review): 835269 equals the row count of `data` shown in the previous cell;
# the origin of 354440 is not visible in this notebook — confirm what this ratio measures.
354440/835269

0.42434233761818047

______

### Transform data to csv and save!

In [59]:
# Write the cleaned features and class labels of both datasets to CSV,
# leaving the row index out of the files.
for frame, csv_name in (
    (data, "cleaned_lightpath_dataset.csv"),
    (target, "cleaned_lightpath_target.csv"),
    (data2, "cleaned_lightpath_dataset_2.csv"),
    (target2, "cleaned_lightpath_target_2.csv"),
):
    frame.to_csv(csv_name, index=False)


_____

In [60]:
import pandas as pd

# Read the cleaned CSV exports back in: features and targets of dataset 1 first,
# then features and targets of dataset 2.
data, target = (
    pd.read_csv(csv_name)
    for csv_name in ("cleaned_lightpath_dataset.csv", "cleaned_lightpath_target.csv")
)
data2, target2 = (
    pd.read_csv(csv_name)
    for csv_name in ("cleaned_lightpath_dataset_2.csv", "cleaned_lightpath_target_2.csv")
)

In [61]:
# Shapes after reloading from CSV; the targets are now one-column DataFrames.
data.shape, target.shape, data2.shape, target2.shape

((835269, 32), (835269, 1), (123203, 32), (123203, 1))

In [62]:
# Side-by-side count of distinct values per feature in each dataset
distinct_counts = {
    "nunique_data": data.nunique(),
    "nunique_data2": data2.nunique(),
}
pd.DataFrame(distinct_counts)

Unnamed: 0,nunique_data,nunique_data2
path_len,193,2354
avg_link_len,187,2388
min_link_len,25,99
max_link_len,27,93
num_links,9,14
num_spans,20,38
freq,96,96
mod_order,3,3
lp_linerate,5,5
conn_linerate,3,3


In [70]:
# Report, per column, which values occur in one dataset but not in the other.
# Other columns that can be compared the same way:
#   'min_mod_order_left', 'max_mod_order_left', 'min_mod_order_right', 'max_mod_order_right',
#   'min_lp_linerate_left', 'max_lp_linerate_left', 'min_lp_linerate_right', 'max_lp_linerate_right'
for col in ('src_degree', 'dst_degree'):
    values_1 = set(data[col].unique())
    values_2 = set(data2[col].unique())

    # One print call; the trailing empty string reproduces the blank separator line
    print(
        f"Column: {col}",
        f"Values in data but not in data2: {values_1.difference(values_2)}",
        f"Values in data2 but not in data: {values_2.difference(values_1)}",
        "",
        sep="\n",
    )

Column: src_degree
Values in data but not in data2: set()
Values in data2 but not in data: {2.0}

Column: dst_degree
Values in data but not in data2: set()
Values in data2 but not in data: {2.0}



In [71]:
# Absolute number of samples per class label in each dataset.
target.value_counts(), target2.value_counts()

(class
 1.0      599970
 0.0      235299
 Name: count, dtype: int64,
 class
 1.0      83221
 0.0      39982
 Name: count, dtype: int64)

In [72]:
# Share of samples per class label in each dataset (proportions instead of counts).
target.value_counts(normalize=True), target2.value_counts(normalize=True)

(class
 1.0      0.718296
 0.0      0.281704
 Name: proportion, dtype: float64,
 class
 1.0      0.675479
 0.0      0.324521
 Name: proportion, dtype: float64)

In [73]:
# Feature column names after the CSV round trip.
data.columns

Index(['path_len', 'avg_link_len', 'min_link_len', 'max_link_len', 'num_links',
       'num_spans', 'freq', 'mod_order', 'lp_linerate', 'conn_linerate',
       'src_degree', 'dst_degree', 'sum_link_occ', 'min_link_occ',
       'max_link_occ', 'avg_link_occ', 'std_link_occ', 'max_ber', 'min_ber',
       'avg_ber', 'min_mod_order_left', 'max_mod_order_left',
       'min_mod_order_right', 'max_mod_order_right', 'min_lp_linerate_left',
       'max_lp_linerate_left', 'min_lp_linerate_right',
       'max_lp_linerate_right', 'min_ber_left', 'max_ber_left',
       'min_ber_right', 'max_ber_right'],
      dtype='object')

________________________________________________

### Generating main datasets for training and evaluation

In [74]:
import numpy as np

# Reload the cleaned exports; each single-column target file is squeezed to a Series
data1 = pd.read_csv("cleaned_lightpath_dataset.csv")
target1 = pd.read_csv("cleaned_lightpath_target.csv").squeeze("columns")
data2 = pd.read_csv("cleaned_lightpath_dataset_2.csv")
target2 = pd.read_csv("cleaned_lightpath_target_2.csv").squeeze("columns")

# One shared, seeded generator drives all sampling below
seed = 42
rng = np.random.default_rng(seed)


def build_dataset(pct, balanced=False):
    """Append a random sample of dataset 2 to the whole of dataset 1.

    Parameters
    ----------
    pct : float
        Fraction of ``data2`` rows to sample; the requested sample size is
        ``int(round(len(data2) * pct))``.
    balanced : bool, default False
        If False, rows are drawn without replacement from all of ``data2``.
        If True, the same number of rows is drawn from class 0 and from
        class 1 of ``target2`` (half the requested size each, capped by the
        size of either class) and the combined selection is shuffled.

    Returns
    -------
    tuple
        ``(new_data, new_target)``: ``data1`` / ``target1`` followed by the
        sampled ``data2`` / ``target2`` rows, re-indexed from zero.

    Notes
    -----
    Reads the notebook-level ``data1``, ``target1``, ``data2`` and ``target2``
    and draws from the shared ``rng``, so the result depends on how many
    draws were made before the call.
    """
    requested = int(round(len(data2) * pct))

    if not balanced:
        chosen = rng.choice(data2.index, size=requested, replace=False)
    else:
        # Index labels of each class, class 0 first so the draw order is fixed
        by_class = [target2[target2 == label].index for label in (0, 1)]
        quota = min(requested // 2, *(len(members) for members in by_class))
        chosen = np.concatenate(
            [rng.choice(members, size=quota, replace=False) for members in by_class]
        )
        rng.shuffle(chosen)

    features = pd.concat([data1, data2.loc[chosen]], ignore_index=True)
    labels = pd.concat([target1, target2.loc[chosen]], ignore_index=True)
    return features, labels

# Unbalanced mixes: 5 / 10 / 15 / 20 % of dataset 2, built in that order
(
    (data1_plus_5, target1_plus_5),
    (data1_plus_10, target1_plus_10),
    (data1_plus_15, target1_plus_15),
    (data1_plus_20, target1_plus_20),
) = [build_dataset(frac, balanced=False) for frac in (0.05, 0.10, 0.15, 0.20)]

# Balanced mixes (equal class 0/1 samples from DATA2), built after the unbalanced ones
(
    (data1_plus_5_bal, target1_plus_5_bal),
    (data1_plus_10_bal, target1_plus_10_bal),
    (data1_plus_15_bal, target1_plus_15_bal),
    (data1_plus_20_bal, target1_plus_20_bal),
) = [build_dataset(frac, balanced=True) for frac in (0.05, 0.10, 0.15, 0.20)]

In [75]:
# Write every mixed dataset and its target to CSV (row index omitted),
# features first and then labels for each mix.
for frame, csv_name in (
    (data1_plus_5, "data1_plus_5.csv"),
    (target1_plus_5, "target1_plus_5.csv"),
    (data1_plus_10, "data1_plus_10.csv"),
    (target1_plus_10, "target1_plus_10.csv"),
    (data1_plus_15, "data1_plus_15.csv"),
    (target1_plus_15, "target1_plus_15.csv"),
    (data1_plus_20, "data1_plus_20.csv"),
    (target1_plus_20, "target1_plus_20.csv"),
    (data1_plus_5_bal, "data1_plus_5_balanced.csv"),
    (target1_plus_5_bal, "target1_plus_5_balanced.csv"),
    (data1_plus_10_bal, "data1_plus_10_balanced.csv"),
    (target1_plus_10_bal, "target1_plus_10_balanced.csv"),
    (data1_plus_15_bal, "data1_plus_15_balanced.csv"),
    (target1_plus_15_bal, "target1_plus_15_balanced.csv"),
    (data1_plus_20_bal, "data1_plus_20_balanced.csv"),
    (target1_plus_20_bal, "target1_plus_20_balanced.csv"),
):
    frame.to_csv(csv_name, index=False)

In [3]:
from pathlib import Path

# Split the cleaned second dataset into three randomly assigned shards.
# NOTE: `data` and `target` hold dataset 2 from this cell onwards.
data = pd.read_csv("datasets/cleaned_lightpath_dataset_2.csv")
target = pd.read_csv("datasets/cleaned_lightpath_target_2.csv")

# Seeded random ordering of the row positions
rng = np.random.default_rng(42)
idx = rng.permutation(len(data))

# Three near-equal shard sizes; the first `extra` shards take one additional row
base, extra = divmod(len(data), 3)
sizes = [base + 1 if k < extra else base for k in range(3)]

out_dir = Path("datasets/dataset2_shards")
out_dir.mkdir(parents=True, exist_ok=True)

# Cumulative boundaries turn the sizes into consecutive slices of the permutation
bounds = np.cumsum([0] + sizes)
for i, (lo, hi) in enumerate(zip(bounds[:-1], bounds[1:]), 1):
    shard_idx = idx[lo:hi]
    data.iloc[shard_idx].reset_index(drop=True).to_csv(
        out_dir / f"dataset2_shard_{i}.csv", index=False
    )
    target.iloc[shard_idx].reset_index(drop=True).to_csv(
        out_dir / f"target2_shard_{i}.csv", index=False
    )
