This Python script preprocesses the data for the interaction of two signals in an optical fiber. The target variable is the energy E. Three input files are merged and concatenated into a single dataset. Finally, the new dataset is cleaned and explored.

In [1]:
import numpy as np
import pandas as pd

# Load the datasets

In [2]:
# Base dataset: inputs B2, G, DO, Dz, P1, P2 and the target E.
# The file is comma-separated, which is already read_csv's default delimiter.
# NOTE(review): unlike the other two input files, no drop_duplicates() is
# applied to this one -- confirm that is intended.
df1 = pd.read_csv('Data-Energy-1.dat')
df1.head()

Unnamed: 0,B2,G,DO,Dz,P1,P2,E
0,0.005,0.002,5,0.5,100,10,0.79466
1,0.0075,0.002,5,0.5,100,10,1.306685
2,0.01,0.002,5,0.5,100,10,1.56549
3,0.015,0.002,5,0.5,100,10,1.743379
4,0.02,0.002,5,0.5,100,10,1.789472


In [3]:
# P1 dataset: carries a P1 column but no P2. Exact duplicate rows are
# discarded right after loading.
df2 = (
    pd.read_csv('Data-Energy-2-P1.dat')
    .drop_duplicates()
)
df2.head()

Unnamed: 0,B2,G,DO,Dz,P1,E
0,0.02,0.002,4,0.5,100,2.121925
1,0.02,0.002,5,0.5,100,1.789472
2,0.02,0.002,6,0.5,100,1.460797
3,0.02,0.002,7,0.5,100,1.173408
4,0.02,0.002,8,0.5,100,0.9329


In [4]:
# P2 dataset: carries a P2 column but no P1. Exact duplicate rows are
# discarded right after loading.
df3 = (
    pd.read_csv('Data-Energy-2-P2.dat')
    .drop_duplicates()
)
df3.head()

Unnamed: 0,B2,G,DO,Dz,P2,E
0,0.02,0.002,4,0.5,10,2.121925
1,0.02,0.002,5,0.5,10,1.789472
2,0.02,0.002,6,0.5,10,1.460797
3,0.02,0.002,7,0.5,10,1.173408
4,0.02,0.002,8,0.5,10,0.9329


# Merge df2 and df3

In [5]:
# Join the P1 and P2 tables so each resulting row carries both powers.
# merge() performs an inner join by default, so only rows whose B2, G, DO, Dz
# and E agree in both tables are kept.
# NOTE(review): E is a floating-point key, so the match relies on the stored
# values being exactly equal in both files.
join_keys = ['B2', 'G', 'DO', 'Dz', 'E']
df23 = df2.merge(df3, on=join_keys)
df23.head()

Unnamed: 0,B2,G,DO,Dz,P1,E,P2
0,0.02,0.002,4,0.5,100,2.121925,10
1,0.02,0.002,5,0.5,100,1.789472,10
2,0.02,0.002,6,0.5,100,1.460797,10
3,0.02,0.002,7,0.5,100,1.173408,10
4,0.02,0.002,8,0.5,100,0.9329,10


# Concatenate df23 and df1

In [6]:
# Stack df1 on top of the merged P1/P2 table and renumber the rows from 0.
# NOTE(review): rows present in both inputs are kept twice -- the previews show
# the row (B2=0.02, DO=5, Dz=0.5, E=1.789472) in df1 as well as in df23.
# Confirm whether duplicates should be dropped here.
df = pd.concat([df1, df23], ignore_index=True)
df.head()

Unnamed: 0,B2,DO,Dz,E,G,P1,P2
0,0.005,5,0.5,0.79466,0.002,100,10
1,0.0075,5,0.5,1.306685,0.002,100,10
2,0.01,5,0.5,1.56549,0.002,100,10
3,0.015,5,0.5,1.743379,0.002,100,10
4,0.02,5,0.5,1.789472,0.002,100,10


# Check for NaNs

In [7]:
# Total number of missing cells across the whole frame (all columns combined).
df.isnull().values.sum()

1

# Replace NaNs with 0 and reorder columns

In [8]:
# Impute every missing cell with 0, then order the columns: inputs first,
# target E last.
# The author's commented-out alternative was median imputation:
#   df.replace('?', np.nan).fillna(df.median())
column_order = ['B2', 'G', 'DO', 'Dz', 'P1', 'P2', 'E']
df = df.fillna(0)[column_order]

# Save new data-frame

In [9]:
# Persist the cleaned frame as CSV; the row index is not written to the file.
df.to_csv('Data-Energy.dat', index=False)

# Explore the data set

In [10]:
# (number of rows, number of columns) of the cleaned frame.
df.shape

(43, 7)

In [11]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,B2,G,DO,Dz,P1,P2,E
count,43.0,43.0,43.0,43.0,43.0,43.0,43.0
mean,0.01936,0.001965,5.209302,0.467442,91.627907,15.232558,1.557556
std,0.003822,0.000334,0.832613,0.110671,20.345847,13.970781,1.017889
min,0.005,0.0005,4.0,0.0,20.0,5.0,0.0
25%,0.02,0.002,5.0,0.5,100.0,10.0,0.780392
50%,0.02,0.002,5.0,0.5,100.0,10.0,1.460797
75%,0.02,0.002,5.0,0.5,100.0,10.0,2.116922
max,0.03,0.003,9.0,0.6,100.0,70.0,4.323701


In [12]:
# Column labels as a NumPy array, to confirm the column layout.
df.columns.values

array(['B2', 'G', 'DO', 'Dz', 'P1', 'P2', 'E'], dtype=object)

In [13]:
# Distinct P1 values in ascending order.
np.sort(df['P1'].unique())

array([ 20,  30,  40,  50,  60,  70,  80,  90, 100])

In [14]:
# Share of rows at each P1 value; normalize=True yields fractions, not counts.
df['P1'].value_counts(normalize=True)

100    0.813953
60     0.023256
50     0.023256
40     0.023256
30     0.023256
90     0.023256
20     0.023256
80     0.023256
70     0.023256
Name: P1, dtype: float64

In [15]:
# Mean of every other column within each P1 group.
df.groupby(['P1']).mean()

Unnamed: 0_level_0,B2,G,DO,Dz,P2,E
P1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20,0.02,0.002,5.0,0.5,10.0,0.058021
30,0.02,0.002,5.0,0.5,10.0,0.162934
40,0.02,0.002,5.0,0.5,10.0,0.317412
50,0.02,0.002,5.0,0.5,10.0,0.507779
60,0.02,0.002,5.0,0.5,10.0,0.729145
70,0.02,0.002,5.0,0.5,10.0,0.971387
80,0.02,0.002,5.0,0.5,10.0,1.233964
90,0.02,0.002,5.0,0.5,10.0,1.496756
100,0.019214,0.001957,5.257143,0.46,16.428571,1.757072


In [16]:
# Boolean-mask filter: keep rows where 20 < P1 <= 50.
df[(df['P1']>20) & (df['P1']<=50)] # element-wise AND of the two conditions

Unnamed: 0,B2,G,DO,Dz,P1,P2,E
27,0.02,0.002,5,0.5,30,10,0.162934
28,0.02,0.002,5,0.5,40,10,0.317412
29,0.02,0.002,5,0.5,50,10,0.507779


In [17]:
# Rows where P1 is exactly 30.
df[df['P1']==30]

Unnamed: 0,B2,G,DO,Dz,P1,P2,E
27,0.02,0.002,5,0.5,30,10,0.162934


In [18]:
# Boolean-mask filter: keep rows where P1 < 20 or P1 > 50.
# NOTE(review): the smallest P1 listed for this data is 20, so the `< 20`
# branch matches no rows -- confirm whether `<= 20` was intended.
df[(df['P1']<20) | (df['P1']>50)] # element-wise OR of the two conditions

Unnamed: 0,B2,G,DO,Dz,P1,P2,E
0,0.005,0.002,5,0.5,100,10,0.79466
1,0.0075,0.002,5,0.5,100,10,1.306685
2,0.01,0.002,5,0.5,100,10,1.56549
3,0.015,0.002,5,0.5,100,10,1.743379
4,0.02,0.002,5,0.5,100,10,1.789472
5,0.025,0.002,5,0.5,100,10,1.782381
6,0.03,0.002,5,0.5,100,10,1.746672
7,0.02,0.002,5,0.0,100,10,0.0
8,0.02,0.002,5,0.1,100,10,0.346017
9,0.02,0.002,5,0.2,100,10,0.696188


# Calculate correlation with the energy E and sort by importance

In [19]:
# Correlation of each column with E, ranked by magnitude (strongest first).
# .abs() discards the sign, so this shows the strength of the relationship
# but not its direction.
df.corr()['E'].abs().sort_values(ascending=False)

E     1.000000
P2    0.787279
P1    0.462497
Dz    0.358922
G     0.289597
DO    0.185260
B2    0.101498
Name: E, dtype: float64