# Data Preprocessing
1. delete the records with missing features
2. encode the values

In [62]:
import pandas as pd
from sklearn import preprocessing

In [5]:
# Load the full set of records from the CSV file and preview the first rows.
raw_df = pd.read_csv('all_records_and_signals.csv')
raw_df.head()

Unnamed: 0,Gestation,Age,Parity,Abortions,Weight,Hypertension,Diabetes,Placental_position,Bleeding_first_trimester,Bleeding_second_trimester,Funneling,Smoker,Root Mean Square,Median Frequency,Peak Frequency,Sample Entropy
0,35.0,30.0,0.0,0.0,58,no,no,front,no,no,negative,no,27.9148,0.3109,0.3159,0.502
1,38.6,,,,63,,,end,yes,no,negative,,54.239,0.1426,0.1126,0.243
2,38.6,,,,70,,,end,yes,no,negative,,26.1735,0.1598,0.1049,0.396
3,37.1,27.0,0.0,1.0,100,no,no,front,no,no,negative,no,7.5545,0.175,0.1522,0.732
4,38.6,28.0,0.0,2.0,72,no,no,front,no,no,negative,no,12.7331,0.1816,0.1816,0.817


## Drop the incomplete records
and store the remaining complete records in a new CSV file

In [47]:
# A record is incomplete when any of its features is missing.
# Missing entries are recognised in both forms they can take after loading:
# the literal string 'None' (what this filter originally looked for) and NaN
# (newer pandas releases parse the text "None" in a CSV as a missing value).
# `axis` is passed by keyword because newer pandas releases no longer accept
# it positionally.
is_incomplete = (raw_df.eq('None') | raw_df.isna()).any(axis=1)
dropped_incomplete_df = raw_df.loc[~is_incomplete]

# The index is written on purpose: it preserves each record's original row
# number, which comes back as the 'Unnamed: 0' column when the file is reloaded.
dropped_incomplete_df.to_csv('incomplete_record_dropped.csv')

In [50]:
# The saved row index is read back as an unnamed column ('Unnamed: 0');
# give it a meaningful name and overwrite the file without adding a new index.
incomplete_record_dropped_df = (
    pd.read_csv('incomplete_record_dropped.csv')
    .rename(columns={'Unnamed: 0': 'record_number'})
)
incomplete_record_dropped_df.to_csv('incomplete_record_dropped.csv', index=False)

# Reload the rewritten file as the starting point for the encoding steps.
pre_encoding_df = pd.read_csv('incomplete_record_dropped.csv')
pre_encoding_df.head()

Unnamed: 0,record_number,Gestation,Age,Parity,Abortions,Weight,Hypertension,Diabetes,Placental_position,Bleeding_first_trimester,Bleeding_second_trimester,Funneling,Smoker,Root Mean Square,Median Frequency,Peak Frequency,Sample Entropy
0,0,35.0,30,0,0,58,no,no,front,no,no,negative,no,27.9148,0.3109,0.3159,0.502
1,3,37.1,27,0,1,100,no,no,front,no,no,negative,no,7.5545,0.175,0.1522,0.732
2,4,38.6,28,0,2,72,no,no,front,no,no,negative,no,12.7331,0.1816,0.1816,0.817
3,5,38.9,30,0,0,64,no,no,end,yes,no,negative,no,8.4929,0.3059,0.3402,0.747
4,6,40.3,37,1,1,79,no,no,end,no,no,negative,no,14.922,0.1294,0.1144,0.499


### checking the data

In [104]:
# Record counts before and after cleaning: 300 -> 169 (131 incomplete records dropped)
tuple(len(frame) for frame in (raw_df, pre_encoding_df))

(300, 169)

In [106]:
# Preterm records (Gestation < 37) before and after cleaning: 38 -> 19
tuple(
    int((frame['Gestation'] < 37).sum())
    for frame in (raw_df, pre_encoding_df)
)

(38, 19)

## Encoding Strategy: Ordinal Variables
Assign each category value an integer code (0, 1, 2, ...) with `sklearn.preprocessing.LabelEncoder`.

replace text with numbers: 

- no: 0
- yes: 1
- front: 1
- end: 0
- negative: 0
- positive: 1

check the unique values for each category

In [61]:
# Columns that hold text categories and therefore need to be encoded as numbers.
CATEGORICAL_FEATURES = [
    'Hypertension',
    'Diabetes',
    'Placental_position',
    'Bleeding_first_trimester',
    'Bleeding_second_trimester',
    'Funneling',
    'Smoker',
]

# Distinct values of each categorical column, in the same order as CATEGORICAL_FEATURES.
unique_categorical_values = [
    pre_encoding_df[column_name].unique()
    for column_name in CATEGORICAL_FEATURES
]
unique_categorical_values

[array(['no', 'yes'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['front', 'end'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['negative', 'positive'], dtype=object),
 array(['no', 'yes'], dtype=object)]

Map the unique categorical values to their corresponding ordinal values

In [76]:
# Work on a copy so the pre-encoding frame stays available unchanged.
ordinal_encoding_df = pre_encoding_df.copy()

# Build one shared {category label: integer code} lookup across all
# categorical features.
encoding_dict = {}
for category_values in unique_categorical_values:
    label_encoder = preprocessing.LabelEncoder().fit(category_values)
    # LabelEncoder stores the sorted unique labels in `classes_`, and the code
    # of a label is its position in that array. Pairing each label with its
    # own position is correct for any number of categories; pairing
    # `classes_` with the transformed input (whose order follows the data, not
    # the sorted classes) only happens to work for two-valued features.
    for code, label in enumerate(label_encoder.classes_):
        encoding_dict[label] = code
encoding_dict

{'no': 0, 'yes': 1, 'end': 0, 'front': 1, 'negative': 0, 'positive': 1}

Replace the original values with the ordinal values

In [91]:
# Encode only the categorical columns, in a single pass, so that no other
# column can be altered by a coincidental match with a category label.
# The explicit cast yields integer columns regardless of how the installed
# pandas infers the dtype after text is replaced by numbers, and it raises if
# a non-numeric label was left unencoded instead of silently keeping it.
ordinal_encoding_df[CATEGORICAL_FEATURES] = (
    ordinal_encoding_df[CATEGORICAL_FEATURES]
    .replace(encoding_dict)
    .astype('int64')
)
ordinal_encoding_df.head()

Unnamed: 0,record_number,Gestation,Age,Parity,Abortions,Weight,Hypertension,Diabetes,Placental_position,Bleeding_first_trimester,Bleeding_second_trimester,Funneling,Smoker,Root Mean Square,Median Frequency,Peak Frequency,Sample Entropy
0,0,35.0,30,0,0,58,0,0,1,0,0,0,0,27.9148,0.3109,0.3159,0.502
1,3,37.1,27,0,1,100,0,0,1,0,0,0,0,7.5545,0.175,0.1522,0.732
2,4,38.6,28,0,2,72,0,0,1,0,0,0,0,12.7331,0.1816,0.1816,0.817
3,5,38.9,30,0,0,64,0,0,0,1,0,0,0,8.4929,0.3059,0.3402,0.747
4,6,40.3,37,1,1,79,0,0,0,0,0,0,0,14.922,0.1294,0.1144,0.499


check the data types before writing to csv file

In [92]:
# Show each column's dtype to confirm that no text (object) column remains.
ordinal_encoding_df.dtypes

record_number                  int64
Gestation                    float64
Age                            int64
Parity                         int64
Abortions                      int64
Weight                         int64
Hypertension                   int64
Diabetes                       int64
Placental_position             int64
Bleeding_first_trimester       int64
Bleeding_second_trimester      int64
Funneling                      int64
Smoker                         int64
Root Mean Square             float64
Median Frequency             float64
Peak Frequency               float64
Sample Entropy               float64
dtype: object

## Output csv file
The resulting dataset is imbalanced:
19 preterm records and 150 term records

In [107]:
# Save the encoded records without the index, then read the file back and
# preview it to check what was written.
ordinal_encoding_df.to_csv('ordinal_encoding.csv', index=False)
test = pd.read_csv('ordinal_encoding.csv')
test.head()

Unnamed: 0,record_number,Gestation,Age,Parity,Abortions,Weight,Hypertension,Diabetes,Placental_position,Bleeding_first_trimester,Bleeding_second_trimester,Funneling,Smoker,Root Mean Square,Median Frequency,Peak Frequency,Sample Entropy
0,0,35.0,30,0,0,58,0,0,1,0,0,0,0,27.9148,0.3109,0.3159,0.502
1,3,37.1,27,0,1,100,0,0,1,0,0,0,0,7.5545,0.175,0.1522,0.732
2,4,38.6,28,0,2,72,0,0,1,0,0,0,0,12.7331,0.1816,0.1816,0.817
3,5,38.9,30,0,0,64,0,0,0,1,0,0,0,8.4929,0.3059,0.3402,0.747
4,6,40.3,37,1,1,79,0,0,0,0,0,0,0,14.922,0.1294,0.1144,0.499
