In [2]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from pyspark.sql import SparkSession


spark = SparkSession.builder \
    .appName("Birth Data Analysis") \
    .getOrCreate()

table_path = "hdfs:///user/abdur_zeybey/data_table_1920"

In [3]:
old_cols = [
            "DOB_YY", #Year
            "DOB_MM", #Month
            "DOB_TT",#Time
            "DOB_WK",#Weekday
            "MAGER", #Mother's Age
            "MEDUC", #Mother’s Education
            "FAGECOMB", #Father’s Age
            "FEDUC",  #Father’s Education
            "CIG_0", #Cigarettes Before Pregnancy
            "RF_PDIAB", #Pre-pregnancy Diabetes
            "RF_PHYPE", #Pre-pregnancy Hypertension
            "RF_INFTR", #Infertility Treatment Used
            "RF_FEDRG", #Fertility Enhancing Drugs
            "DPLURAL", #Plurality Recode
            "SEX", #Sex of Infant
            "DBWT", #Birth Weight – Detail in Grams
            "AB_AVEN1", #Assisted Ventilation (immediately)	 
            "AB_SEIZ", #Seizures	
        ]

cols = [
            "DOB_MM", #Month
            "MAGER", #Mother's Age
            "FAGECOMB", #Father’s Age
            "PRIORLIVE", #Prior Births Now Living
            "PRIORDEAD", #Prior Births Now Dead
            "ILOP_R", #Interval Since Last Other Pregnancy Recode
            "CIG_0", #Cigarettes Before Pregnancy
            "BMI", #Mother Body Mass Index
            "RF_PDIAB", #Pre-pregnancy Diabetes
            "RF_PHYPE", #Pre-pregnancy Hypertension
            "DPLURAL", #Plurality Recode
            "SEX", #Sex of Infant
            "AB_AVEN1", #Assisted Ventilation (immediately)
        ]

In [4]:
data_table = spark.read.parquet(table_path).select(cols)


                                                                                

# Data Selection and Pre-Processing

In [5]:
data_table.head

<bound method DataFrame.head of DataFrame[DOB_MM: int, MAGER: string, FAGECOMB: string, PRIORLIVE: int, PRIORDEAD: int, ILOP_R: int, CIG_0: int, BMI: string, RF_PDIAB: string, RF_PHYPE: string, DPLURAL: string, SEX: string, AB_AVEN1: string]>

In [6]:
data_table = data_table.filter(data_table.DOB_MM == 1)

In [7]:
data_table = data_table.drop("DOB_MM")

In [8]:
res = data_table.groupBy("AB_AVEN1").count()
res.show()

                                                                                

+--------+------+
|AB_AVEN1| count|
+--------+------+
|       Y| 30254|
|       U|   443|
|       N|586517|
+--------+------+



In [9]:
data_table.head()

                                                                                

Row(MAGER='25', FAGECOMB='29', PRIORLIVE=1, PRIORDEAD=0, ILOP_R=888, CIG_0=0, BMI='22.7', RF_PDIAB='N', RF_PHYPE='N', DPLURAL='1', SEX='M', AB_AVEN1='N')

In [10]:
#Remove null and Unknown values of Assisted Ventilation column
data_table = data_table.na.drop(subset=["AB_AVEN1"])
data_table = data_table.filter(data_table.AB_AVEN1 != 'U')

In [11]:
res = data_table.groupBy("AB_AVEN1").count()
res.show()



+--------+------+
|AB_AVEN1| count|
+--------+------+
|       Y| 30254|
|       N|586517|
+--------+------+



                                                                                

In [12]:
#Remove Unknown values of Pre-pregnancy Diabetes and Hypertension columns
data_table = data_table.filter(data_table.RF_PDIAB != 'U')
data_table = data_table.filter(data_table.RF_PHYPE != 'U')


In [13]:
from pyspark.sql.functions import when, col

# Convert "Y" and "N" columns to numerical values (1 for "Y" and 0 for "N")
yn_columns = ["RF_PDIAB", "RF_PHYPE","AB_AVEN1"] # Replace with your "Y" and "N" column names
for col_name in yn_columns:
    data_table = data_table.withColumn(col_name, when(col(col_name) == 'N', 0).otherwise(1))

data_table = data_table.withColumn("SEX", when(col("SEX") == 'M', 0).otherwise(1))

In [14]:
data_table.head()

Row(MAGER='25', FAGECOMB='29', PRIORLIVE=1, PRIORDEAD=0, ILOP_R=888, CIG_0=0, BMI='22.7', RF_PDIAB=0, RF_PHYPE=0, DPLURAL='1', SEX=0, AB_AVEN1=0)

In [15]:
from pyspark.sql.types import IntegerType, DoubleType

# Replace 'int_col' with the name of the column containing integer values as strings
data_table = data_table.withColumn('MAGER', col('MAGER').cast(IntegerType()))
data_table = data_table.withColumn('FAGECOMB', col('FAGECOMB').cast(IntegerType()))
data_table = data_table.withColumn('DPLURAL', col('DPLURAL').cast(IntegerType()))

# If you have a column containing float values stored as strings, you can cast them to double
# Replace 'float_col' with the name of the column containing float values as strings
data_table = data_table.withColumn('BMI', col('BMI').cast(DoubleType()))

In [16]:
data_table.head()

Row(MAGER=25, FAGECOMB=29, PRIORLIVE=1, PRIORDEAD=0, ILOP_R=888, CIG_0=0, BMI=22.7, RF_PDIAB=0, RF_PHYPE=0, DPLURAL=1, SEX=0, AB_AVEN1=0)

In [17]:
df = data_table.toPandas()

                                                                                

In [18]:
df.shape

(616382, 12)

In [19]:
df = df.dropna()

In [20]:
df.shape

(616382, 12)

In [21]:
summary = df.describe()
print(summary)

               MAGER       FAGECOMB      PRIORLIVE      PRIORDEAD  \
count  616382.000000  616382.000000  616382.000000  616382.000000   
mean       29.058548      39.864767       1.283770       0.248898   
std         5.865198      22.731872       3.932441       4.803890   
min        12.000000      10.000000       0.000000       0.000000   
25%        25.000000      28.000000       0.000000       0.000000   
50%        29.000000      33.000000       1.000000       0.000000   
75%        33.000000      39.000000       2.000000       0.000000   
max        50.000000      99.000000      99.000000      99.000000   

              ILOP_R          CIG_0            BMI       RF_PDIAB  \
count  616382.000000  616382.000000  616382.000000  616382.000000   
mean      744.656895       1.543457      28.957594       0.010140   
std       332.641749       8.383737      12.512235       0.100185   
min         3.000000       0.000000      13.000000       0.000000   
25%       888.000000       0.0000

In [22]:
X = df.drop('AB_AVEN1', axis=1)
y = df['AB_AVEN1']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Selection and Training

In [23]:
# Define the models
log_reg = LogisticRegression(max_iter=1000)
svc = SVC()
rand_forest = RandomForestClassifier()

In [24]:
# Define the utility function
def print_performance_metric(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

In [25]:
# Baseline performance
print_performance_metric(log_reg, X_train, y_train, X_test, y_test)


              precision    recall  f1-score   support

           0       0.95      1.00      0.98    117405
           1       0.39      0.00      0.01      5872

    accuracy                           0.95    123277
   macro avg       0.67      0.50      0.49    123277
weighted avg       0.93      0.95      0.93    123277

[[117378     27]
 [  5855     17]]


In [26]:
print_performance_metric(rand_forest, X_train, y_train, X_test, y_test)


              precision    recall  f1-score   support

           0       0.95      0.99      0.97    117405
           1       0.16      0.03      0.05      5872

    accuracy                           0.95    123277
   macro avg       0.56      0.51      0.51    123277
weighted avg       0.92      0.95      0.93    123277

[[116455    950]
 [  5694    178]]


In [27]:
#Support Vector Machines (SVM) with the default Radial basis function (RBF) kernel can be computationally expensive, 
#especially when dealing with large datasets or high-dimensional data. 
#print_performance_metric(svc, X_train, y_train, X_test, y_test)


# Experiment 1 - Scaling Continuous Features

Row(MAGER=25, FAGECOMB=29, PRIORLIVE=1, PRIORDEAD=0, ILOP_R=888, CIG_0=0, BMI=22.7, RF_PDIAB=0, RF_PHYPE=0, DPLURAL=1, SEX=0, AB_AVEN1=0)

In [28]:
# Identify continuous features to scale
continuous_features = ['MAGER', 'FAGECOMB', 'BMI', 'PRIORLIVE', 'PRIORDEAD', 'ILOP_R', 'CIG_0']

# Scale the continuous features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[continuous_features])

# Replace original features with the scaled ones
scaled_df = df.copy()
scaled_df[continuous_features] = scaled_features

scaled_df.head()

Unnamed: 0,MAGER,FAGECOMB,PRIORLIVE,PRIORDEAD,ILOP_R,CIG_0,BMI,RF_PDIAB,RF_PHYPE,DPLURAL,SEX,AB_AVEN1
0,-0.691972,-0.477953,-0.072161,-0.051812,0.430924,-0.184101,-0.500118,0,0,1,0,0
1,1.524494,-0.038042,-0.326457,-0.051812,0.430924,-0.184101,-0.83579,0,0,1,0,0
2,0.160515,2.601426,0.182134,-0.051812,0.430924,1.008685,-0.683939,0,0,1,1,0
3,-0.521474,2.601426,0.182134,-0.051812,0.430924,-0.184101,-0.572048,0,0,1,0,0
4,-1.544459,-0.873874,-0.326457,-0.051812,0.430924,-0.184101,-0.635986,0,0,1,1,1


In [30]:
X = scaled_df.drop('AB_AVEN1', axis=1)
y = scaled_df['AB_AVEN1']

X_train_2, X_test_2, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Retrain and evaluate the models using 2nd order polynomial features
print("Performance with scaled features:")
print("\nLogistic Regression:")
print_performance_metric(log_reg, X_train_2, y_train, X_test_2, y_test)
print("\nRandom Forest:")
print_performance_metric(rand_forest, X_train_2, y_train, X_test_2, y_test)
#print("\nSupport Vector Machine:")
#print_performance_metric(svc, X_train_2, y_train, X_test_2, y_test)

Performance with scaled features:

Logistic Regression:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98    117405
           1       0.45      0.00      0.01      5872

    accuracy                           0.95    123277
   macro avg       0.70      0.50      0.49    123277
weighted avg       0.93      0.95      0.93    123277

[[117387     18]
 [  5857     15]]

Random Forest:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97    117405
           1       0.17      0.03      0.05      5872

    accuracy                           0.95    123277
   macro avg       0.56      0.51      0.51    123277
weighted avg       0.92      0.95      0.93    123277

[[116470    935]
 [  5687    185]]


# Experiment 2 - Polynomial Features



In [31]:
from sklearn.preprocessing import PolynomialFeatures

# Create PolynomialFeatures objects for 2nd order features
poly_2 = PolynomialFeatures(degree=2, include_bias=False)

# Generate the polynomial features
X_poly_2 = poly_2.fit_transform(scaled_df.drop('AB_AVEN1', axis=1))

# Split the data into train and test sets
X_train_3, X_test_3, y_train, y_test = train_test_split(X_poly_2, y, test_size=0.2, random_state=42)

# Retrain and evaluate the models using 2nd order polynomial features
print("Performance with 2nd order polynomial features:")
print("\nLogistic Regression:")
print_performance_metric(log_reg, X_train_3, y_train, X_test_3, y_test)
print("\nRandom Forest:")
print_performance_metric(rand_forest, X_train_3, y_train, X_test_3, y_test)
#print("\nSupport Vector Machine:")
#print_performance_metric(svc, X_train_2, y_train, X_test_2, y_test)



Performance with 2nd order polynomial features:

Logistic Regression:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.95      1.00      0.98    117405
           1       0.50      0.00      0.00      5872

    accuracy                           0.95    123277
   macro avg       0.73      0.50      0.49    123277
weighted avg       0.93      0.95      0.93    123277

[[117397      8]
 [  5864      8]]

Random Forest:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97    117405
           1       0.17      0.03      0.05      5872

    accuracy                           0.95    123277
   macro avg       0.56      0.51      0.51    123277
weighted avg       0.92      0.95      0.93    123277

[[116488    917]
 [  5681    191]]


# Experiment 3 - Dimensinatility Reduction


In [41]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Apply PCA and LDA transformations
pca = PCA(n_components=0.95)
lda = LDA()

X_pca = pca.fit_transform(scaled_df.drop('AB_AVEN1', axis=1))
X_lda = lda.fit_transform(scaled_df.drop('AB_AVEN1', axis=1), y)

# Split the data into train and test sets
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.2, random_state=42)
X_train_lda, X_test_lda, y_train_lda, y_test_lda = train_test_split(X_lda, y, test_size=0.2, random_state=42)



In [42]:
# Retrain and evaluate the models using PCA transformed features
print("Performance with PCA transformed features:")
print("\nLogistic Regression:")
print_performance_metric(log_reg, X_train_pca, y_train_pca, X_test_pca, y_test_pca)

print("\nRandom Forest:")
print_performance_metric(rand_forest, X_train_pca, y_train_pca, X_test_pca, y_test_pca)
#print("\nSupport Vector Machine:")
#print_performance_metric(svc, X_train_pca, y_train, X_test_pca, y_test)

# Retrain and evaluate the models using LDA transformed features
print("\n\nPerformance with LDA transformed features:")
print("\nLogistic Regression:")
print_performance_metric(log_reg, X_train_lda, y_train_lda, X_test_lda, y_test_lda)

print("\nRandom Forest:")
print_performance_metric(rand_forest, X_train_lda, y_train_lda, X_test_lda, y_test_lda)
#print("\nSupport Vector Machine:")
#print_performance_metric(svc, X_train_lda, y_train, X_test_lda, y_test)

Performance with PCA transformed features:

Logistic Regression:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.95      1.00      0.98    117405
           1       0.00      0.00      0.00      5872

    accuracy                           0.95    123277
   macro avg       0.48      0.50      0.49    123277
weighted avg       0.91      0.95      0.93    123277

[[117405      0]
 [  5872      0]]

Random Forest:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97    117405
           1       0.10      0.02      0.03      5872

    accuracy                           0.95    123277
   macro avg       0.53      0.50      0.50    123277
weighted avg       0.91      0.95      0.93    123277

[[116554    851]
 [  5773     99]]


Performance with LDA transformed features:

Logistic Regression:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98    117405
           1       0.41      0.00      0.01      5872

    accuracy                 

# GridSearch for Random Forest

In [47]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [2, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the RandomForestClassifier
rand_forest = RandomForestClassifier()

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rand_forest, param_grid=param_grid,
                           cv=4, verbose=1, n_jobs=-1, scoring="recall")

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found:", best_params)

# Get the best estimator
best_rand_forest = grid_search.best_estimator_

# Evaluate the best model
print_performance_metric(best_rand_forest, X_train, y_train, X_test, y_test)

Fitting 4 folds for each of 81 candidates, totalling 324 fits
Best parameters found: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
              precision    recall  f1-score   support

           0       0.95      1.00      0.98    117405
           1       0.62      0.01      0.02      5872

    accuracy                           0.95    123277
   macro avg       0.79      0.51      0.50    123277
weighted avg       0.94      0.95      0.93    123277

[[117360     45]
 [  5799     73]]


# Creating Pipeline

LDA transformed features and Random Forest with parameters {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50} look like best model for the Birth Data.

In [56]:
from sklearn.metrics import accuracy_score

# Create the pipeline
pipeline = Pipeline([
    ('lda', LDA(n_components=1)),  # LDA transformation
    ('rf', RandomForestClassifier(max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50, random_state=42))  # Random Forest classifier
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Test the pipeline
y_pred = pipeline.predict(X_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:", report)
print("Confusion Matrix:", conf)


Accuracy: 0.9525783398363036
Classification Report:               precision    recall  f1-score   support

           0       0.95      1.00      0.98    117405
           1       0.61      0.01      0.02      5872

    accuracy                           0.95    123277
   macro avg       0.78      0.51      0.50    123277
weighted avg       0.94      0.95      0.93    123277

Confusion Matrix: [[117360     45]
 [  5801     71]]
