In [1]:
import arff
import pandas as pd

# Location of the ARFF dataset, relative to the notebook's working directory
arff_file_path = 'caesarian.csv.arff'

# Parse the file; the handle is closed as soon as the `with` block exits
with open(arff_file_path, 'r') as f:
    arff_data = arff.load(f)

# Pull the attribute definitions and the row records out of the parsed result
attributes = arff_data['attributes']
data = arff_data['data']

# The first element of every attribute entry is used as the column label
column_names = [name for name, *_ in attributes]

# Assemble the rows into a labelled DataFrame and print it
df = pd.DataFrame(data=data, columns=column_names)
print(df)


   Age Delivery number Delivery time Blood Pressure Heart Problem Caesarian
0   22               1             0              2             0         0
1   26               2             0              1             0         1
2   26               2             1              1             0         0
3   28               1             0              2             0         0
4   22               2             0              1             0         1
..  ..             ...           ...            ...           ...       ...
75  27               2             1              1             0         0
76  33               4             0              1             0         1
77  29               2             1              2             0         1
78  25               1             2              0             0         1
79  24               2             2              1             0         0

[80 rows x 6 columns]


In [2]:
# Preview the first five rows of the loaded data
df.head()

Unnamed: 0,Age,Delivery number,Delivery time,Blood Pressure,Heart Problem,Caesarian
0,22,1,0,2,0,0
1,26,2,0,1,0,1
2,26,2,1,1,0,0
3,28,1,0,2,0,0
4,22,2,0,1,0,1


In [4]:
# Summarise the frame: index range, per-column non-null counts and dtypes,
# and overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Age              80 non-null     object
 1   Delivery number  80 non-null     object
 2   Delivery time    80 non-null     object
 3   Blood Pressure   80 non-null     object
 4   Heart Problem    80 non-null     object
 5   Caesarian        80 non-null     object
dtypes: object(6)
memory usage: 3.9+ KB


Converting the columns from `object` to integer and categorical dtypes to reduce memory consumption

In [5]:
# Convert every column in a single pass (the original repeated the
# "Caesarian" conversion twice). All columns load as `object`; Age and
# Delivery number become integers, while the remaining columns are stored as
# categoricals. The category values remain strings, so later comparisons
# must use e.g. '1' rather than 1.
df = df.astype({
    "Age": int,
    "Delivery number": int,
    "Delivery time": "category",
    "Blood Pressure": "category",
    "Heart Problem": "category",
    "Caesarian": "category",
})

In [6]:
# Re-run the summary after the dtype conversion to check the new dtypes
# and the resulting memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Age              80 non-null     int32   
 1   Delivery number  80 non-null     int32   
 2   Delivery time    80 non-null     category
 3   Blood Pressure   80 non-null     category
 4   Heart Problem    80 non-null     category
 5   Caesarian        80 non-null     category
dtypes: category(4), int32(2)
memory usage: 1.6 KB


In [42]:
# C-section deliveries only (Caesarian is coded as the string '1'),
# ordered from the youngest mother to the oldest
df.loc[df["Caesarian"] == '1'].sort_values(by="Age")

Unnamed: 0,Age,Delivery number,Delivery time,Blood Pressure,Heart Problem,Caesarian
70,17,1,0,0,0,1
26,18,1,1,2,1,1
61,19,1,0,1,0,1
41,19,1,0,1,0,1
17,20,1,2,2,0,1
24,20,1,0,2,1,1
57,21,1,0,0,0,1
51,21,2,1,0,1,1
4,22,2,0,1,0,1
43,22,1,0,1,0,1


Exploring Age related data

Count of mothers less than 30 years of age

In [45]:
# Boolean mask selecting mothers younger than 30 (reused by later cells)
Agelessthan30 = df["Age"].lt(30)
# Number of non-null Age values among the selected rows
df.loc[Agelessthan30, "Age"].count()

53

Count of mothers less than 30 years of age and had C-section delivery

In [51]:
# Boolean mask selecting C-section deliveries (reused by later cells)
isCaesarian = (df["Caesarian"] == '1')
# Rows that satisfy both masks: under 30 and delivered by C-section
df.loc[Agelessthan30 & isCaesarian, "Age"].count()

29

In [54]:
# C-section rate (%) among mothers under 30, derived from the masks defined
# in the cells above instead of hand-copied counts (29 and 53)
(Agelessthan30 & isCaesarian).sum() * 100 / Agelessthan30.sum()

54.716981132075475

Count of mothers greater than or equal to 30 years of age

In [52]:
# Boolean mask selecting mothers aged 30 or older (reused by later cells)
Agegreaterthanequalto30 = df["Age"].ge(30)
# Number of non-null Age values among the selected rows
df.loc[Agegreaterthanequalto30, "Age"].count()

27

Count of mothers greater than or equal to 30 years of age and had C-section delivery

In [53]:
# Rows that satisfy both masks: aged 30 or older and delivered by C-section
df.loc[Agegreaterthanequalto30 & isCaesarian, "Age"].count()

17

In [55]:
# C-section rate (%) among mothers aged 30 or older, derived from the masks
# defined in the cells above instead of hand-copied counts (17 and 27)
(Agegreaterthanequalto30 & isCaesarian).sum() * 100 / Agegreaterthanequalto30.sum()

62.96296296296296

To explore age-related data, I split the sample into mothers under 30 years of age and mothers aged 30 or older. As expected, the under-30 group had a lower C-section rate (55%, 29 of 53) than the 30-and-over group (63%, 17 of 27).

Exploring deliveries where the baby is premature (a "preemie") or a latecomer

In [11]:
# Deliveries whose "Delivery time" code is '1' or '2'
df[df["Delivery time"].isin(['1', '2'])]

Unnamed: 0,Age,Delivery number,Delivery time,Blood Pressure,Heart Problem,Caesarian
2,26,2,1,1,0,0
5,26,1,1,0,0,0
9,27,1,1,1,0,1
11,33,1,1,0,0,1
12,23,1,1,1,0,0
14,29,1,2,0,1,1
15,25,1,2,0,0,0
17,20,1,2,2,0,1
19,24,1,2,0,1,1
20,26,1,1,1,0,0


In [12]:
# Number of C-section deliveries per "Delivery time" category.
# observed=False is passed explicitly because groupby otherwise emits a warning.
df.loc[df["Caesarian"] == '1'].groupby(by="Delivery time", observed=False).size()

Delivery time
0    30
1     8
2     8
dtype: int64

In [13]:
# Number of deliveries (all outcomes) per "Delivery time" category; these are
# the denominators for the C-section rates computed below
df.groupby("Delivery time", observed=False).size()

Delivery time
0    46
1    17
2    17
dtype: int64

In [14]:
# C-section rate (%) among full-term deliveries (Delivery time coded '0'),
# computed from the data instead of hand-copied counts (30 and 46)
is_full_term = df["Delivery time"] == '0'
(is_full_term & (df["Caesarian"] == '1')).sum() / is_full_term.sum() * 100

65.21739130434783

In [15]:
# C-section rate (%) among pre-term or post-term deliveries (Delivery time
# coded '1' or '2'), computed from the data instead of hand-copied counts
# (16 and 34)
is_off_term = df["Delivery time"].isin(['1', '2'])
(is_off_term & (df["Caesarian"] == '1')).sum() / is_off_term.sum() * 100

47.05882352941176

Conclusion - 
65% of full-term deliveries (30 of 46) resulted in a C-section. 
47% of deliveries that were either pre-term or post-term (16 of 34) resulted in a C-section.
Hence, this sample data set does not support my assumption that pre-/post-term deliveries 
lead to a higher chance of a C-section.

Exploring data when mother's blood pressure at time of delivery is abnormal

In [16]:
# Deliveries whose "Blood Pressure" code is '0' or '2'
df[df["Blood Pressure"].isin(['0', '2'])]

Unnamed: 0,Age,Delivery number,Delivery time,Blood Pressure,Heart Problem,Caesarian
0,22,1,0,2,0,0
3,28,1,0,2,0,0
5,26,1,1,0,0,0
11,33,1,1,0,0,1
14,29,1,2,0,1,1
15,25,1,2,0,0,0
17,20,1,2,2,0,1
19,24,1,2,0,1,1
21,33,2,0,0,1,1
22,25,1,1,2,0,0


Grouping total data by Blood Pressure category

In [17]:
# Number of deliveries (all outcomes) in each "Blood Pressure" category
df.groupby("Blood Pressure", observed=False).size()

Blood Pressure
0    20
1    40
2    20
dtype: int64

Grouping data by Blood Pressure category when delivery is C-section

In [18]:
# Number of C-section deliveries in each "Blood Pressure" category
df.loc[df["Caesarian"] == '1'].groupby(by="Blood Pressure", observed=False).size()

Blood Pressure
0    15
1    17
2    14
dtype: int64

In [19]:
# C-section rate (%) when blood pressure is normal (Blood Pressure coded '1'),
# computed from the data instead of hand-copied counts (17 and 40)
is_normal_bp = df["Blood Pressure"] == '1'
(is_normal_bp & (df["Caesarian"] == '1')).sum() / is_normal_bp.sum() * 100

42.5

In [20]:
# C-section rate (%) when blood pressure is low or high (Blood Pressure coded
# '0' or '2'), computed from the data instead of hand-copied counts (29 and 40)
is_abnormal_bp = df["Blood Pressure"].isin(['0', '2'])
(is_abnormal_bp & (df["Caesarian"] == '1')).sum() / is_abnormal_bp.sum() * 100

72.5

Conclusion
42.5% of deliveries (17 of 40) resulted in a C-section when the mother's blood pressure at the time of delivery was normal, whereas 72.5% (29 of 40) resulted in a C-section when the blood pressure was either low or high. So we can conclude from this data sample that the chance of a C-section increases when blood pressure is abnormal.

Exploring data based on number of deliveries:

Grouping total data by Delivery number category

In [21]:
# Number of deliveries (all outcomes) for each "Delivery number" value
df.groupby("Delivery number", observed=False).size()

Delivery number
1    41
2    27
3    10
4     2
dtype: int64

Grouping data by Delivery number category when delivery is C-section

In [22]:
# Number of C-section deliveries for each "Delivery number" value
df.loc[df["Caesarian"] == '1'].groupby(by="Delivery number", observed=False).size()

Delivery number
1    22
2    15
3     7
4     2
dtype: int64

In [23]:
# C-section rate (%) for first deliveries, computed from the data instead of
# hand-copied counts (22 and 41). "Delivery number" was converted to integers
# earlier in the notebook, so it is compared with 1 rather than '1'.
is_first_delivery = df["Delivery number"] == 1
(is_first_delivery & (df["Caesarian"] == '1')).sum() / is_first_delivery.sum() * 100

53.65853658536586

In [24]:
# C-section rate (%) for second deliveries, computed from the data instead of
# hand-copied counts (15 and 27). "Delivery number" was converted to integers
# earlier in the notebook, so it is compared with 2 rather than '2'.
is_second_delivery = df["Delivery number"] == 2
(is_second_delivery & (df["Caesarian"] == '1')).sum() / is_second_delivery.sum() * 100

55.55555555555556

In [25]:
# C-section rate (%) for third deliveries, computed from the data instead of
# hand-copied counts (7 and 10). "Delivery number" was converted to integers
# earlier in the notebook, so it is compared with 3 rather than '3'.
is_third_delivery = df["Delivery number"] == 3
(is_third_delivery & (df["Caesarian"] == '1')).sum() / is_third_delivery.sum() * 100

70.0

Conclusion:
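From this limited dataset of 80 women, I can conclude that the chance of a C-section increases with the number of deliveries a mother has gone through: 54% for first deliveries (22 of 41), 56% for second deliveries (15 of 27), and 70% for third deliveries (7 of 10); both fourth deliveries in the sample were C-sections.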

Exploring data based on heart problem:

Grouping total data by Heart Problem category

In [26]:
# Number of deliveries (all outcomes) in each "Heart Problem" category
df.groupby("Heart Problem", observed=False).size() # observed=False is passed explicitly because groupby otherwise emits a warning

Heart Problem
0    50
1    30
dtype: int64

Grouping data by Heart Problem category when delivery is C-section

In [27]:
# Number of C-section deliveries in each "Heart Problem" category
df.loc[df["Caesarian"] == '1'].groupby(by="Heart Problem", observed=False).size()

Heart Problem
0    22
1    24
dtype: int64

In [28]:
# C-section rate (%) for mothers without a heart problem (Heart Problem coded
# '0'), computed from the data instead of hand-copied counts (22 and 50)
no_heart_problem = df["Heart Problem"] == '0'
(no_heart_problem & (df["Caesarian"] == '1')).sum() / no_heart_problem.sum() * 100

44.0

In [29]:
# C-section rate (%) for mothers with a heart problem (Heart Problem coded
# '1'), computed from the data instead of hand-copied counts (24 and 30)
has_heart_problem = df["Heart Problem"] == '1'
(has_heart_problem & (df["Caesarian"] == '1')).sum() / has_heart_problem.sum() * 100

80.0

Conclusion
From this sample, I can conclude that the chance of a C-section is 80% (24 of 30) when the mother has a heart problem versus only 44% (22 of 50) when the mother has a healthy heart.