In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the dataset from the working directory into the frame used by the
# cells below.
DATA_FILE = 'Preprocessing_V1.csv'
df = pd.read_csv(DATA_FILE)

In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(4000, 11)

What is the total number of missing or unknown values in the column Gender?

In [5]:
# Number of NaN entries in the Gender column.
gender_column = df['Gender']
gender_column.isnull().sum()


np.int64(0)

In [6]:
# Frequency of each Gender label; dropna=False keeps NaN as its own row
# when any are present.
gender_labels = df['Gender']
gender_labels.value_counts(dropna=False)


Gender
Female     2366
Male       1627
Unknown       7
Name: count, dtype: int64

In [7]:
# Placeholder strings that stand for an unknown gender.
UNKNOWN_GENDER_TOKENS = ['Unknown', 'unknown', '?']

# Count rows that are either NaN or carry a placeholder string, so this one
# number answers "missing or unknown" even when the column contains NaNs.
gender_values = df['Gender']
unknown_count = (gender_values.isna() | gender_values.isin(UNKNOWN_GENDER_TOKENS)).sum()
unknown_count


np.int64(7)

What is the total number of missing or unknown values in the column GlucoseLevel?

In [8]:
# Show the distinct GlucoseLevel values to check for placeholder entries.
glucose_distinct = df['GlucoseLevel'].unique()
print(glucose_distinct)

[ 54.6 108.8  64.1 ... 218.9  66.3 168.5]


In [9]:
# Number of NaN entries in the GlucoseLevel column.
glucose_column = df['GlucoseLevel']
glucose_column.isnull().sum()


np.int64(0)

In [10]:
# Strings treated as "unknown" markers for this column.
placeholder_tokens = ['Unknown', 'unknown', '?', '', 'NA', 'N/A', 'na', 'null', 'NULL']

glucose = df['GlucoseLevel']
# A row counts when it is already NaN or equals one of the placeholders.
missing_or_unknown = (glucose.isna() | glucose.isin(placeholder_tokens)).sum()

print("Total missing or unknown GlucoseLevel values:", missing_or_unknown)

Total missing or unknown GlucoseLevel values: 0


What is the total number of missing or unknown values in the column LivesIn?

In [11]:
# Show the distinct LivesIn labels to check for placeholder entries.
lives_in_distinct = df['LivesIn'].unique()
print(lives_in_distinct)

['City' 'Village' 'Unknown']


In [12]:
lives_in = df['LivesIn']
# Count rows that are NaN or carry the 'Unknown' label.
missing_or_unknown = (lives_in.isna() | lives_in.eq('Unknown')).sum()

print("Total missing or unknown ", missing_or_unknown)

Total missing or unknown  5


What is the total number of missing or unknown values in the column BMI?

In [13]:
# Show the distinct BMI values; NaN appears in the array when entries are missing.
bmi_distinct = df['BMI'].unique()
print(bmi_distinct)

[35.1 26.7 23.4 27.4 41.6 29.3 37.1 16.1 40.5 15.8 29.9 29.5 40.7 36.6
 31.5  nan 12.1 25.5 22.3 27.1 44.7 26.1 18.8 18.7 24.8 17.  37.6 20.6
 29.  56.2 30.7 25.3 23.  27.2 19.2 31.6 24.6 27.  24.5 18.2 52.  32.3
 42.7 30.  24.3 24.2 25.7 36.7 46.4 48.3 20.9 24.7 23.6 26.5 39.4 18.4
 25.6 25.9 54.6 31.9 14.6 38.7 23.7 27.3 29.2 39.7 30.1 28.1 35.7 14.3
 30.4 22.2 35.  44.5 36.3 25.2 26.6 31.4 36.8 25.8 38.4 43.2 20.4 30.6
 33.8 34.  26.2 29.6 30.2 22.9 38.9 16.3 23.3 25.1 34.1 45.7 37.3 26.4
 40.9 31.1 17.7 27.5 19.9 32.  35.9 32.1 24.9 23.8 18.  20.7 27.7 22.6
 13.1 19.4 28.5 28.8 21.7 19.6 27.8 41.  41.8 35.2 44.4 42.6 15.7 52.8
 23.1 38.5 22.7 18.3 42.3 43.4 51.5 24.  28.7 23.9 37.9 32.6 35.6 34.7
 28.3 33.2 32.5 44.1 34.2 22.  33.3 16.8 35.4 20.1 26.3 37.5 33.1 21.2
 33.  33.6 30.3 26.  34.8 31.8 42.4 25.4 23.2 19.3 27.9 36.1 43.  16.9
 20.5 33.9 28.2 24.1 41.1 32.2 26.8 30.8 22.5 29.7 40.  34.5 28.  37.7
 19.8 28.4 34.3 20.8 16.2 17.5 36.9 19.5 31.7 34.4 29.1 39.3 35.5 21.
 31.  2

In [14]:
# Coerce BMI to numeric before counting: real NaNs stay NaN, and any
# non-numeric text (such as the string 'nan') becomes NaN as well.
# Replacing the literal string 'nan' alone does nothing on a float column.
bmi_numeric = pd.to_numeric(df['BMI'], errors='coerce')
unknown = bmi_numeric.isna().sum()
print(unknown)

149


In [15]:
# Number of NaN entries in the BMI column.
bmi_column = df['BMI']
bmi_column.isnull().sum()

np.int64(149)

What is the total number of missing or unknown values in the column SmokingStatus?

In [17]:
# Show the distinct SmokingStatus labels to check for placeholder entries.
smoking_distinct = df['SmokingStatus'].unique()
print(smoking_distinct)

['never smoked' 'smokes' 'Unknown' 'formerly smoked']


In [19]:
smoking = df['SmokingStatus']
# Count rows that are NaN or carry the 'Unknown' label.
value = (smoking.isna() | (smoking == 'Unknown')).sum()
print(value)

1204


What is the mean value of the BMI in the dataset? Ignore the missing values if any.

In [20]:
# Mean BMI over the non-missing entries (skipna=True is the pandas default,
# spelled out here because the question asks to ignore missing values).
df['BMI'].mean(skipna=True)

np.float64(28.857958971695663)

How many people live in a city, have smoked at least once in their life, and have had a heart attack? Ignore records/rows with any missing values.

In [24]:
# List the distinct labels of each column used in the filter below.
for column_name in ('SmokingStatus', 'HeartAttack', 'LivesIn'):
    print(df[column_name].unique())

['never smoked' 'smokes' 'Unknown' 'formerly smoked']
['No' 'Yes']
['City' 'Village' 'Unknown']


In [27]:
# The question asks to ignore rows with any missing value, so filter the
# complete rows only instead of the raw frame.
complete_rows = df.dropna()

# City residents who had a heart attack and either smoke now or smoked before.
# The equality / isin tests also exclude the 'Unknown' label in LivesIn and
# SmokingStatus.
# NOTE(review): rows whose other columns hold 'Unknown' (e.g. Gender) are still
# counted — confirm whether those should be treated as missing too.
cond = complete_rows[
    (complete_rows['LivesIn'] == 'City')
    & (complete_rows['HeartAttack'] == 'Yes')
    & (complete_rows['SmokingStatus'].isin(['smokes', 'formerly smoked']))
]
count = cond.shape[0]
print(count)

52


In [29]:
# Work only with rows that have no NaN in any column.
df_clean = df.dropna()


def _count_rows(frame, **required):
    """Return how many rows of `frame` have every named column equal to its required value."""
    keep = pd.Series(True, index=frame.index)
    for column, wanted in required.items():
        keep &= frame[column] == wanted
    # int() so the result is a plain Python integer, like DataFrame.shape[0].
    return int(keep.sum())


# A: female, no tension, no heart attack, never married
A = _count_rows(df_clean, Gender="Female", HasTension="No", HeartAttack="No", NeverMarried="Yes")

# B: female, no tension, no heart attack, NeverMarried == "No"
B = _count_rows(df_clean, Gender="Female", HasTension="No", HeartAttack="No", NeverMarried="No")

# C: male, no tension, no heart attack, never married
C = _count_rows(df_clean, Gender="Male", HasTension="No", HeartAttack="No", NeverMarried="Yes")

# D: male, tension, heart attack, never married
D = _count_rows(df_clean, Gender="Male", HasTension="Yes", HeartAttack="Yes", NeverMarried="Yes")

A, B, C, D


(1295, 732, 799, 18)