In [1]:
import pandas as pd 

In [None]:
# Load the raw patient records; the relative path is resolved against the current working directory.
csv_path = "realworld_medical_dirty.csv"
dataset = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows as a quick sanity check of the load
dataset.head(n=5)

Unnamed: 0,Patient_ID,Age,Gender,Blood_Pressure,Cholesterol,BMI,Smoker,Diagnosis,Admission_Date,Notes
0,P1000,55.0,FEMALE,120.0,,35.4,Yes,Heart Disease,2023-12-27,
1,P1001,65.0,Male,150.0,180.0,27.8,,,2023-01-01,
2,P1002,45.0,Male,120.0,220.0,22.5,No,,2023-12-14,
3,P1003,65.0,Male,,180.0,35.4,No,,2023-07-09,
4,P1004,65.0,Male,120.0,300.0,40.1,,,2023-07-10,Follow-up required


In [4]:
# Count the missing (NaN) entries in each column
dataset.isna().sum()

Patient_ID         0
Age               17
Gender            15
Blood_Pressure    14
Cholesterol       20
BMI               20
Smoker            20
Diagnosis         48
Admission_Date     0
Notes             43
dtype: int64

In [5]:
# Checking data types
# info() prints, per column, the non-null count and dtype, along with the
# index range and the frame's approximate memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Patient_ID      100 non-null    object 
 1   Age             83 non-null     float64
 2   Gender          85 non-null     object 
 3   Blood_Pressure  86 non-null     float64
 4   Cholesterol     80 non-null     float64
 5   BMI             80 non-null     float64
 6   Smoker          80 non-null     object 
 7   Diagnosis       52 non-null     object 
 8   Admission_Date  100 non-null    object 
 9   Notes           57 non-null     object 
dtypes: float64(4), object(6)
memory usage: 7.9+ KB


In [6]:
# Drop rows that exactly repeat an earlier row; the first occurrence is kept.
dataset.drop_duplicates(keep="first", inplace=True)

In [7]:
# Impute gaps in the numeric measurements with each column's own median
med_col = ["Age", "Blood_Pressure", "Cholesterol", "BMI"]
# median() yields one value per column; fillna aligns those values on column name
column_medians = dataset[med_col].median()
dataset[med_col] = dataset[med_col].fillna(column_medians)

In [8]:
# Parse the 'Admission_Date' strings into pandas datetime values
admission_raw = dataset['Admission_Date']
dataset['Admission_Date'] = pd.to_datetime(admission_raw)

In [9]:
# Normalise the casing of 'Gender' labels (e.g. "FEMALE" -> "Female")
gender_raw = dataset['Gender']
dataset['Gender'] = gender_raw.str.title()

In [10]:
# Missing 'Smoker' entries are imputed as 'Yes'
# NOTE(review): this labels every patient with no recorded status as a smoker — confirm that assumption is intended.
smoker_filled = dataset['Smoker'].fillna(value="Yes")
dataset['Smoker'] = smoker_filled

In [11]:
# Replace missing 'Gender' entries with the most frequent label
# mode() can return several values on a tie; the first one is used.
most_common_gender = dataset["Gender"].mode()[0]
dataset['Gender'] = dataset['Gender'].fillna(most_common_gender)

In [12]:
# Fill missing values in 'Diagnosis' with 'Unknown'
dataset['Diagnosis'] = dataset['Diagnosis'].fillna("Unknown")

# Remove columns that are not needed for the analysis.
# errors="ignore" keeps this cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError. Rebinding the name
# (rather than inplace=True) follows the usual pandas idiom.
dataset = dataset.drop(columns=['Patient_ID', 'Notes'], errors="ignore")

In [13]:
# Count IQR-rule outliers in every numeric column, for better prediction
numeric_cols = dataset.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_cols:
    values = dataset[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    # A value is flagged when it lies strictly outside [low_fence, high_fence]
    flagged = (values < low_fence) | (values > high_fence)
    print(f" Column: {col} => Outliers: {int(flagged.sum())}")

 Column: Age => Outliers: 0
 Column: Blood_Pressure => Outliers: 0
 Column: Cholesterol => Outliers: 0
 Column: BMI => Outliers: 0


In [14]:
# Create Age_Group column
# Intervals: [0, 18] -> Child, (18, 35] -> Young, (35, 60] -> Adult, (60, inf] -> Old.
# include_lowest=True keeps an age of exactly 0 in "Child", and the open-ended
# top edge keeps ages above 100 in "Old"; with bins=[0, 18, 35, 60, 100] both
# cases silently became NaN.
age_bins = [0, 18, 35, 60, float("inf")]
age_labels = ["Child", "Young", "Adult", "Old"]
dataset['Age_Group'] = pd.cut(dataset['Age'], bins=age_bins, labels=age_labels, include_lowest=True)

In [15]:
# Confirm that no missing values remain after cleaning
dataset.isna().sum()

Age               0
Gender            0
Blood_Pressure    0
Cholesterol       0
BMI               0
Smoker            0
Diagnosis         0
Admission_Date    0
Age_Group         0
dtype: int64

In [16]:
# Inspect the last three rows
dataset.tail(n=3)

Unnamed: 0,Age,Gender,Blood_Pressure,Cholesterol,BMI,Smoker,Diagnosis,Admission_Date,Age_Group
97,65.0,Female,150.0,200.0,27.8,N,Unknown,2023-05-13,Old
98,25.0,Male,140.0,180.0,40.1,Yes,Diabetes,2023-12-24,Young
99,65.0,Male,140.0,300.0,35.4,N,Unknown,2023-09-16,Old


In [17]:
# Standardise 'Smoker' labels to "Yes"/"No".
# Whitespace is trimmed and casing is title-cased first (the same casing rule
# applied to 'Gender'), so variants such as "y", " N" or "YES" are mapped too
# instead of slipping past the exact-match replacement.
smoker_clean = dataset['Smoker'].str.strip().str.title()
dataset['Smoker'] = smoker_clean.replace({'Y': "Yes", 'N': "No"})

In [18]:
# Re-check the last three rows after normalising 'Smoker'
dataset.tail(n=3)

Unnamed: 0,Age,Gender,Blood_Pressure,Cholesterol,BMI,Smoker,Diagnosis,Admission_Date,Age_Group
97,65.0,Female,150.0,200.0,27.8,No,Unknown,2023-05-13,Old
98,25.0,Male,140.0,180.0,40.1,Yes,Diabetes,2023-12-24,Young
99,65.0,Male,140.0,300.0,35.4,No,Unknown,2023-09-16,Old
