In [1]:
import pandas as pd
import numpy as np

# 1. Data cleaning

In [None]:
data = pd.DataFrame({
    'Patient_ID': ['001', '002', '003', '004'],
    'Age': [45, np.nan, 33, 60],
    'Blood_Pressure': [120, 130, 'error', 140],
    'Glucose': [110, 105, 100, np.nan]
})

# Replace 'error' with median blood pressure
bp_values = pd.to_numeric(data['Blood_Pressure'], errors='coerce')
data['Blood_Pressure'] = bp_values.fillna(bp_values.median())

# Fill missing values with mean
data['Age'].fillna(data['Age'].mean(), inplace=True)
data['Glucose'].fillna(data['Glucose'].mean(), inplace=True)

print("\n1. Cleaned Data:")
print(data)


1. Cleaned Data:
  Patient_ID   Age  Blood_Pressure  Glucose
0        001  45.0           120.0    110.0
1        002  46.0           130.0    105.0
2        003  33.0           130.0    100.0
3        004  60.0           140.0    105.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Glucose'].fillna(data['Glucose'].mean(), inplace=True)


# 2. Data Integration

In [3]:
patients = pd.DataFrame({'Patient_ID': ['001'], 'Name': ['Ali'], 'Age': [45]})
vitals = pd.DataFrame({'Patient_ID': ['001'], 'Blood_Pressure': [120], 'Glucose': [110]})
merged = pd.merge(patients, vitals, on='Patient_ID')
print("\n2. Integrated Data:")
print(merged)


2. Integrated Data:
  Patient_ID Name  Age  Blood_Pressure  Glucose
0        001  Ali   45             120      110


# 3. Data Transformation


In [4]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

trans_data = pd.DataFrame({'Age': [45, 33], 'Glucose': [110, 100], 'Diagnosis': ['Diabetes', 'Normal']})
scaler = MinMaxScaler()
trans_data[['Glucose']] = scaler.fit_transform(trans_data[['Glucose']])
trans_data['Diagnosis'] = LabelEncoder().fit_transform(trans_data['Diagnosis'])
print("\n3. Transformed Data:")
print(trans_data)



3. Transformed Data:
   Age  Glucose  Diagnosis
0   45      1.0          0
1   33      0.0          1


# 4. Data Reduction


In [5]:
reduction_data = pd.DataFrame({
    'Age': [45, 33],
    'Blood_Pressure': [120, 110],
    'Glucose': [110, 100],
    'Cholesterol': [200, 180],
    'BMI': [25, 27],
    'Diagnosis': [1, 0]
})

reduced = reduction_data[['Age', 'Blood_Pressure', 'Glucose', 'Diagnosis']]
print("\n4. Reduced Data:")
print(reduced)


4. Reduced Data:
   Age  Blood_Pressure  Glucose  Diagnosis
0   45             120      110          1
1   33             110      100          0


# 5. Data Discretization


In [7]:
discretize_data = pd.DataFrame({'Age': [45, 33, 72], 'Glucose': [110, 100, 140]})

def age_group(age):
    if age <= 18:
        return 'Child'
    elif age <= 59:
        return 'Adult'
    else:
        return 'Senior'

def glucose_level(g):
    if g < 100:
        return 'Normal'
    elif g < 126:
        return 'Pre-Diabetes'
    else:
        return 'Diabetes'

discretize_data['Age_Group'] = discretize_data['Age'].apply(age_group)
discretize_data['Glucose_Level'] = discretize_data['Glucose'].apply(glucose_level)
print("\n5. Discretized Data:")
print(discretize_data[['Age_Group', 'Glucose_Level']])




5. Discretized Data:
  Age_Group Glucose_Level
0     Adult  Pre-Diabetes
1     Adult  Pre-Diabetes
2    Senior      Diabetes


# 6. Data Binning


In [8]:
bin_data = pd.DataFrame({'BMI': [18.5, 21.0, 23.5, 27.0, 30.5, 35.0]})
bins = [0, 20, 30, np.inf]
labels = ['Underweight', 'Normal', 'Overweight']
bin_data['Category'] = pd.cut(bin_data['BMI'], bins=bins, labels=labels)
print("\n6. Binned Data:")
print(bin_data)


6. Binned Data:
    BMI     Category
0  18.5  Underweight
1  21.0       Normal
2  23.5       Normal
3  27.0       Normal
4  30.5   Overweight
5  35.0   Overweight
