In [3]:
# Exercise 1: Identifying and Handling Missing Data
import pandas as pd

# Sample dataset with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None],
    'Age': [24, 30, None, 22, 35],
    'Salary': [48000, None, 57000, None, 60000]
}
df = pd.DataFrame(data)

# Filling missing values and dropping rows
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].median(), inplace=True)
df.dropna(subset=['Name'], inplace=True)
print('After cleaning:\n', df)


After cleaning:
       Name    Age   Salary
0    Alice  24.00  48000.0
1      Bob  30.00  57000.0
2  Charlie  27.75  57000.0
3    David  22.00  57000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(), inplace=True)


In [2]:
# Exercise 2: Standardizing Categorical Data
# Sample dataset with inconsistent categorical values
data = {
    'Product': ['Laptop', 'Laptop', 'Desktop', 'Tablet', 'Tablet'],
    'Category': ['Electronics', 'electronics', 'Electronics', 'Gadgets', 'gadgets']
}
df = pd.DataFrame(data)

# Standardize category values
df['Category'] = df['Category'].str.capitalize()
print('Standardized Data:\n', df)


Standardized Data:
    Product     Category
0   Laptop  Electronics
1   Laptop  Electronics
2  Desktop  Electronics
3   Tablet      Gadgets
4   Tablet      Gadgets


In [7]:
# Practice Task 1: Identifying and Handling Missing Data
import pandas as pd

df = pd.read_csv('/home/arsa/Downloads/Titanic-Dataset.csv')

# Mengisi missing value kolom tertentu
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Menghapus baris yang Name-nya kosong
df.dropna(subset=['Name'], inplace=True)

print(df.head(10).to_string(index=False))


 PassengerId  Survived  Pclass                                                Name    Sex       Age  SibSp  Parch           Ticket    Fare Cabin Embarked
           1         0       3                             Braund, Mr. Owen Harris   male 22.000000      1      0        A/5 21171  7.2500   NaN        S
           2         1       1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38.000000      1      0         PC 17599 71.2833   C85        C
           3         1       3                              Heikkinen, Miss. Laina female 26.000000      0      0 STON/O2. 3101282  7.9250   NaN        S
           4         1       1        Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.000000      1      0           113803 53.1000  C123        S
           5         0       3                            Allen, Mr. William Henry   male 35.000000      0      0           373450  8.0500   NaN        S
           6         0       3                                    Moran, Mr.

In [10]:
import pandas as pd

df = pd.read_csv("/home/arsa/Downloads/Titanic-Dataset.csv")

print("=== Dataset Asli (5 baris pertama) ===")
print(df.head())


kolom_numerik = df.select_dtypes(include=['int64', 'float64']).columns

print("\nKolom numerik:", kolom_numerik.tolist())

for kolom in kolom_numerik:
    min_value = df[kolom].min()
    max_value = df[kolom].max()
    df[kolom + "_MinMax"] = (df[kolom] - min_value) / (max_value - min_value)

for kolom in kolom_numerik:
    mean_value = df[kolom].mean()
    std_value = df[kolom].std()
    df[kolom + "_ZScore"] = (df[kolom] - mean_value) / std_value

print("\n=== Dataset Setelah Normalisasi (5 baris pertama) ===")
print(df.head())


=== Dataset Asli (5 baris pertama) ===
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0        

In [11]:
import pandas as pd

df = pd.read_csv("/home/arsa/Downloads/Titanic-Dataset.csv")

# Standardisasi kolom kategorikal
kolom_kategorikal = df.select_dtypes(include=['object']).columns

for kolom in kolom_kategorikal:
    df[kolom] = (
        df[kolom]
        .astype(str)
        .str.strip()
        .str.lower()
        .str.replace(r'\s+', ' ', regex=True)
    )

# Hapus baris duplikat
df = df.drop_duplicates()

print("=== Dataset Setelah Standardisasi & Penghapusan Duplikasi ===")
print(df.head())


=== Dataset Setelah Standardisasi & Penghapusan Duplikasi ===
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            braund, mr. owen harris    male  22.0      1   
1  cumings, mrs. john bradley (florence briggs th...  female  38.0      1   
2                             heikkinen, miss. laina  female  26.0      0   
3       futrelle, mrs. jacques heath (lily may peel)  female  35.0      1   
4                           allen, mr. william henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         a/5 21171   7.2500   nan        s  
1      0          pc 17599  71.2833   c85        c  
2      0  ston/o2. 3101282   7.9250   nan        s  
3      0            113803  53.1000  c123     

In [16]:
import pandas as pd
import numpy as np

print("=== 1. LOAD DATASET ===")
df = pd.read_csv("/home/arsa/Downloads/real_drug_dataset.csv")
print(df.head(), "\n")

print("=== 2. CLEANING DATASET ===")

# --- 2.1 Standardisasi kolom kategorikal ---
print("\n--- 2.1 Standardizing Categorical Columns ---")
kolom_kategorikal = df.select_dtypes(include="object").columns
print("Kolom kategorikal:", kolom_kategorikal.tolist())

for kolom in kolom_kategorikal:
    df[kolom] = (
        df[kolom]
        .astype(str)
        .str.strip()
        .str.lower()
        .str.replace(r"\s+", " ", regex=True)
    )

print(df.head(), "\n")


# --- 2.2 Menghapus duplikasi ---
print("--- 2.2 Removing Duplicates ---")
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"Jumlah baris sebelum: {before}, sesudah: {after}")
print(df.head(), "\n")


# --- 2.3 Menangani nilai kosong ---
print("--- 2.3 Handling Missing Values ---")
missing_before = df.isnull().sum()
print("Missing values sebelum penanganan:\n", missing_before)

df = df.fillna({
    "Improvement_Score": df["Improvement_Score"].mean()
})

missing_after = df.isnull().sum()
print("\nMissing values setelah penanganan:\n", missing_after)
print(df.head(), "\n")

=== 1. LOAD DATASET ===
  Patient_ID  Age  Gender     Condition      Drug_Name  Dosage_mg  \
0      P0001   56    Male     Infection  Ciprofloxacin         50   
1      P0002   69    Male  Hypertension     Metoprolol        500   
2      P0003   46  Female    Depression      Bupropion        100   
3      P0004   32    Male      Diabetes      Glipizide        850   
4      P0005   60    Male    Depression      Bupropion        850   

   Treatment_Duration_days     Side_Effects  Improvement_Score  
0                        9           Nausea                8.5  
1                       24        Tiredness                8.7  
2                       25        Dry mouth                5.4  
3                       44  Low blood sugar                6.4  
4                       35          Anxiety                5.3   

=== 2. CLEANING DATASET ===

--- 2.1 Standardizing Categorical Columns ---
Kolom kategorikal: ['Patient_ID', 'Gender', 'Condition', 'Drug_Name', 'Side_Effects']
  Patien

In [17]:
print("=== 3. OUTLIER HANDLING (IQR Method) ===")

kolom_numerik = df.select_dtypes(include=['int64', 'float64']).columns
print("Kolom numerik:", kolom_numerik.tolist())

for kolom in kolom_numerik:
    Q1 = df[kolom].quantile(0.25)
    Q3 = df[kolom].quantile(0.75)
    IQR = Q3 - Q1

    batas_bawah = Q1 - 1.5 * IQR
    batas_atas = Q3 + 1.5 * IQR

    # Print info outlier per kolom
    outlier_count = df[(df[kolom] < batas_bawah) | (df[kolom] > batas_atas)].shape[0]
    print(f"- Kolom {kolom}: {outlier_count} outlier ditemukan")

    # Replace outlier
    df[kolom] = np.where(df[kolom] < batas_bawah, batas_bawah,
                 np.where(df[kolom] > batas_atas, batas_atas, df[kolom]))

print("\nDataset setelah menangani outlier:\n")
print(df.head(), "\n")


=== 3. OUTLIER HANDLING (IQR Method) ===
Kolom numerik: ['Age', 'Dosage_mg', 'Treatment_Duration_days', 'Improvement_Score']
- Kolom Age: 0 outlier ditemukan
- Kolom Dosage_mg: 0 outlier ditemukan
- Kolom Treatment_Duration_days: 0 outlier ditemukan
- Kolom Improvement_Score: 7 outlier ditemukan

Dataset setelah menangani outlier:

  Patient_ID   Age  Gender     Condition      Drug_Name  Dosage_mg  \
0      p0001  56.0    male     infection  ciprofloxacin       50.0   
1      p0002  69.0    male  hypertension     metoprolol      500.0   
2      p0003  46.0  female    depression      bupropion      100.0   
3      p0004  32.0    male      diabetes      glipizide      850.0   
4      p0005  60.0    male    depression      bupropion      850.0   

   Treatment_Duration_days     Side_Effects  Improvement_Score  
0                      9.0           nausea                8.5  
1                     24.0        tiredness                8.7  
2                     25.0        dry mouth       

In [18]:
print("=== 4. NORMALIZATION ===")

# --- 4.1 Min-Max Normalization ---
print("\n--- 4.1 Min-Max Normalization ---")
for kolom in kolom_numerik:
    min_v = df[kolom].min()
    max_v = df[kolom].max()
    df[kolom + "_MinMax"] = (df[kolom] - min_v) / (max_v - min_v)

print(df.head(), "\n")


# --- 4.2 Z-Score Standardization ---
print("--- 4.2 Z-Score Standardization ---")
for kolom in kolom_numerik:
    mean_v = df[kolom].mean()
    std_v = df[kolom].std()
    df[kolom + "_ZScore"] = (df[kolom] - mean_v) / std_v

print(df.head(), "\n")

print("=== 5. FINAL CLEANED DATASET ===")
print(df.head())
print("\nJumlah total kolom:", len(df.columns))
print("Jumlah total baris:", len(df))

=== 4. NORMALIZATION ===

--- 4.1 Min-Max Normalization ---
  Patient_ID   Age  Gender     Condition      Drug_Name  Dosage_mg  \
0      p0001  56.0    male     infection  ciprofloxacin       50.0   
1      p0002  69.0    male  hypertension     metoprolol      500.0   
2      p0003  46.0  female    depression      bupropion      100.0   
3      p0004  32.0    male      diabetes      glipizide      850.0   
4      p0005  60.0    male    depression      bupropion      850.0   

   Treatment_Duration_days     Side_Effects  Improvement_Score  Age_MinMax  \
0                      9.0           nausea                8.5    0.622951   
1                     24.0        tiredness                8.7    0.836066   
2                     25.0        dry mouth                5.4    0.459016   
3                     44.0  low blood sugar                6.4    0.229508   
4                     35.0          anxiety                5.3    0.688525   

   Dosage_mg_MinMax  Treatment_Duration_days_MinMa