In [1]:
# Importing necessary Libraries
import numpy as np
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Dataset

In [2]:
# Single place to change where the raw CSVs live; every path below is built from it.
DATA_DIR = "../data"

# NOTE(review): ignore_errors=True is passed only for the two inpatient files, so
# parse problems in those files are suppressed instead of raised. Confirm which
# columns need it and consider an explicit schema override instead.

# Reading Train data
train_beneficiary = pl.read_csv(f"{DATA_DIR}/Train_Beneficiarydata.csv")
train_inpatient = pl.read_csv(f"{DATA_DIR}/Train_Inpatientdata.csv", ignore_errors=True)
train_outpatient = pl.read_csv(f"{DATA_DIR}/Train_Outpatientdata.csv")
train = pl.read_csv(f"{DATA_DIR}/Train.csv")

# Reading Test data
test_beneficiary = pl.read_csv(f"{DATA_DIR}/Test_Beneficiarydata.csv")
test_inpatient = pl.read_csv(f"{DATA_DIR}/Test_Inpatientdata.csv", ignore_errors=True)
test_outpatient = pl.read_csv(f"{DATA_DIR}/Test_Outpatientdata.csv")
test = pl.read_csv(f"{DATA_DIR}/Test.csv")

## Exploring Beneficiary Data

In [3]:
# Report the (rows, columns) shape of both beneficiary splits.
train_shape = train_beneficiary.shape
test_shape = test_beneficiary.shape
print(f"Shape of Train_Beneficiarydata: {train_shape}")
print(f"Shape of Test_Beneficiarydata: {test_shape}")

Shape of Train_Beneficiarydata: (138556, 25)
Shape of Test_Beneficiarydata: (63968, 25)


In [4]:
# List the column names available on the training beneficiary table.
beneficiary_columns = train_beneficiary.columns
print(f"Columns in Train_Beneficiarydata: {beneficiary_columns}")

Columns in Train_Beneficiarydata: ['BeneID', 'DOB', 'DOD', 'Gender', 'Race', 'RenalDiseaseIndicator', 'State', 'County', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov', 'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease', 'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression', 'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke', 'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt']


In [5]:
# Display the column name -> dtype mapping of the training beneficiary table
# (rendered as the cell's output because it is the last expression).
train_beneficiary.schema

Schema([('BeneID', String),
        ('DOB', String),
        ('DOD', String),
        ('Gender', Int64),
        ('Race', Int64),
        ('RenalDiseaseIndicator', String),
        ('State', Int64),
        ('County', Int64),
        ('NoOfMonths_PartACov', Int64),
        ('NoOfMonths_PartBCov', Int64),
        ('ChronicCond_Alzheimer', Int64),
        ('ChronicCond_Heartfailure', Int64),
        ('ChronicCond_KidneyDisease', Int64),
        ('ChronicCond_Cancer', Int64),
        ('ChronicCond_ObstrPulmonary', Int64),
        ('ChronicCond_Depression', Int64),
        ('ChronicCond_Diabetes', Int64),
        ('ChronicCond_IschemicHeart', Int64),
        ('ChronicCond_Osteoporasis', Int64),
        ('ChronicCond_rheumatoidarthritis', Int64),
        ('ChronicCond_stroke', Int64),
        ('IPAnnualReimbursementAmt', Int64),
        ('IPAnnualDeductibleAmt', Int64),
        ('OPAnnualReimbursementAmt', Int64),
        ('OPAnnualDeductibleAmt', Int64)])

In [6]:
# Count the distinct BeneID values in the training beneficiary table
num_unique_beneficiaries = train_beneficiary.get_column("BeneID").n_unique()
print(f"Number of unique beneficiaries: {num_unique_beneficiaries}")


Number of unique beneficiaries: 138556


In [7]:
# Recode Gender from the raw coding (1: Male, 2: Female) to a binary flag
# (1: Male, 0: Female).
# Only the value 2 is rewritten; every other value is passed through unchanged.
# This keeps the cell safe to re-run (already-recoded 0s are not flipped to 1)
# and leaves nulls as nulls instead of silently counting them as Male.
train_beneficiary = train_beneficiary.with_columns(
    pl.when(pl.col("Gender") == 2)
    .then(0)
    .otherwise(pl.col("Gender"))
    .alias("Gender")
)

In [8]:
# Parse DOB into a Date and derive the birth year / birth month from it.
# NOTE: despite their names, "Patient_Age_Year" and "Patient_Age_Month" hold the
# calendar year and month of birth, not an age. The names are kept as-is so that
# any later cell referencing them keeps working.
# Casting to Utf8 before parsing lets this cell be re-run after DOB has already
# been converted to a Date; without the cast the `.str` parsing would be applied
# to a non-string column on the second run.
dob_as_date = pl.col("DOB").cast(pl.Utf8).str.strptime(pl.Date, "%Y-%m-%d")

train_beneficiary = train_beneficiary.with_columns(
    dob_as_date.alias("DOB"),
    dob_as_date.dt.year().alias("Patient_Age_Year"),
    dob_as_date.dt.month().alias("Patient_Age_Month"),
)

In [9]:
# Flag deceased beneficiaries: a recorded DOD gives 1 (Dead), a missing DOD gives 0 (Alive).
# NOTE: this must run while missing DOD values are still null; once they are
# filled in, every row would be flagged as dead.
has_death_date = pl.col("DOD").is_not_null()

train_beneficiary = train_beneficiary.with_columns(
    pl.when(has_death_date).then(1).otherwise(0).alias("Dead_or_Alive")
)

In [10]:
# Replace null "DOD" entries with the latest death date found in the column.
# max() skips nulls, so no explicit drop is needed before taking it.
# NOTE(review): on a fresh run DOD is still a string column here, so the maximum
# is a string comparison. That matches chronological order only for zero-padded
# YYYY-MM-DD values, which is the format the later parsing step expects. Confirm.
max_dod = train_beneficiary.get_column("DOD").max()

train_beneficiary = train_beneficiary.with_columns(
    pl.col("DOD").fill_null(pl.lit(max_dod)).alias("DOD")
)

In [11]:
# Make sure both date columns are real Date values. Casting to Utf8 first means
# the parse works whether a column is still a string or already a Date;
# strict=False turns unparseable values into nulls instead of raising.
date_columns = ("DOD", "DOB")
parsed_dates = [
    pl.col(name).cast(pl.Utf8).str.strptime(pl.Date, "%Y-%m-%d", strict=False).alias(name)
    for name in date_columns
]

# Age in years at DOD, rounded to one decimal. A null DOD (or DOB) yields a null
# AGE through null propagation, so no explicit null guard is needed.
# NOTE(review): dividing by 365.0 ignores leap days, so ages come out slightly
# high; consider 365.25 if that precision matters.
age_in_years = ((pl.col("DOD") - pl.col("DOB")).dt.total_days() / 365.0).round(1)

# Parse dates, add AGE, then discard DOD, which is not needed after this point.
train_beneficiary = (
    train_beneficiary
    .with_columns(parsed_dates)
    .with_columns(age_in_years.alias("AGE"))
    .drop("DOD")
)

In [12]:
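# NOTE: age bucketing is currently disabled. In polars, bare strings passed to
# `.then()` / `.otherwise()` are parsed as column names, so the labels below are
# wrapped in `pl.lit(...)` to keep the snippet runnable if it is re-enabled.
# train_beneficiary = train_beneficiary.with_columns(
#     pl.when(pl.col("AGE") <= 40)
#     .then(pl.lit("Young"))
#     .when(pl.col("AGE") <= 60)
#     .then(pl.lit("Mid"))
#     .when(pl.col("AGE") <= 80)
#     .then(pl.lit("Old"))
#     .otherwise(pl.lit("Very Old"))
#     .alias("AGE_groups")
# )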