In [2]:
import pandas as pd

# Locations of the pre-made train / test splits (relative to the working directory).
train_path, test_path = (
    "Project2026/data/splits/train.csv",
    "Project2026/data/splits/test_github.csv",
)

# Read both splits into DataFrames; the raw frames are left untouched by later cells.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))




In [3]:
from sklearn.impute import SimpleImputer
def clean(df, imputer=None, fit_imputer=True):
    """Turn a raw encounters frame into an encoded feature table.

    Steps, in order:
      1. Treat the literal '?' as a missing value.
      2. Drop identifier columns and columns not used for modeling.
      3. Truncate the diag_1 / diag_2 / diag_3 codes at the first '.'.
      4. Ordinal-encode age, max_glu_serum and A1Cresult.
      5. Impute max_glu_serum / A1Cresult (median by default).
      6. Drop every row that still contains a missing value.
      7. One-hot encode the nominal columns.

    The input frame is not modified. Columns that are not named in this
    function are kept as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in this function; a missing column
        raises KeyError.
    imputer : object with ``fit_transform`` / ``transform``, optional
        Imputer applied to the two lab-result columns. Defaults to a new
        ``SimpleImputer(strategy='median')`` fitted on ``df`` itself.
    fit_imputer : bool, default True
        When False, ``imputer`` is assumed to be fitted already and only its
        ``transform`` is called, so a test split can be filled with the
        statistics learned on the training split.

    Returns
    -------
    pandas.DataFrame
        The cleaned, encoded frame (rows with missing values removed).

    Raises
    ------
    ValueError
        If ``fit_imputer`` is False but no ``imputer`` was given.
    """
    if imputer is None:
        if not fit_imputer:
            raise ValueError("fit_imputer=False requires an already fitted imputer")
        imputer = SimpleImputer(strategy='median')  # TODO: maybe add a missing-value indicator here

    # '?' is used as the missing-value placeholder; replace() returns a new
    # frame, so the caller's data is never touched.
    df = df.replace('?', pd.NA)

    # remove identifiers and columns we don't need for modeling
    df = df.drop(columns=["encounter_id", "patient_nbr", "weight",
                          "payer_code", "medical_specialty"])

    # Remove decimals for diag_1, diag_2 and diag_3
    for col in ['diag_1', 'diag_2', 'diag_3']:
        missing = df[col].isna()
        # astype(str) may turn a missing value into a literal string such as
        # '<NA>' or 'nan' (pandas-version dependent). Re-mask those positions
        # so the dropna() below still sees them as missing.
        df[col] = df[col].astype(str).str.split(".").str[0].mask(missing)

    # ORDINAL ENCODING
    # age groups are already ordered, map to integers 0..9
    age_map = {
        '[0-10)': 0, '[10-20)': 1, '[20-30)': 2, '[30-40)': 3,
        '[40-50)': 4, '[50-60)': 5, '[60-70)': 6, '[70-80)': 7,
        '[80-90)': 8, '[90-100)': 9
    }
    df['age'] = df['age'].map(age_map)

    # glucose / A1C results have an implicit order as well; any value that is
    # not a key of the map becomes NaN and is imputed below
    glu_map = {'Norm': 0, '>200': 1, '>300': 2}
    df['max_glu_serum'] = df['max_glu_serum'].map(glu_map)

    a1c_map = {'Norm': 0, '>7': 1, '>8': 2}
    df['A1Cresult'] = df['A1Cresult'].map(a1c_map)

    # Impute missing data in the two lab-result columns
    ordinal_cols = ['max_glu_serum', 'A1Cresult']
    if fit_imputer:
        df[ordinal_cols] = imputer.fit_transform(df[ordinal_cols])
    else:
        df[ordinal_cols] = imputer.transform(df[ordinal_cols])

    # Remove all remaining rows with NA
    df = df.dropna()  # TODO: maybe replace (impute) instead of dropping

    # ONE-HOT ENCODING
    # select all categorical (nominal) columns
    nominal_cols = [
        'race', 'gender', 'admission_type_id',
        'discharge_disposition_id', 'admission_source_id',
        'change', 'diabetesMed',
        'diag_1', 'diag_2', 'diag_3',
        # all medication columns, 23 of them
        'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
        'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
        'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
        'miglitol', 'troglitazone', 'tolazamide', 'examide',
        'citoglipton', 'insulin', 'glyburide-metformin',
        'glipizide-metformin', 'glimepiride-pioglitazone',
        'metformin-rosiglitazone', 'metformin-pioglitazone'
    ]

    # use pandas.get_dummies to expand each category into binary columns
    df = pd.get_dummies(df, columns=nominal_cols, drop_first=False)

    return df


In [4]:

# Run the cleaning pipeline on the training split and compare the
# dimensions of the frame before and after.
cleaned = clean(train_df)
for label, frame in (("Original shape:", train_df), ("Cleaned shape:", cleaned)):
    print(label, frame.shape)

Original shape: (71236, 51)
Cleaned shape: (68615, 2145)


In [5]:
# Disabled experiment (kept commented out, not executed): low-variance feature
# filtering of the cleaned frame with VarianceThreshold.
# import seaborn as sns
# from sklearn.feature_selection import VarianceThreshold
# cleaned.nunique().sort_values()
# selector = VarianceThreshold(threshold=0.01)
# X = selector.fit_transform(cleaned)
# print(X.shape)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

# Clean both splits with the same pipeline.
cleaned_train, cleaned_test = clean(train_df), clean(test_df)

# Separate the target column ("readmitted") from the predictors.
X_train, y_train = cleaned_train.drop(columns="readmitted"), cleaned_train["readmitted"]
X_test, y_test = cleaned_test.drop(columns="readmitted"), cleaned_test["readmitted"]

# One-hot encoding can produce different columns per split: align the test
# matrix to the training layout, adding absent dummy columns as all-zero and
# discarding columns that were never seen in training.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Random forest with a fixed seed so the run is repeatable.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
pred = model.predict(X_test)

# Report overall accuracy, per-class metrics and the confusion matrix.
acc = accuracy_score(y_test, pred)
print(acc)
print(classification_report(y_test, pred))
print("Confusion matrix:\n", confusion_matrix(y_test, pred))




0.5858530950601345
              precision    recall  f1-score   support

         <30       0.55      0.01      0.02      1654
         >30       0.52      0.38      0.44      5211
          No       0.61      0.84      0.71      7852

    accuracy                           0.59     14717
   macro avg       0.56      0.41      0.39     14717
weighted avg       0.57      0.59      0.54     14717

Confusion matrix:
 [[  16  621 1017]
 [   9 1993 3209]
 [   4 1235 6613]]
