# Data Prep

In [2]:
import pandas as pd
import sklearn.metrics as m
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Data

In [3]:
# Load the feature matrix and the target; both CSVs carry a saved index
# column ('Unnamed: 0') that is discarded on read.
X = pd.read_csv('X.csv').drop(columns='Unnamed: 0')
y = pd.read_csv('y.csv').drop(columns='Unnamed: 0')
y.columns = ['readmitted']

# Collapse the three raw labels into a binary target: only '<30' counts as
# 'readmitted'; '>30' and 'NO' are both mapped to 'not readmitted'.
label_map = {'<30': 'readmitted', '>30': 'not readmitted', 'NO': 'not readmitted'}
y = y['readmitted'].map(label_map).astype("category")

# The first 13 columns are treated as numeric, every remaining column as
# categorical.
numeric = X.columns[:13]
categorical = X.columns[13:]
X[categorical] = X[categorical].apply(pd.Categorical)

# Class priors: share of rows per target class, in value_counts() order.
target_counts = y.value_counts()
priors = (target_counts / len(y)).tolist()
priors

[0.8877685402068903, 0.1122314597931097]

## Categorical dummies & train/validate/test split

In [4]:
# One-hot encode the categorical columns, dropping the first level of each.
X = pd.get_dummies(X, drop_first=True)

# Two-stage split with a shared seed: hold out 10% for test, then take 1/9 of
# the remaining 90% for validation (roughly an 80/10/10 train/validate/test split).
X_remaining, X_test, y_remaining, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1984
)
X_train, X_validate, y_train, y_validate = train_test_split(
    X_remaining, y_remaining, test_size=(1/9), random_state=1984
)

## Scale numeric

In [5]:
# Standardise the numeric columns and pass every other column through unchanged.
c_scaler = ColumnTransformer([('c_scaler', StandardScaler(), numeric)],
                             remainder='passthrough')

# Fit the scaler on the training split ONLY, then reuse those training
# statistics for validation and test. Calling fit_transform on each split
# would refit the scaler per split, scaling the three sets inconsistently
# and leaking held-out statistics into preprocessing.
#
# NOTE(review): ColumnTransformer emits the transformed columns first and the
# passthrough remainder after them, so relabelling with X.columns assumes the
# `numeric` columns are still the leading columns of X -- TODO confirm.
X_train    = pd.DataFrame(c_scaler.fit_transform(X_train), columns=X.columns)
X_validate = pd.DataFrame(c_scaler.transform(X_validate),  columns=X.columns)
X_test     = pd.DataFrame(c_scaler.transform(X_test),      columns=X.columns)

## SMOTE

In [6]:
# Oversample with SMOTE using a fixed seed. Only the training split is
# resampled; the validation and test splits are not passed to the sampler.
oversample = SMOTE(random_state=1984)
resampled = oversample.fit_resample(X_train, y_train)

# Re-wrap the resampled features as a DataFrame labelled with X's columns.
X_train = pd.DataFrame(resampled[0], columns=X.columns)
y_train = resampled[1]

## Save

In [None]:
# Write each split to its own CSV in the working directory (features first,
# then targets), keeping the default index column in every file.
for split_frame, csv_name in [
    (X_train,    'X_train.csv'),
    (X_validate, 'X_validate.csv'),
    (X_test,     'X_test.csv'),
    (y_train,    'y_train.csv'),
    (y_validate, 'y_validate.csv'),
    (y_test,     'y_test.csv'),
]:
    split_frame.to_csv(csv_name)