# Hospital Readmission Prediction - Feature Engineering

This notebook focuses on feature engineering for the hospital readmission prediction model. We'll transform raw features into more informative ones and prepare the data for modeling.

In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.preprocessing import StandardScaler

# Plot appearance: matplotlib style sheet first, then the seaborn colour palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('Set2')

# pandas display: never truncate columns, and allow wide console output
pd.options.display.max_columns = None
pd.options.display.width = 1000

# Suppress every warning category for the rest of the session
warnings.filterwarnings('ignore')

## 1. Load the Data

In [2]:
# Read the raw records from the CSV that sits next to this notebook
raw_csv = 'diabetic_data.csv'
data = pd.read_csv(raw_csv)

# Report the dimensions, then preview the first rows as the cell output
print(f"Dataset shape: {data.shape}")
data.head()

Dataset shape: (101766, 50)


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


## 2. Basic Preprocessing

In [None]:
# Columns that this notebook reads from the loaded frame, in this cell and in
# the feature-engineering / feature-selection cells that follow. Checking them
# up front makes a schema mismatch fail here with one clear message instead of
# surfacing as separate KeyErrors further down.
# NOTE(review): the preview printed after loading 'diabetic_data.csv' does not
# show these columns (e.g. no 'admission_date' or 'readmission_30d') — confirm
# that the intended input file is being loaded.
required_columns = [
    'patient_id', 'admission_date', 'discharge_date', 'age', 'gender',
    'primary_diagnosis', 'discharge_disposition', 'diabetes', 'heart_failure',
    'copd', 'hypertension', 'renal_disease', 'previous_admissions',
    'emergency_admission', 'medication_count', 'medication_adherence',
    'days_to_readmission', 'readmission_30d',
]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    # KeyError is what the column lookups below would raise anyway; this one
    # just names every missing column at once.
    raise KeyError(
        f"Loaded dataset is missing columns required by this notebook: {missing_columns}"
    )

# Convert date columns to datetime
data['admission_date'] = pd.to_datetime(data['admission_date'])
data['discharge_date'] = pd.to_datetime(data['discharge_date'])

# Create a copy for feature engineering so `data` keeps the untouched columns
df = data.copy()

## 3. Feature Engineering

In [None]:
# 3.1 Extract temporal features
df['admission_month'] = df['admission_date'].dt.month
df['admission_day_of_week'] = df['admission_date'].dt.dayofweek  # Monday=0 ... Sunday=6
df['admission_quarter'] = df['admission_date'].dt.quarter
df['is_weekend_admission'] = df['admission_day_of_week'].isin([5, 6]).astype(int)

# Visualize readmission rate by day of week
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Reindex to all seven weekdays: the bar plot positions bars 0..n-1, so if a
# weekday had no admissions the remaining bars would shift left and the fixed
# tick labels below would name the wrong day. A missing day now shows as an
# empty slot instead.
day_readmission = (
    df.groupby('admission_day_of_week')['readmission_30d'].mean()
    .reindex(range(len(day_names)))
    * 100
)

fig, ax = plt.subplots(figsize=(10, 6))
day_readmission.plot(kind='bar', ax=ax)
ax.set_title('Readmission Rate by Day of Week')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Readmission Rate (%)')
ax.set_xticks(range(len(day_names)))
ax.set_xticklabels(day_names)
plt.show()

In [None]:
# 3.2 Create age groups
# pd.cut uses right-closed intervals by default, so the groups are
# [0, 40], (40, 65], (65, 75] and (75, inf):
#   * include_lowest=True keeps age 0 in the first group instead of turning it into NaN
#   * the open upper bound keeps ages above 100 in the last group instead of NaN
# Note that an age of exactly 40 falls in the '<40' group under these edges.
# NOTE(review): this needs a numeric 'age' column; the preview of
# 'diabetic_data.csv' shows age as bracketed strings such as '[0-10)' — confirm the input.
bins = [0, 40, 65, 75, np.inf]
labels = ['<40', '40-65', '65-75', '>75']
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, include_lowest=True)

# Visualize readmission rate by age group
# observed=False keeps every age group on the axis even if it has no rows,
# and states the grouping behaviour explicitly rather than relying on the default.
age_group_readmission = df.groupby('age_group', observed=False)['readmission_30d'].mean() * 100

fig, ax = plt.subplots(figsize=(10, 6))
age_group_readmission.plot(kind='bar', ax=ax)
ax.set_title('Readmission Rate by Age Group')
ax.set_xlabel('Age Group')
ax.set_ylabel('Readmission Rate (%)')
plt.show()

In [None]:
# 3.3 Create interaction features
# Each new column is the element-wise product of two existing columns.
pairwise_products = {
    'age_heart_failure': ('age', 'heart_failure'),
    'age_diabetes': ('age', 'diabetes'),
    'previous_emergency': ('previous_admissions', 'emergency_admission'),
    'medication_count_adherence': ('medication_count', 'medication_adherence'),
}
for new_column, (left, right) in pairwise_products.items():
    df[new_column] = df[left] * df[right]

# Row-wise sum over the five condition columns
comorbidity_columns = ['diabetes', 'heart_failure', 'copd', 'hypertension', 'renal_disease']
df['comorbidity_count'] = df[comorbidity_columns].sum(axis=1)

# Visualize readmission rate by comorbidity count
comorbidity_readmission = df.groupby('comorbidity_count')['readmission_30d'].mean() * 100

fig, ax = plt.subplots(figsize=(10, 6))
comorbidity_readmission.plot(kind='bar', ax=ax)
ax.set_title('Readmission Rate by Number of Comorbidities')
ax.set_xlabel('Number of Comorbidities')
ax.set_ylabel('Readmission Rate (%)')
plt.show()

In [None]:
# 3.4 Create risk score based on domain knowledge
# Hand-weighted linear combination of the columns below; medication adherence
# is the only term that lowers the score.
df['risk_score'] = (
    0.2 * (df['age'] > 75).astype(int) +
    0.15 * df['diabetes'] +
    0.25 * df['heart_failure'] +
    0.2 * df['copd'] +
    0.1 * df['renal_disease'] +
    0.05 * (df['previous_admissions'] > 2).astype(int) +
    0.15 * (df['medication_count'] > 8).astype(int) -
    0.3 * df['medication_adherence'] +
    0.1 * df['emergency_admission'] +
    0.1 * (df['discharge_disposition'] == 2).astype(int)
)

# Visualize risk score distribution by readmission status
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='risk_score', hue='readmission_30d', bins=30, multiple='dodge', ax=ax)
ax.set_title('Risk Score Distribution by Readmission Status')
ax.set_xlabel('Risk Score')
ax.set_ylabel('Count')

# Relabel the legend that seaborn already built for the hue variable.
# Calling plt.legend([...]) here instead would discard that legend and attach
# the labels to whichever bar patches matplotlib finds first, so colours and
# labels would no longer correspond.
# Assumes the hue levels appear in ascending order with the lower value
# meaning "not readmitted" — TODO confirm the encoding of 'readmission_30d'.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.set_title('')
    for text, label in zip(hue_legend.get_texts(), ['Not Readmitted', 'Readmitted']):
        text.set_text(label)
plt.show()

In [None]:
# 3.5 One-hot encode categorical variables
# drop_first=True omits the first level of each variable from the dummy columns.
categorical_columns = ['gender', 'age_group', 'primary_diagnosis', 'discharge_disposition']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Net change in column count (dummy columns added minus source columns replaced)
n_columns_before = df.shape[1]
n_columns_after = df_encoded.shape[1]
print(f"Shape after one-hot encoding: {df_encoded.shape}")
print(f"New features added: {n_columns_after - n_columns_before}")

## 4. Feature Selection

In [None]:
# Split the encoded frame into the target vector and the feature matrix.
# Identifier, raw date, and outcome columns are excluded from the features.
non_feature_columns = [
    'patient_id',
    'admission_date',
    'discharge_date',
    'readmission_30d',
    'days_to_readmission',
]
y = df_encoded['readmission_30d']
X = df_encoded.drop(columns=non_feature_columns)

print(f"Features shape: {X.shape}")

In [None]:
# 4.1 Feature importance using Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Get feature importances, ordered from most to least important
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Show at most 20 features. Clamping to the number of columns keeps the bar
# plot and the listing below working when X has fewer than 20 features
# (a fixed range(20) would otherwise mismatch the data length / index out of bounds).
top_n = min(20, X.shape[1])
top_indices = indices[:top_n]

# Plot the top features
plt.figure(figsize=(12, 8))
plt.title('Feature Importances')
plt.bar(range(top_n), importances[top_indices], align='center')
plt.xticks(range(top_n), X.columns[top_indices], rotation=90)
plt.tight_layout()
plt.show()

# Print the top features
print(f"Top {top_n} features by importance:")
for rank, feature_idx in enumerate(top_indices, start=1):
    print(f"{rank}. {X.columns[feature_idx]}: {importances[feature_idx]:.4f}")

In [None]:
# 4.2 Statistical feature selection using ANOVA F-value
# Ask for at most 20 features; k may not exceed the number of columns in X.
k_best = min(20, X.shape[1])
selector = SelectKBest(f_classif, k=k_best)
X_new = selector.fit_transform(X, y)

# Get selected feature names
mask = selector.get_support()
selected_features = X.columns[mask]

# Get scores for the selected features only
scores = selector.scores_
selected_scores = scores[mask]

# Sort by score, highest first
sorted_indices = np.argsort(selected_scores)[::-1]
sorted_features = selected_features[sorted_indices]
sorted_scores = selected_scores[sorted_indices]

# Plot
plt.figure(figsize=(12, 8))
plt.title('Feature Selection using ANOVA F-value')
plt.bar(range(len(sorted_features)), sorted_scores, align='center')
plt.xticks(range(len(sorted_features)), sorted_features, rotation=90)
plt.tight_layout()
plt.show()

# Report the number actually selected rather than a fixed "20"
print(f"Top {len(sorted_features)} features by ANOVA F-value:")
for i, (feature, score) in enumerate(zip(sorted_features, sorted_scores)):
    print(f"{i+1}. {feature}: {score:.4f}")

In [None]:
# 4.3 Recursive Feature Elimination (RFE)
# Wrap a random forest in RFE and keep 20 features.
rfe_estimator = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(estimator=rfe_estimator, n_features_to_select=20)
rfe.fit(X, y)

# Boolean support mask -> names of the retained columns (in column order)
rfe_mask = rfe.get_support()
rfe_features = X.columns[rfe_mask]

print("Top 20 features selected by RFE:")
for position, feature_name in enumerate(rfe_features, start=1):
    print(f"{position}. {feature_name}")

## 5. Compare Feature Selection Methods

In [None]:
# Compare the features selected by different methods
rf_features = X.columns[indices][:20]
anova_features = sorted_features

# Find common features (pairwise and across all three methods)
rf_set, anova_set, rfe_set = set(rf_features), set(anova_features), set(rfe_features)
common_all = rf_set & anova_set & rfe_set
common_rf_anova = rf_set & anova_set
common_rf_rfe = rf_set & rfe_set
common_anova_rfe = anova_set & rfe_set

print(f"Features common to all methods: {len(common_all)}")
print(f"Features common to RF and ANOVA: {len(common_rf_anova)}")
print(f"Features common to RF and RFE: {len(common_rf_rfe)}")
print(f"Features common to ANOVA and RFE: {len(common_anova_rfe)}")

# Sets of strings iterate in an order that changes between interpreter
# sessions, so sort before printing to keep the output reproducible.
print("\nFeatures common to all methods:")
for feature in sorted(common_all):
    print(f"- {feature}")

## 6. Final Feature Set

In [None]:
# Build the final feature set by voting across the three selection methods:
# a feature is kept when at least 2 of the 3 methods selected it.
all_selected_features = [*rf_features, *anova_features, *rfe_features]
feature_counts = pd.Series(all_selected_features).value_counts()
selected_by_two_or_more = feature_counts.ge(2)
final_features = feature_counts[selected_by_two_or_more].index.tolist()

print(f"Final feature set size: {len(final_features)}")
print("\nFinal features:")
for feature_name in final_features:
    print(f"- {feature_name}")

In [None]:
# Restrict the feature matrix to the voted-in columns
X_final = X[final_features]

# Standardize: learn the per-column statistics, then apply them to the same data
scaler = StandardScaler()
scaler.fit(X_final)
X_scaled = scaler.transform(X_final)

# Wrap the scaled array in a DataFrame so the column names are kept for display
X_scaled_df = pd.DataFrame(X_scaled, columns=X_final.columns)

print(f"Final scaled feature matrix shape: {X_scaled_df.shape}")
X_scaled_df.head()

## 7. Save Processed Data

In [None]:
# Save the final feature set for reference
# (`os` is imported in the notebook's import cell at the top.)
features_dir = '../data/features'
features_path = '../data/features/final_features.txt'
os.makedirs(features_dir, exist_ok=True)

# Save the list of final features, one name per line.
# An explicit encoding keeps the file identical across platforms even when a
# feature name contains non-ASCII characters.
with open(features_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{feature}\n" for feature in final_features)

print(f"Final feature list saved to {features_path}")

In [None]:
# Persist the scaled features together with the target column.
# The target is attached positionally (raw values, not index-aligned).
target_values = y.values
X_scaled_df['readmission_30d'] = target_values

# Write the combined frame to CSV without the index column
processed_path = '../data/features/processed_features.csv'
X_scaled_df.to_csv(processed_path, index=False)
print(f"Processed features saved to {processed_path}")

## 8. Summary of Feature Engineering

In this notebook, we performed several feature engineering steps:

1. **Temporal Features**:
   - Extracted month, day of week, and quarter from admission date
   - Created a weekend admission indicator

2. **Categorical Transformations**:
   - Created age groups
   - One-hot encoded categorical variables

3. **Interaction Features**:
   - Combined age with medical conditions
   - Combined previous admissions with emergency admission status
   - Combined medication count and adherence
   - Created comorbidity count

4. **Domain Knowledge Features**:
   - Created a risk score based on clinical factors

5. **Feature Selection**:
   - Used Random Forest importance
   - Applied ANOVA F-value selection
   - Performed Recursive Feature Elimination
   - Selected features that appeared in at least 2 methods

6. **Feature Scaling**:
   - Standardized the final feature set

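The final feature set includes a mix of demographic, medical, temporal, and interaction features that were selected by at least two of the three feature selection methods. Their predictive power for hospital readmissions still needs to be confirmed by training and evaluating a model on held-out data.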