# Objective 1: Investigate Severity and Treatment Outcomes
We'll use a Random Forest Regressor for predicting treatment outcomes based on the severity of illness and other relevant features.

# Objective 2: Systematic Differences in Billing Errors
We'll use a Random Forest Classifier to identify systematic differences in billing errors based on payment type and other features.

# Objective 3: Patterns of Billing Disparities
We'll use a Gradient Boosting Regressor to predict billing amounts for different insurance types.

# Install SHAP (SHapley Additive exPlanations)

In [None]:
pip install shap

# Install LIME (Local Interpretable Model-agnostic Explanations)

In [None]:
pip install lime

# Install ELI5

In [None]:
pip install eli5

# Install Tensorflow

In [None]:
pip install tensorflow

# Importing the necessary Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, r2_score, mean_absolute_error, mean_absolute_percentage_error
import numpy as np
import shap
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import seaborn as sns
import eli5
from eli5.sklearn import PermutationImportance
from IPython.display import display, HTML

# Load the dataset

In [None]:

# Location of the SPARCS 2017 inpatient-discharge CSV; later cells reuse `file_path`.
file_path = '/content/Hospital_Inpatient_Discharges__SPARCS_De-Identified___2017_20240120 (1).csv'
# Read the whole file into the working DataFrame `data`.
data = pd.read_csv(filepath_or_buffer=file_path)


# EDA

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Exploratory data analysis.
#
# Prints basic summaries of the raw data, then plots distributions, correlations,
# outliers and relationships with the target column. All label-encoding needed for
# plotting is done on a copy (`eda_data`), so the raw `data` frame is left untouched
# for the preprocessing and modelling cells that follow.

# Reload the dataset so this cell always starts from the raw file contents.
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

# Summary statistics
print("\nSummary statistics:")
print(data.describe(include='all'))

# Data types and missing values
print("\nData types and missing values:")
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
data.info()

# Check for missing values
print("\nMissing values count:")
print(data.isnull().sum())

# Visualize missing values
msno.matrix(data)
plt.show()

# Encode categorical features on a copy: overwriting `data` in place would replace
# the real values of every text column with arbitrary integer codes, and later cells
# would then convert and model those codes instead of the real values.
categorical_features = data.select_dtypes(include=[object]).columns
eda_data = data.copy()
encoder = LabelEncoder()
for feature in categorical_features:
    eda_data[feature] = encoder.fit_transform(eda_data[feature].astype(str))

# Distribution of numerical features (after encoding, this includes the encoded columns)
numerical_features = eda_data.select_dtypes(include=[np.number]).columns
print("\nNumerical features:")
print(numerical_features)

for feature in numerical_features:
    plt.figure(figsize=(10, 6))
    sns.histplot(eda_data[feature].dropna(), kde=True)
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()

# Distribution of categorical features
print("\nCategorical features:")
print(categorical_features)

for feature in categorical_features:
    plt.figure(figsize=(10, 6))
    sns.countplot(y=eda_data[feature], order=eda_data[feature].value_counts().index)
    plt.title(f'Distribution of {feature}')
    plt.xlabel('Count')
    plt.ylabel(feature)
    plt.show()

# Correlation matrix
plt.figure(figsize=(12, 8))
corr_matrix = eda_data.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

# Pairplot for numerical features
sns.pairplot(eda_data[numerical_features])
plt.show()

# Box plots to detect outliers
for feature in numerical_features:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=eda_data[feature])
    plt.title(f'Box plot of {feature}')
    plt.xlabel(feature)
    plt.show()

# Relationship between numerical features and target variable (example: Total Charges)
target = 'Total Charges'
for feature in numerical_features:
    if feature != target:
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x=eda_data[feature], y=eda_data[target])
        plt.title(f'Relationship between {feature} and {target}')
        plt.xlabel(feature)
        plt.ylabel(target)
        plt.show()

# Distribution of target variable (example: Total Charges)
plt.figure(figsize=(10, 6))
sns.histplot(eda_data[target].dropna(), kde=True)
plt.title(f'Distribution of {target}')
plt.xlabel(target)
plt.ylabel('Frequency')
plt.show()

# Checking relationships in categorical features with target variable
for feature in categorical_features:
    plt.figure(figsize=(12, 6))
    sns.boxplot(x=eda_data[feature], y=eda_data[target])
    plt.title(f'Relationship between {feature} and {target}')
    plt.xlabel(feature)
    plt.ylabel(target)
    plt.xticks(rotation=45)
    plt.show()


# Data Preprocessing

In [None]:

# Convert the stay-length and money columns to numbers.
# Formatting characters are stripped first: with errors='coerce' any value that is not
# a bare number becomes NaN and its row is silently dropped below.
# NOTE(review): presumably the raw export can contain values such as "120 +" or
# "1,234.50" - confirm against the CSV. Already-numeric columns pass through unchanged.
for column in ('Length of Stay', 'Total Charges', 'Total Costs'):
    as_text = data[column].astype(str).str.replace(r'[$,+\s]', '', regex=True)
    data[column] = pd.to_numeric(as_text, errors='coerce')

# Keep only rows where all three values are available. The explicit copy makes
# `cleaned_data` an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
cleaned_data = data.dropna(subset=['Length of Stay', 'Total Charges', 'Total Costs']).copy()

# Create Billing Discrepancy column

In [None]:

# Billing discrepancy per discharge: amount charged minus the cost of care.
cleaned_data['Billing Discrepancy'] = cleaned_data['Total Charges'].sub(cleaned_data['Total Costs'])

# Objective 1: Predicting Treatment Outcomes
# Features and target

In [None]:

# Objective 1 inputs: severity code, length of stay and total charges.
features_obj1 = [
    'APR Severity of Illness Code',
    'Length of Stay',
    'Total Charges',
]
# Feature matrix and regression target (total costs) taken from the cleaned rows.
X1 = cleaned_data[features_obj1]
y1 = cleaned_data.loc[:, 'Total Costs']


# Split data

In [None]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_obj1 = train_test_split(X1, y1, test_size=0.2, random_state=42)
X1_train, X1_test, y1_train, y1_test = split_obj1


# Standardize features

In [None]:

# Learn mean/variance from the training rows only, then apply the same
# transformation to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X1_train)
X1_train_scaled = scaler.transform(X1_train)
X1_test_scaled = scaler.transform(X1_test)


# Train Random Forest Regressor

In [None]:

# Random forest of 100 trees with a fixed seed, fitted on the scaled training data.
rf_params = {'n_estimators': 100, 'random_state': 42}
rf_regressor = RandomForestRegressor(**rf_params)
rf_regressor.fit(X=X1_train_scaled, y=y1_train)


# Predict and evaluate using RFR



In [None]:

# Predict total costs for the held-out rows and report the mean squared error.
y1_pred = rf_regressor.predict(X1_test_scaled)
mse_obj1 = mean_squared_error(y_true=y1_test, y_pred=y1_pred)
print(f'Objective 1 - Mean Squared Error: {mse_obj1}')


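# Predict and evaluate the RFR with additional regression metrics

The Random Forest Regressor reaches an **R² of about 0.85**, i.e. it explains roughly 85% of the variance in the target (R² is a goodness-of-fit measure, not a classification accuracy):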
Objective 1 - Mean Squared Error: 5176.0114566283455
Objective 1 - Mean Absolute Error: 42.13125000000002
Objective 1 - Mean Absolute Percentage Error: 3.1584146341718866
Objective 1 - Root Mean Squared Error: 71.94450261575477
Objective 1 - **R^2 Score: 0.8479155196583776**


In [None]:

# Full regression report for the random forest on the held-out rows.
y1_pred = rf_regressor.predict(X1_test_scaled)

# Error metrics (RMSE is derived from the MSE) and the coefficient of determination.
mse_obj1 = mean_squared_error(y1_test, y1_pred)
rmse_obj1 = np.sqrt(mse_obj1)
mae_obj1 = mean_absolute_error(y1_test, y1_pred)
mape_obj1 = mean_absolute_percentage_error(y1_test, y1_pred)
r2_obj1 = r2_score(y1_test, y1_pred)

# One line per metric, in the same order as before.
print('\n'.join([
    f'Objective 1 - Mean Squared Error: {mse_obj1}',
    f'Objective 1 - Mean Absolute Error: {mae_obj1}',
    f'Objective 1 - Mean Absolute Percentage Error: {mape_obj1}',
    f'Objective 1 - Root Mean Squared Error: {rmse_obj1}',
    f'Objective 1 - R^2 Score: {r2_obj1}',
]))


# SHAP Analysis for Objective 1

In [None]:
# Tree-based SHAP explainer for the random forest regressor, evaluated on the
# scaled test rows (the same representation the model was trained on).
explainer_rf = shap.TreeExplainer(model=rf_regressor)
shap_values_rf = explainer_rf.shap_values(X1_test_scaled)

In [None]:
# Summary plot of the SHAP values; the unscaled test frame supplies feature
# names and the feature values shown in the plot.
print("SHAP summary plot for Objective 1")
shap.summary_plot(shap_values_rf, features=X1_test)

# LIME Analysis for Objective 1

In [None]:

# LIME explainer in regression mode, built from the scaled training matrix.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X1_train_scaled,
    mode='regression',
    feature_names=features_obj1,
    class_names=['Total Costs'],
    verbose=True,
)

### Explain a single instance

In [None]:

# Explain the prediction for one test row (row index `i` of the scaled test matrix).
i = 0
lime_exp = lime_explainer.explain_instance(
    data_row=X1_test_scaled[i],
    predict_fn=rf_regressor.predict,
)

# Showing the explanation after implementing LIME

In [None]:

# Render the LIME explanation inline, including the feature-value table.
print("LIME explanation for a single instance")
lime_display_options = {'show_table': True, 'show_all': False}
lime_exp.show_in_notebook(**lime_display_options)

# Show the explanation as a plot

In [None]:

# Draw the same explanation as a matplotlib bar chart and display it.
lime_fig_obj1 = lime_exp.as_pyplot_figure()
plt.show()

# ELI5 for Objective 1

## Implementing ELI5 for:

1.   Text Explanation
2.   Explanation for single prediction
3.   Also understanding the Permutation Importance



In [None]:
# ELI5 report for the Objective 1 random forest regressor.

# Global feature weights, first as plain text ...
weights_explanation_obj1 = eli5.explain_weights(rf_regressor, feature_names=features_obj1)
print(eli5.format_as_text(weights_explanation_obj1))

# ... then as the HTML table rendered by the notebook.
display(eli5.show_weights(rf_regressor, feature_names=features_obj1))

# Explanation of a single prediction (row `i` of the scaled test matrix).
i = 0
instance_obj1 = X1_test_scaled[i]
prediction_explanation_obj1 = eli5.explain_prediction(
    rf_regressor, instance_obj1, feature_names=features_obj1
)
print(eli5.format_as_text(prediction_explanation_obj1))
display(eli5.show_prediction(rf_regressor, instance_obj1, feature_names=features_obj1))

# Permutation importance measured on the held-out rows.
perm = PermutationImportance(rf_regressor, random_state=42)
perm.fit(X1_test_scaled, y1_test)
display(eli5.show_weights(perm, feature_names=features_obj1))

# Objective 2: Identifying Systematic Billing Errors

##### Features and target

In [None]:


# Objective 2 inputs and target.
features_obj2 = ['APR Severity of Illness Code', 'Total Charges', 'Total Costs', 'Payment Typology 1']
# Explicit copy: a column of X2 is overwritten by the encoding step that follows, and
# assigning into a slice of `cleaned_data` triggers pandas' SettingWithCopyWarning.
X2 = cleaned_data[features_obj2].copy()
# A discharge is labelled a "billing error" when charges exceed costs by more than 100.
# NOTE(review): the label is computed from 'Total Charges' - 'Total Costs', and both
# columns are also model inputs, so the classifier can reconstruct the label directly
# (target leakage). Confirm whether both columns should remain features.
y2 = cleaned_data['Billing Discrepancy'] > 100  # Example threshold for billing error


######Encode categorical features

In [None]:

# Replace the payment-type labels with integer codes from a fresh LabelEncoder.
encoder = LabelEncoder()
payment_codes_obj2 = encoder.fit_transform(X2['Payment Typology 1'])
X2['Payment Typology 1'] = payment_codes_obj2


###### Split data

In [None]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_obj2 = train_test_split(X2, y2, test_size=0.2, random_state=42)
X2_train, X2_test, y2_train, y2_test = split_obj2

# Train Random Forest Classifier for objective 2

In [None]:

# Random forest classifier of 100 trees with a fixed seed, fitted on the training rows.
rfc_params = {'n_estimators': 100, 'random_state': 42}
rf_classifier = RandomForestClassifier(**rfc_params)
rf_classifier.fit(X=X2_train, y=y2_train)


# Predict and evaluate RFC


*   Accuracy: 0.9166666666666666



In [None]:

# Classify the held-out rows, then report accuracy and the per-class report.
y2_pred = rf_classifier.predict(X2_test)
accuracy_obj2 = accuracy_score(y_true=y2_test, y_pred=y2_pred)
print(f'Objective 2 - Accuracy: {accuracy_obj2}')
report_obj2 = classification_report(y2_test, y2_pred)
print(report_obj2)


# SHAP Analysis for Objective 2

In [None]:

# Tree-based SHAP explainer for the random forest classifier, evaluated on the test rows.
explainer_rf_classifier = shap.TreeExplainer(model=rf_classifier)
shap_values_rf_classifier = explainer_rf_classifier.shap_values(X2_test)

# SHAP summary plot for Objective 2

In [None]:
print("SHAP summary plot for Objective 2")
# For a classifier, TreeExplainer.shap_values returns either a list with one array per
# class (older shap releases) or a single (rows, features, classes) array (newer ones).
# summary_plot reads a bare 3-D array as interaction values, so the values for the
# positive class (index 1, i.e. y2 == True) are selected explicitly.
if isinstance(shap_values_rf_classifier, list):
    shap_values_positive = shap_values_rf_classifier[1]
elif np.ndim(shap_values_rf_classifier) == 3:
    shap_values_positive = shap_values_rf_classifier[:, :, 1]
else:
    shap_values_positive = shap_values_rf_classifier
shap.summary_plot(shap_values_positive, X2_test)

In [None]:
# Full classification report for the random forest on the held-out rows.
y2_pred = rf_classifier.predict(X2_test)

# Headline scores plus the confusion matrix (rows: actual, columns: predicted).
accuracy_obj2 = accuracy_score(y2_test, y2_pred)
precision_obj2 = precision_score(y2_test, y2_pred)
recall_obj2 = recall_score(y2_test, y2_pred)
f1_obj2 = f1_score(y2_test, y2_pred)
conf_matrix_obj2 = confusion_matrix(y2_test, y2_pred)

# One line per metric, in the same order as before.
print('\n'.join([
    f'Objective 2 - Accuracy: {accuracy_obj2}',
    f'Objective 2 - Precision: {precision_obj2}',
    f'Objective 2 - Recall: {recall_obj2}',
    f'Objective 2 - F1 Score: {f1_obj2}',
    f'Objective 2 - Confusion Matrix:\n{conf_matrix_obj2}',
]))
print(classification_report(y2_test, y2_pred))


# LIME Analysis for Objective 2

In [None]:
# LIME setup for Objective 2.
# The features and labels are converted to plain numpy arrays, so the classifier is
# re-split and re-trained on those arrays before the explainer is built.
X2_np = X2.to_numpy()
y2_np = y2.to_numpy()

# Same 80/20 split and seed as before, now on the numpy arrays.
split_obj2_np = train_test_split(X2_np, y2_np, test_size=0.2, random_state=42)
X2_train, X2_test, y2_train, y2_test = split_obj2_np

# Re-train the random forest classifier on the array-based training set.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X2_train, y2_train)

# Quick check of the re-trained model.
y2_pred = rf_classifier.predict(X2_test)
accuracy_obj2 = accuracy_score(y2_test, y2_pred)
print(f'Objective 2 - Accuracy: {accuracy_obj2}')
print(classification_report(y2_test, y2_pred))

# LIME explainer in classification mode; column 3 ('Payment Typology 1') is
# declared categorical.
lime_explainer_obj2 = lime.lime_tabular.LimeTabularExplainer(
    training_data=X2_train,
    mode='classification',
    feature_names=features_obj2,
    categorical_features=[3],
    class_names=['No Billing Error', 'Billing Error'],
    verbose=True,
)

# Explain a single instance

In [None]:

# Explain the classifier's prediction for one test row (row index `i`).
i = 0
lime_exp_obj2 = lime_explainer_obj2.explain_instance(
    data_row=X2_test[i],
    predict_fn=rf_classifier.predict_proba,
)

# Render the explanation inline, including the feature-value table.
print("LIME explanation for a single instance")
lime_exp_obj2.show_in_notebook(show_table=True, show_all=False)

# Show the explanation using LIME as a plot for objective 2

In [None]:

# Draw the Objective 2 explanation as a matplotlib bar chart and display it.
lime_fig_obj2 = lime_exp_obj2.as_pyplot_figure()
plt.show()


# ELI5

# ELI5 Text Explanation

In [None]:

# ELI5 report for the Objective 2 random forest classifier.

# Global feature weights as plain text ...
weights_explanation_obj2 = eli5.explain_weights(rf_classifier, feature_names=features_obj2)
print(eli5.format_as_text(weights_explanation_obj2))

# ... and as the HTML table rendered by the notebook.
display(eli5.show_weights(rf_classifier, feature_names=features_obj2))

# Explanation of a single prediction (row `i` of the test matrix).
i = 0
instance_obj2 = X2_test[i]
prediction_explanation_obj2 = eli5.explain_prediction(
    rf_classifier, instance_obj2, feature_names=features_obj2
)
print(eli5.format_as_text(prediction_explanation_obj2))
display(eli5.show_prediction(rf_classifier, instance_obj2, feature_names=features_obj2))

# Permutation importance measured on the held-out rows.
perm = PermutationImportance(rf_classifier, random_state=42)
perm.fit(X2_test, y2_test)
display(eli5.show_weights(perm, feature_names=features_obj2))


# Objective 3: Predicting Billing Amounts for Different Insurance Types


*   Features and target



In [None]:
# Objective 3 inputs: severity code, payment type and procedure code.
features_obj3 = ['APR Severity of Illness Code', 'Payment Typology 1', 'CCS Procedure Code']
# Explicit copy: two columns of X3 are overwritten by the encoding step that follows, and
# assigning into a slice of `cleaned_data` triggers pandas' SettingWithCopyWarning.
X3 = cleaned_data[features_obj3].copy()
# Regression target: the billed amount.
y3 = cleaned_data['Total Charges']

In [None]:
# Encode the categorical inputs of Objective 3.
# Payment type: integer codes from the shared `encoder` (re-fitted on this column).
payment_codes_obj3 = encoder.fit_transform(X3['Payment Typology 1'])
X3['Payment Typology 1'] = payment_codes_obj3
# Procedure code: missing values become 0, then everything is cast to int.
procedure_codes_obj3 = cleaned_data['CCS Procedure Code'].fillna(0)
X3['CCS Procedure Code'] = procedure_codes_obj3.astype(int)

# Split data

In [None]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_obj3 = train_test_split(X3, y3, test_size=0.2, random_state=42)
X3_train, X3_test, y3_train, y3_test = split_obj3

# Train Gradient Boosting Regressor

In [None]:

# Gradient boosting regressor with 100 boosting stages and a fixed seed.
gbr_params = {'n_estimators': 100, 'random_state': 42}
gb_regressor = GradientBoostingRegressor(**gbr_params)
gb_regressor.fit(X=X3_train, y=y3_train)


# Predict and evaluate

In [None]:

# Predict total charges for the held-out rows and report the mean squared error.
y3_pred = gb_regressor.predict(X3_test)
mse_obj3 = mean_squared_error(y_true=y3_test, y_pred=y3_pred)
print(f'Objective 3 - Mean Squared Error: {mse_obj3}')

# SHAP Analysis for Objective 3

In [None]:

# Tree-based SHAP explainer for the gradient boosting regressor, evaluated on the test rows.
explainer_gb = shap.TreeExplainer(model=gb_regressor)
shap_values_gb = explainer_gb.shap_values(X3_test)

In [None]:
# Summary plot of the SHAP values for the gradient boosting regressor.
print("SHAP summary plot for Objective 3")
shap.summary_plot(shap_values_gb, features=X3_test)

In [None]:
# Dependence plot for the 'Total Charges' feature.
# Note: this uses the Objective 1 SHAP values (`shap_values_rf`) and test frame (`X1_test`).
print("SHAP dependence plot for 'Total Charges'")
shap.dependence_plot('Total Charges', shap_values=shap_values_rf, features=X1_test)

In [None]:
# Force Plot for a single instance (first test row of Objective 1).
# The interactive force plot is rendered with JavaScript, so the SHAP JS bundle has
# to be loaded in the cell that displays it; without it the notebook shows no plot.
shap.initjs()
print("SHAP force plot for a single instance")
# Kept as the last expression of the cell so the notebook displays the returned plot.
shap.force_plot(explainer_rf.expected_value, shap_values_rf[0], X1_test.iloc[0])

###### Waterfall Plot for a single instance

In [None]:

print("SHAP waterfall plot for a single instance")
# waterfall_plot needs a scalar base value. Depending on the shap version,
# TreeExplainer.expected_value can be a one-element array rather than a float, so it
# is reduced to a plain float first (a no-op when it is already a scalar).
base_value_rf = float(np.ravel(explainer_rf.expected_value)[0])
# Explanation for the first test row of Objective 1: its SHAP values, the model's
# base value, and the row's (unscaled) feature values for display.
shap.waterfall_plot(shap.Explanation(values=shap_values_rf[0],
                                     base_values=base_value_rf,
                                     data=X1_test.iloc[0]))


# Bar Plot

*   SHAP bar plot for Objective 1



In [None]:

# Bar chart of the SHAP summary for the Objective 1 model.
print("SHAP bar plot for Objective 1")
shap.summary_plot(shap_values_rf, features=X1_test, plot_type="bar")

# Flush any figure that is still pending.
plt.show()

# LIME Analysis for Objective 3

In [None]:
# LIME analysis for Objective 3.
# The features and target are converted to plain numpy arrays, so the regressor is
# re-split and re-trained on those arrays before the explainer is built.
X3_np = X3.to_numpy()
y3_np = y3.to_numpy()

# Same 80/20 split and seed as before, now on the numpy arrays.
split_obj3_np = train_test_split(X3_np, y3_np, test_size=0.2, random_state=42)
X3_train, X3_test, y3_train, y3_test = split_obj3_np

# Re-train the gradient boosting regressor on the array-based training set.
gb_regressor = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_regressor.fit(X3_train, y3_train)

# Quick check of the re-trained model.
y3_pred = gb_regressor.predict(X3_test)
mse_obj3 = mean_squared_error(y3_test, y3_pred)
print(f'Objective 3 - Mean Squared Error: {mse_obj3}')

# LIME explainer in regression mode; columns 1 ('Payment Typology 1') and
# 2 ('CCS Procedure Code') are declared categorical.
lime_explainer_obj3 = lime.lime_tabular.LimeTabularExplainer(
    training_data=X3_train,
    mode='regression',
    feature_names=features_obj3,
    categorical_features=[1, 2],
    class_names=['Total Charges'],
    verbose=True,
)

# Explain the prediction for one test row (row index `i`).
i = 0
lime_exp_obj3 = lime_explainer_obj3.explain_instance(
    data_row=X3_test[i],
    predict_fn=gb_regressor.predict,
)

# Render the explanation inline, including the feature-value table.
print("LIME explanation for a single instance")
lime_exp_obj3.show_in_notebook(show_table=True, show_all=False)

# Draw the same explanation as a matplotlib bar chart.
lime_fig_obj3 = lime_exp_obj3.as_pyplot_figure()
plt.show()

# ELI5 for Objective 3


In [None]:
# ELI5 report for the Objective 3 gradient boosting regressor.

# Global feature weights as plain text ...
weights_explanation_obj3 = eli5.explain_weights(gb_regressor, feature_names=features_obj3)
print(eli5.format_as_text(weights_explanation_obj3))

# ... and as the HTML table rendered by the notebook.
display(eli5.show_weights(gb_regressor, feature_names=features_obj3))

# Explanation of a single prediction (row `i` of the test matrix).
i = 0
instance_obj3 = X3_test[i]
prediction_explanation_obj3 = eli5.explain_prediction(
    gb_regressor, instance_obj3, feature_names=features_obj3
)
print(eli5.format_as_text(prediction_explanation_obj3))
display(eli5.show_prediction(gb_regressor, instance_obj3, feature_names=features_obj3))

# Permutation importance measured on the held-out rows.
perm = PermutationImportance(gb_regressor, random_state=42)
perm.fit(X3_test, y3_test)
display(eli5.show_weights(perm, feature_names=features_obj3))

# Predict and evaluate GBR (R^2 Score: 0.26861310772369096)


1.   Objective 3 - Mean Squared Error: 34945.09096476254
2.   Objective 3 - Mean Absolute Error: 134.2881684673238
3.   Objective 3 - Mean Absolute Percentage Error: 0.939080397251264
4.   Objective 3 - Root Mean Squared Error: 186.93606116734819
5.   Objective 3 - R^2 Score: 0.26861310772369096






In [None]:
# Full regression report for the gradient boosting regressor on the held-out rows.
y3_pred = gb_regressor.predict(X3_test)

# Error metrics (RMSE is derived from the MSE) and the coefficient of determination.
mse_obj3 = mean_squared_error(y3_test, y3_pred)
rmse_obj3 = np.sqrt(mse_obj3)
mae_obj3 = mean_absolute_error(y3_test, y3_pred)
mape_obj3 = mean_absolute_percentage_error(y3_test, y3_pred)
r2_obj3 = r2_score(y3_test, y3_pred)

# One line per metric, in the same order as before.
print('\n'.join([
    f'Objective 3 - Mean Squared Error: {mse_obj3}',
    f'Objective 3 - Mean Absolute Error: {mae_obj3}',
    f'Objective 3 - Mean Absolute Percentage Error: {mape_obj3}',
    f'Objective 3 - Root Mean Squared Error: {rmse_obj3}',
    f'Objective 3 - R^2 Score: {r2_obj3}',
]))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import seaborn as sns

# Objective 3 with a feed-forward neural network: predict 'Total Charges' from the
# severity code, payment type and procedure code, then plot diagnostics.

# Load the dataset
file_path = '/content/Hospital_Inpatient_Discharges__SPARCS_De-Identified___2017_20240120 (1).csv'
data = pd.read_csv(file_path)

# Data Preprocessing
# Formatting characters are stripped before conversion: with errors='coerce' any value
# that is not a bare number becomes NaN and its row is silently dropped below.
# NOTE(review): presumably the raw export can contain values such as "120 +" or
# "1,234.50" - confirm against the CSV. Already-numeric columns pass through unchanged.
for column in ('Length of Stay', 'Total Charges', 'Total Costs'):
    as_text = data[column].astype(str).str.replace(r'[$,+\s]', '', regex=True)
    data[column] = pd.to_numeric(as_text, errors='coerce')
# Independent copy so that adding columns below does not raise SettingWithCopyWarning.
cleaned_data = data.dropna(subset=['Length of Stay', 'Total Charges', 'Total Costs']).copy()

# Encode categorical features
encoder = LabelEncoder()

# Create Billing Discrepancy column (not used by this model; kept for later cells)
cleaned_data['Billing Discrepancy'] = cleaned_data['Total Charges'] - cleaned_data['Total Costs']

# Features and target for Objective 3
features_obj3 = ['APR Severity of Illness Code', 'Payment Typology 1', 'CCS Procedure Code']
X3 = cleaned_data[features_obj3].copy()
y3 = cleaned_data['Total Charges']

# Encode categorical features; missing procedure codes become 0
X3['Payment Typology 1'] = encoder.fit_transform(X3['Payment Typology 1'])
X3['CCS Procedure Code'] = cleaned_data['CCS Procedure Code'].fillna(0).astype(int)

# Split first, then standardize: fitting the scaler on all rows would leak the test
# rows' mean/variance into training.
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=42)
scaler = StandardScaler()
X3_train = scaler.fit_transform(X3_train)
X3_test = scaler.transform(X3_test)
# All rows scaled with the training statistics (keeps the `X3_scaled` name available).
X3_scaled = scaler.transform(X3)

# Build the neural network model: two hidden layers of 256 ReLU units with 20% dropout
model = Sequential()
model.add(Dense(256, input_dim=X3_train.shape[1], activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1))  # Output layer for regression

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
# NOTE(review): the test set doubles as the validation set here, so the validation
# curve is not an independent estimate; consider a separate validation split.
history = model.fit(X3_train, y3_train, epochs=50, batch_size=32, verbose=1, validation_data=(X3_test, y3_test))

# Predict and evaluate; predictions are flattened from (n, 1) to (n,) so they line up
# element-wise with y3_test.
y3_pred = model.predict(X3_test).ravel()
mse_obj3 = mean_squared_error(y3_test, y3_pred)
mae_obj3 = mean_absolute_error(y3_test, y3_pred)
r2_obj3 = r2_score(y3_test, y3_pred)

print(f'Objective 3 - Mean Squared Error: {mse_obj3}')
print(f'Objective 3 - Mean Absolute Error: {mae_obj3}')
print(f'Objective 3 - R^2 Score: {r2_obj3}')

# Actual vs predicted plot (the dashed red line marks perfect predictions)
plt.figure(figsize=(8, 6))
plt.scatter(y3_test, y3_pred)
plt.plot([y3_test.min(), y3_test.max()], [y3_test.min(), y3_test.max()], '--', color='red')
plt.xlabel('Actual Total Charges')
plt.ylabel('Predicted Total Charges')
plt.title('Actual vs Predicted Total Charges')
plt.show()

# Learning Curves
plt.figure(figsize=(12, 8))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Learning Curves')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Error Distribution
errors = y3_test - y3_pred
plt.figure(figsize=(8, 6))
sns.histplot(errors, kde=True)
plt.xlabel('Prediction Error')
plt.title('Distribution of Prediction Errors')
plt.show()


In [None]:
pip install dbn


In [None]:
pip install sklearn-deep-belief-network


# Neural network (Adam optimizer) for Objective 2 - Accuracy: 0.9166666666666666

*   Objective 2 - Precision: 1.0
*   Objective 2 - Recall: 0.9
*   Objective 2 - F1 Score: 0.9473684210526316
*   Objective 2 - Confusion Matrix:
[[2 0]
 [1 9]]
              precision    recall  f1-score   support

       False       0.67      1.00      0.80         2
        True       1.00      0.90      0.95        10

    accuracy                           0.92        12
   macro avg       0.83      0.95      0.87        12
weighted avg       0.94      0.92      0.92        12


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Objective 2 with a feed-forward neural network: classify discharges as
# "billing error" (discrepancy > 100) or not.

# Load the dataset
file_path = '/content/Hospital_Inpatient_Discharges__SPARCS_De-Identified___2017_20240120 (1).csv'
data = pd.read_csv(file_path)

# Data Preprocessing
# Formatting characters are stripped before conversion: with errors='coerce' any value
# that is not a bare number becomes NaN and its row is silently dropped below.
# NOTE(review): presumably the raw export can contain values such as "120 +" or
# "1,234.50" - confirm against the CSV. Already-numeric columns pass through unchanged.
for column in ('Length of Stay', 'Total Charges', 'Total Costs'):
    as_text = data[column].astype(str).str.replace(r'[$,+\s]', '', regex=True)
    data[column] = pd.to_numeric(as_text, errors='coerce')
# Independent copy so that adding columns below does not raise SettingWithCopyWarning.
cleaned_data = data.dropna(subset=['Length of Stay', 'Total Charges', 'Total Costs']).copy()

# Create Billing Discrepancy column
cleaned_data['Billing Discrepancy'] = cleaned_data['Total Charges'] - cleaned_data['Total Costs']

# Features and target for Objective 2
features_obj2 = ['APR Severity of Illness Code', 'Total Charges', 'Total Costs', 'Payment Typology 1']
X2 = cleaned_data[features_obj2].copy()
# Labels as 0/1 integers so they have the same type as the thresholded predictions.
# NOTE(review): the label is derived from 'Total Charges' - 'Total Costs', both of which
# are also inputs (target leakage) - confirm whether both should remain features.
y2 = (cleaned_data['Billing Discrepancy'] > 100).astype(int)  # Example threshold for billing error

# Encode categorical features
encoder = LabelEncoder()
X2['Payment Typology 1'] = encoder.fit_transform(X2['Payment Typology 1'])

# Split first, then standardize: fitting the scaler on all rows would leak the test
# rows' mean/variance into training.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)
scaler = StandardScaler()
X2_train = scaler.fit_transform(X2_train)
X2_test = scaler.transform(X2_test)
# All rows scaled with the training statistics (keeps the `X2_scaled` name available).
X2_scaled = scaler.transform(X2)

# Build the neural network model: two hidden layers of 256 ReLU units with 20% dropout
model = Sequential()
model.add(Dense(256, input_dim=X2_train.shape[1], activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the test set doubles as the validation set here; consider a separate
# validation split.
model.fit(X2_train, y2_train, epochs=50, batch_size=32, verbose=1, validation_data=(X2_test, y2_test))

# Predict and evaluate: probabilities above 0.5 count as "billing error"; the (n, 1)
# output is flattened so it lines up with y2_test.
y2_pred = (model.predict(X2_test).ravel() > 0.5).astype("int32")
accuracy_obj2 = accuracy_score(y2_test, y2_pred)
precision_obj2 = precision_score(y2_test, y2_pred)
recall_obj2 = recall_score(y2_test, y2_pred)
f1_obj2 = f1_score(y2_test, y2_pred)
conf_matrix_obj2 = confusion_matrix(y2_test, y2_pred)

print(f'Objective 2 - Accuracy: {accuracy_obj2}')
print(f'Objective 2 - Precision: {precision_obj2}')
print(f'Objective 2 - Recall: {recall_obj2}')
print(f'Objective 2 - F1 Score: {f1_obj2}')
print(f'Objective 2 - Confusion Matrix:\n{conf_matrix_obj2}')
print(classification_report(y2_test, y2_pred))


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import seaborn as sns

# Objective 2 with a feed-forward neural network, plus diagnostic plots
# (confusion matrix, ROC curve, precision-recall curve, learning curves).

# Load the dataset
file_path = '/content/Hospital_Inpatient_Discharges__SPARCS_De-Identified___2017_20240120 (1).csv'
data = pd.read_csv(file_path)

# Data Preprocessing
# Formatting characters are stripped before conversion: with errors='coerce' any value
# that is not a bare number becomes NaN and its row is silently dropped below.
# NOTE(review): presumably the raw export can contain values such as "120 +" or
# "1,234.50" - confirm against the CSV. Already-numeric columns pass through unchanged.
for column in ('Length of Stay', 'Total Charges', 'Total Costs'):
    as_text = data[column].astype(str).str.replace(r'[$,+\s]', '', regex=True)
    data[column] = pd.to_numeric(as_text, errors='coerce')
# Independent copy so that adding columns below does not raise SettingWithCopyWarning.
cleaned_data = data.dropna(subset=['Length of Stay', 'Total Charges', 'Total Costs']).copy()

# Create Billing Discrepancy column
cleaned_data['Billing Discrepancy'] = cleaned_data['Total Charges'] - cleaned_data['Total Costs']

# Features and target for Objective 2
features_obj2 = ['APR Severity of Illness Code', 'Total Charges', 'Total Costs', 'Payment Typology 1']
X2 = cleaned_data[features_obj2].copy()
# Labels as 0/1 integers so they have the same type as the thresholded predictions.
# NOTE(review): the label is derived from 'Total Charges' - 'Total Costs', both of which
# are also inputs (target leakage) - confirm whether both should remain features.
y2 = (cleaned_data['Billing Discrepancy'] > 100).astype(int)  # Example threshold for billing error

# Encode categorical features
encoder = LabelEncoder()
X2['Payment Typology 1'] = encoder.fit_transform(X2['Payment Typology 1'])

# Split first, then standardize: fitting the scaler on all rows would leak the test
# rows' mean/variance into training.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)
scaler = StandardScaler()
X2_train = scaler.fit_transform(X2_train)
X2_test = scaler.transform(X2_test)
# All rows scaled with the training statistics (keeps the `X2_scaled` name available).
X2_scaled = scaler.transform(X2)

# Build the neural network model: two hidden layers of 256 ReLU units with 20% dropout
model = Sequential()
model.add(Dense(256, input_dim=X2_train.shape[1], activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the test set doubles as the validation set here, so the validation
# curves are not an independent estimate; consider a separate validation split.
history = model.fit(X2_train, y2_train, epochs=50, batch_size=32, verbose=1, validation_data=(X2_test, y2_test))

# Predict probabilities and labels; the (n, 1) output is flattened so it lines up
# with y2_test, and probabilities above 0.5 count as "billing error".
y2_pred_proba = model.predict(X2_test).ravel()
y2_pred = (y2_pred_proba > 0.5).astype("int32")

# Evaluate the model
accuracy_obj2 = accuracy_score(y2_test, y2_pred)
precision_obj2 = precision_score(y2_test, y2_pred)
recall_obj2 = recall_score(y2_test, y2_pred)
f1_obj2 = f1_score(y2_test, y2_pred)
conf_matrix_obj2 = confusion_matrix(y2_test, y2_pred)

print(f'Objective 2 - Accuracy: {accuracy_obj2}')
print(f'Objective 2 - Precision: {precision_obj2}')
print(f'Objective 2 - Recall: {recall_obj2}')
print(f'Objective 2 - F1 Score: {f1_obj2}')
print(f'Objective 2 - Confusion Matrix:\n{conf_matrix_obj2}')
print(classification_report(y2_test, y2_pred))

# Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_obj2, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC Curve (the dashed diagonal is the chance level)
fpr, tpr, thresholds = roc_curve(y2_test, y2_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y2_test, y2_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, lw=2, color='blue')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

# Learning Curves (accuracy and loss share one axis)
plt.figure(figsize=(12, 8))
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Learning Curves')
plt.xlabel('Epochs')
plt.ylabel('Accuracy/Loss')
plt.legend()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import seaborn as sns

# Objective 3 with a feed-forward neural network (repeat run): predict 'Total Charges'
# from the severity code, payment type and procedure code, then plot diagnostics.

# Load the dataset
file_path = '/content/Hospital_Inpatient_Discharges__SPARCS_De-Identified___2017_20240120 (1).csv'
data = pd.read_csv(file_path)

# Data Preprocessing
# Formatting characters are stripped before conversion: with errors='coerce' any value
# that is not a bare number becomes NaN and its row is silently dropped below.
# NOTE(review): presumably the raw export can contain values such as "120 +" or
# "1,234.50" - confirm against the CSV. Already-numeric columns pass through unchanged.
for column in ('Length of Stay', 'Total Charges', 'Total Costs'):
    as_text = data[column].astype(str).str.replace(r'[$,+\s]', '', regex=True)
    data[column] = pd.to_numeric(as_text, errors='coerce')
# Independent copy so that adding columns below does not raise SettingWithCopyWarning.
cleaned_data = data.dropna(subset=['Length of Stay', 'Total Charges', 'Total Costs']).copy()

# Encode categorical features
encoder = LabelEncoder()

# Create Billing Discrepancy column (not used by this model; kept for later cells)
cleaned_data['Billing Discrepancy'] = cleaned_data['Total Charges'] - cleaned_data['Total Costs']

# Features and target for Objective 3
features_obj3 = ['APR Severity of Illness Code', 'Payment Typology 1', 'CCS Procedure Code']
X3 = cleaned_data[features_obj3].copy()
y3 = cleaned_data['Total Charges']

# Encode categorical features; missing procedure codes become 0
X3['Payment Typology 1'] = encoder.fit_transform(X3['Payment Typology 1'])
X3['CCS Procedure Code'] = cleaned_data['CCS Procedure Code'].fillna(0).astype(int)

# Split first, then standardize: fitting the scaler on all rows would leak the test
# rows' mean/variance into training.
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=42)
scaler = StandardScaler()
X3_train = scaler.fit_transform(X3_train)
X3_test = scaler.transform(X3_test)
# All rows scaled with the training statistics (keeps the `X3_scaled` name available).
X3_scaled = scaler.transform(X3)

# Build the neural network model: two hidden layers of 256 ReLU units with 20% dropout
model = Sequential()
model.add(Dense(256, input_dim=X3_train.shape[1], activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1))  # Output layer for regression

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
# NOTE(review): the test set doubles as the validation set here, so the validation
# curve is not an independent estimate; consider a separate validation split.
history = model.fit(X3_train, y3_train, epochs=50, batch_size=32, verbose=1, validation_data=(X3_test, y3_test))

# Predict and evaluate; predictions are flattened from (n, 1) to (n,) so they line up
# element-wise with y3_test.
y3_pred = model.predict(X3_test).ravel()
mse_obj3 = mean_squared_error(y3_test, y3_pred)
mae_obj3 = mean_absolute_error(y3_test, y3_pred)
r2_obj3 = r2_score(y3_test, y3_pred)

print(f'Objective 3 - Mean Squared Error: {mse_obj3}')
print(f'Objective 3 - Mean Absolute Error: {mae_obj3}')
print(f'Objective 3 - R^2 Score: {r2_obj3}')

# Actual vs predicted plot (the dashed red line marks perfect predictions)
plt.figure(figsize=(8, 6))
plt.scatter(y3_test, y3_pred)
plt.plot([y3_test.min(), y3_test.max()], [y3_test.min(), y3_test.max()], '--', color='red')
plt.xlabel('Actual Total Charges')
plt.ylabel('Predicted Total Charges')
plt.title('Actual vs Predicted Total Charges')
plt.show()

# Learning Curves
plt.figure(figsize=(12, 8))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Learning Curves')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Error Distribution
errors = y3_test - y3_pred
plt.figure(figsize=(8, 6))
sns.histplot(errors, kde=True)
plt.xlabel('Prediction Error')
plt.title('Distribution of Prediction Errors')
plt.show()


# Implementing PCA to improve GBR performance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import seaborn as sns

# Objective 3 - Gradient Boosting Regressor on PCA-reduced features.
# Reloads the raw CSV, rebuilds the Objective 3 feature matrix, projects it
# onto two principal components and predicts 'Total Charges'.

# Load the dataset
file_path = '/content/Hospital_Inpatient_Discharges__SPARCS_De-Identified___2017_20240120 (1).csv'
data = pd.read_csv(file_path)

# Data Preprocessing: coerce the numeric columns (unparseable values become
# NaN) and drop every row where one of them is missing.
data['Length of Stay'] = pd.to_numeric(data['Length of Stay'], errors='coerce')
data['Total Charges'] = pd.to_numeric(data['Total Charges'], errors='coerce')
data['Total Costs'] = pd.to_numeric(data['Total Costs'], errors='coerce')
cleaned_data = data.dropna(subset=['Length of Stay', 'Total Charges', 'Total Costs'])

# Features and target for Objective 3
features_obj3 = ['APR Severity of Illness Code', 'Payment Typology 1', 'CCS Procedure Code']
# .copy() gives X3 its own data, so the column assignments below do not write
# into a slice of cleaned_data (avoids pandas' SettingWithCopyWarning).
X3 = cleaned_data[features_obj3].copy()
y3 = cleaned_data['Total Charges']

# Encode categorical features
encoder = LabelEncoder()
X3['Payment Typology 1'] = encoder.fit_transform(X3['Payment Typology 1'])
# Missing procedure codes are replaced by 0 before the cast to int
X3['CCS Procedure Code'] = cleaned_data['CCS Procedure Code'].fillna(0).astype(int)

# Split data first, so the scaler and PCA are fitted on training rows only
# and no test-set statistics leak into the preprocessing.
X3_train_raw, X3_test_raw, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=42)

# Standardize features (statistics learned from the training rows)
scaler = StandardScaler()
X3_train_scaled = scaler.fit_transform(X3_train_raw)
X3_test_scaled = scaler.transform(X3_test_raw)
# Full scaled matrix; kept because the feature-selection cell that follows reads X3_scaled
X3_scaled = scaler.transform(X3)

# Apply PCA (components learned from the training rows)
pca = PCA(n_components=2)  # Adjust the number of components as needed
X3_train = pca.fit_transform(X3_train_scaled)
X3_test = pca.transform(X3_test_scaled)
X3_pca = pca.transform(X3_scaled)

# Train GBR model with PCA
gbr_pca = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbr_pca.fit(X3_train, y3_train)

# Predict and evaluate on the held-out test split
y3_pred_pca = gbr_pca.predict(X3_test)
mse_pca = mean_squared_error(y3_test, y3_pred_pca)
mae_pca = mean_absolute_error(y3_test, y3_pred_pca)
r2_pca = r2_score(y3_test, y3_pred_pca)

print(f'PCA - Mean Squared Error: {mse_pca}')
print(f'PCA - Mean Absolute Error: {mae_pca}')
print(f'PCA - R^2 Score: {r2_pca}')

# Actual vs. predicted scatter; the red dashed diagonal marks a perfect prediction
plt.figure(figsize=(8, 6))
plt.scatter(y3_test, y3_pred_pca)
plt.plot([y3_test.min(), y3_test.max()], [y3_test.min(), y3_test.max()], '--', color='red')
plt.xlabel('Actual Total Charges')
plt.ylabel('Predicted Total Charges')
plt.title('Actual vs Predicted Total Charges (PCA)')
plt.show()

# Error Distribution (actual minus predicted)
errors_pca = y3_test - y3_pred_pca
plt.figure(figsize=(8, 6))
sns.histplot(errors_pca, kde=True)
plt.xlabel('Prediction Error')
plt.title('Distribution of Prediction Errors (PCA)')
plt.show()


# Implementing CFS to improve GBR model Performance

In [None]:
# Objective 3 - Gradient Boosting Regressor after univariate feature scoring.
# Reads X3_scaled and y3 produced by the preceding cell.

# Split data first, so the selector only sees training rows and targets
X3_train_scaled, X3_test_scaled, y3_train, y3_test = train_test_split(X3_scaled, y3, test_size=0.2, random_state=42)

# Apply CFS
# NOTE: k='all' keeps every feature, so this step scores the features but
# does not remove any; lower k to actually select a subset.
selector = SelectKBest(score_func=f_regression, k='all')
X3_train = selector.fit_transform(X3_train_scaled, y3_train)
X3_test = selector.transform(X3_test_scaled)
# Full selected matrix, kept under its original name for any later use
X3_selected = selector.transform(X3_scaled)

# Train GBR model with CFS
gbr_cfs = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbr_cfs.fit(X3_train, y3_train)

# Predict and evaluate on the held-out test split
y3_pred_cfs = gbr_cfs.predict(X3_test)
mse_cfs = mean_squared_error(y3_test, y3_pred_cfs)
mae_cfs = mean_absolute_error(y3_test, y3_pred_cfs)
r2_cfs = r2_score(y3_test, y3_pred_cfs)

print(f'CFS - Mean Squared Error: {mse_cfs}')
print(f'CFS - Mean Absolute Error: {mae_cfs}')
print(f'CFS - R^2 Score: {r2_cfs}')

# Actual vs. predicted scatter; the red dashed diagonal marks a perfect prediction
plt.figure(figsize=(8, 6))
plt.scatter(y3_test, y3_pred_cfs)
plt.plot([y3_test.min(), y3_test.max()], [y3_test.min(), y3_test.max()], '--', color='red')
plt.xlabel('Actual Total Charges')
plt.ylabel('Predicted Total Charges')
plt.title('Actual vs Predicted Total Charges (CFS)')
plt.show()

# Error Distribution (actual minus predicted)
errors_cfs = y3_test - y3_pred_cfs
plt.figure(figsize=(8, 6))
sns.histplot(errors_cfs, kde=True)
plt.xlabel('Prediction Error')
plt.title('Distribution of Prediction Errors (CFS)')
plt.show()


# Implementing PCA to improve NN model Performance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import seaborn as sns

# Objective 2 - neural-network classifier on PCA-reduced features.
# Reloads the raw CSV, labels a row as a billing error when
# (Total Charges - Total Costs) exceeds 100, projects the scaled features onto
# two principal components and trains a binary classifier.

# Load the dataset
file_path = '/content/Hospital_Inpatient_Discharges__SPARCS_De-Identified___2017_20240120 (1).csv'
data = pd.read_csv(file_path)

# Data Preprocessing: coerce the numeric columns (unparseable values become
# NaN) and drop every row where one of them is missing.
data['Length of Stay'] = pd.to_numeric(data['Length of Stay'], errors='coerce')
data['Total Charges'] = pd.to_numeric(data['Total Charges'], errors='coerce')
data['Total Costs'] = pd.to_numeric(data['Total Costs'], errors='coerce')
# .copy() so the new column below is added to an independent frame rather
# than to a slice of `data` (avoids pandas' SettingWithCopyWarning).
cleaned_data = data.dropna(subset=['Length of Stay', 'Total Charges', 'Total Costs']).copy()

# Create Billing Discrepancy column
cleaned_data['Billing Discrepancy'] = cleaned_data['Total Charges'] - cleaned_data['Total Costs']

# Features and target for Objective 2
# NOTE(review): the target is derived from 'Total Charges' and 'Total Costs',
# which are also input features, so the label is computable from the inputs;
# confirm this is intended.
features_obj2 = ['APR Severity of Illness Code', 'Total Charges', 'Total Costs', 'Payment Typology 1']
X2 = cleaned_data[features_obj2].copy()
y2 = cleaned_data['Billing Discrepancy'] > 100  # Example threshold for billing error

# Encode categorical features
encoder = LabelEncoder()
X2['Payment Typology 1'] = encoder.fit_transform(X2['Payment Typology 1'])

# Split data first, so the scaler and PCA are fitted on training rows only
# and no test-set statistics leak into the preprocessing.
X2_train_raw, X2_test_raw, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Standardize features (statistics learned from the training rows)
scaler = StandardScaler()
X2_train_scaled = scaler.fit_transform(X2_train_raw)
X2_test_scaled = scaler.transform(X2_test_raw)
# Full scaled matrix; kept because the feature-selection cell that follows reads X2_scaled
X2_scaled = scaler.transform(X2)

# Apply PCA (components learned from the training rows)
pca = PCA(n_components=2)  # Adjust the number of components as needed
X2_train = pca.fit_transform(X2_train_scaled)
X2_test = pca.transform(X2_test_scaled)
X2_pca = pca.transform(X2_scaled)

# Build the neural network model: two 256-unit ReLU hidden layers with 20%
# dropout each. The input shape is declared with an explicit Input object
# instead of the deprecated `input_dim=` layer argument.
model = Sequential([
    tf.keras.Input(shape=(X2_train.shape[1],)),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),  # Output layer for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE: the test split is also passed as validation data, so the val_* curves
# are computed on the same rows as the final metrics below.
history_pca = model.fit(X2_train, y2_train, epochs=50, batch_size=32, verbose=1, validation_data=(X2_test, y2_test))

# Predict probabilities and labels (0.5 decision threshold)
y2_pred_proba_pca = model.predict(X2_test)
y2_pred_pca = (y2_pred_proba_pca > 0.5).astype("int32")

# Evaluate the model
accuracy_pca = accuracy_score(y2_test, y2_pred_pca)
precision_pca = precision_score(y2_test, y2_pred_pca)
recall_pca = recall_score(y2_test, y2_pred_pca)
f1_pca = f1_score(y2_test, y2_pred_pca)
conf_matrix_pca = confusion_matrix(y2_test, y2_pred_pca)

print(f'PCA - Accuracy: {accuracy_pca}')
print(f'PCA - Precision: {precision_pca}')
print(f'PCA - Recall: {recall_pca}')
print(f'PCA - F1 Score: {f1_pca}')
print(f'PCA - Confusion Matrix:\n{conf_matrix_pca}')
print(classification_report(y2_test, y2_pred_pca))

# Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_pca, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix (PCA)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC Curve
fpr_pca, tpr_pca, thresholds_pca = roc_curve(y2_test, y2_pred_proba_pca)
roc_auc_pca = auc(fpr_pca, tpr_pca)

plt.figure(figsize=(8, 6))
plt.plot(fpr_pca, tpr_pca, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_pca:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve (PCA)')
plt.legend(loc="lower right")
plt.show()

# Precision-Recall Curve
# NOTE: from here on precision_pca / recall_pca hold the curve arrays, not the
# scalar scores printed above.
precision_pca, recall_pca, _ = precision_recall_curve(y2_test, y2_pred_proba_pca)
plt.figure(figsize=(8, 6))
plt.plot(recall_pca, precision_pca, lw=2, color='blue')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve (PCA)')
plt.show()

# Learning Curves: per-epoch accuracy and loss for the train and validation data
plt.figure(figsize=(12, 8))
plt.plot(history_pca.history['accuracy'], label='Train Accuracy')
plt.plot(history_pca.history['val_accuracy'], label='Validation Accuracy')
plt.plot(history_pca.history['loss'], label='Train Loss')
plt.plot(history_pca.history['val_loss'], label='Validation Loss')
plt.title('Learning Curves (PCA)')
plt.xlabel('Epochs')
plt.ylabel('Accuracy/Loss')
plt.legend()
plt.show()


# Implementing CFS to improve NN model Performance

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Objective 2 - neural-network classifier after univariate feature scoring.
# Reads X2_scaled and y2 produced by the preceding cell.

# Split data first, so the selector only sees training rows and labels
X2_train_scaled, X2_test_scaled, y2_train, y2_test = train_test_split(X2_scaled, y2, test_size=0.2, random_state=42)

# Apply CFS
# NOTE: k='all' keeps every feature, so this step scores the features but
# does not remove any; lower k to actually select a subset.
selector = SelectKBest(score_func=f_classif, k='all')
X2_train = selector.fit_transform(X2_train_scaled, y2_train)
X2_test = selector.transform(X2_test_scaled)
# Full selected matrix, kept under its original name for any later use
X2_selected = selector.transform(X2_scaled)

# Build the neural network model: two 256-unit ReLU hidden layers with 20%
# dropout each. The input shape is declared with an explicit Input object
# instead of the deprecated `input_dim=` layer argument.
model = Sequential([
    tf.keras.Input(shape=(X2_train.shape[1],)),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),  # Output layer for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE: the test split is also passed as validation data, so the val_* curves
# are computed on the same rows as the final metrics below.
history_cfs = model.fit(X2_train, y2_train, epochs=50, batch_size=32, verbose=1, validation_data=(X2_test, y2_test))

# Predict probabilities and labels (0.5 decision threshold)
y2_pred_proba_cfs = model.predict(X2_test)
y2_pred_cfs = (y2_pred_proba_cfs > 0.5).astype("int32")

# Evaluate the model
accuracy_cfs = accuracy_score(y2_test, y2_pred_cfs)
precision_cfs = precision_score(y2_test, y2_pred_cfs)
recall_cfs = recall_score(y2_test, y2_pred_cfs)
f1_cfs = f1_score(y2_test, y2_pred_cfs)
conf_matrix_cfs = confusion_matrix(y2_test, y2_pred_cfs)

print(f'CFS - Accuracy: {accuracy_cfs}')
print(f'CFS - Precision: {precision_cfs}')
print(f'CFS - Recall: {recall_cfs}')
print(f'CFS - F1 Score: {f1_cfs}')
print(f'CFS - Confusion Matrix:\n{conf_matrix_cfs}')
print(classification_report(y2_test, y2_pred_cfs))

# Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_cfs, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix (CFS)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC Curve
fpr_cfs, tpr_cfs, thresholds_cfs = roc_curve(y2_test, y2_pred_proba_cfs)
roc_auc_cfs = auc(fpr_cfs, tpr_cfs)

plt.figure(figsize=(8, 6))
plt.plot(fpr_cfs, tpr_cfs, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_cfs:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve (CFS)')
plt.legend(loc="lower right")
plt.show()

# Precision-Recall Curve
# NOTE: from here on precision_cfs / recall_cfs hold the curve arrays, not the
# scalar scores printed above.
precision_cfs, recall_cfs, _ = precision_recall_curve(y2_test, y2_pred_proba_cfs)
plt.figure(figsize=(8, 6))
# Plot this model's own precision values (the PCA model's array was used here before)
plt.plot(recall_cfs, precision_cfs, lw=2, color='blue')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve (CFS)')
plt.show()

# Learning Curves: per-epoch accuracy and loss for the train and validation data
plt.figure(figsize=(12, 8))
plt.plot(history_cfs.history['accuracy'], label='Train Accuracy')
plt.plot(history_cfs.history['val_accuracy'], label='Validation Accuracy')
plt.plot(history_cfs.history['loss'], label='Train Loss')
plt.plot(history_cfs.history['val_loss'], label='Validation Loss')
plt.title('Learning Curves (CFS)')
plt.xlabel('Epochs')
plt.ylabel('Accuracy/Loss')
plt.legend()
plt.show()


# Implementing PCA to improve RFC model Performance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve
import seaborn as sns

# Objective 2 - Random Forest Classifier on PCA-reduced features.
# Reloads the raw CSV, labels a row as a billing error when
# (Total Charges - Total Costs) exceeds 100, projects the scaled features onto
# two principal components and trains the classifier.

# Load the dataset
file_path = '/content/Hospital_Inpatient_Discharges__SPARCS_De-Identified___2017_20240120 (1).csv'
data = pd.read_csv(file_path)

# Data Preprocessing: coerce the numeric columns (unparseable values become
# NaN) and drop every row where one of them is missing.
data['Length of Stay'] = pd.to_numeric(data['Length of Stay'], errors='coerce')
data['Total Charges'] = pd.to_numeric(data['Total Charges'], errors='coerce')
data['Total Costs'] = pd.to_numeric(data['Total Costs'], errors='coerce')
# .copy() so the new column below is added to an independent frame rather
# than to a slice of `data` (avoids pandas' SettingWithCopyWarning).
cleaned_data = data.dropna(subset=['Length of Stay', 'Total Charges', 'Total Costs']).copy()

# Create Billing Discrepancy column
cleaned_data['Billing Discrepancy'] = cleaned_data['Total Charges'] - cleaned_data['Total Costs']

# Features and target for Objective 2
# NOTE(review): the target is derived from 'Total Charges' and 'Total Costs',
# which are also input features, so the label is computable from the inputs;
# confirm this is intended.
features_obj2 = ['APR Severity of Illness Code', 'Total Charges', 'Total Costs', 'Payment Typology 1']
X2 = cleaned_data[features_obj2].copy()
y2 = cleaned_data['Billing Discrepancy'] > 100  # Example threshold for billing error

# Encode categorical features
encoder = LabelEncoder()
X2['Payment Typology 1'] = encoder.fit_transform(X2['Payment Typology 1'])

# Split data first, so the scaler and PCA are fitted on training rows only
# and no test-set statistics leak into the preprocessing.
X2_train_raw, X2_test_raw, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Standardize features (statistics learned from the training rows)
scaler = StandardScaler()
X2_train_scaled = scaler.fit_transform(X2_train_raw)
X2_test_scaled = scaler.transform(X2_test_raw)
# Full scaled matrix; kept because the feature-selection cell that follows reads X2_scaled
X2_scaled = scaler.transform(X2)

# Apply PCA (components learned from the training rows)
pca = PCA(n_components=2)  # Adjust the number of components as needed
X2_train = pca.fit_transform(X2_train_scaled)
X2_test = pca.transform(X2_test_scaled)
X2_pca = pca.transform(X2_scaled)

# Train Random Forest Classifier
rf_classifier_pca = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_pca.fit(X2_train, y2_train)

# Predict and evaluate on the held-out test split
y2_pred_pca = rf_classifier_pca.predict(X2_test)
accuracy_pca = accuracy_score(y2_test, y2_pred_pca)
precision_pca = precision_score(y2_test, y2_pred_pca)
recall_pca = recall_score(y2_test, y2_pred_pca)
f1_pca = f1_score(y2_test, y2_pred_pca)
conf_matrix_pca = confusion_matrix(y2_test, y2_pred_pca)

print(f'PCA - Accuracy: {accuracy_pca}')
print(f'PCA - Precision: {precision_pca}')
print(f'PCA - Recall: {recall_pca}')
print(f'PCA - F1 Score: {f1_pca}')
print(f'PCA - Confusion Matrix:\n{conf_matrix_pca}')
print(classification_report(y2_test, y2_pred_pca))

# Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_pca, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix (PCA)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC Curve (column 1 of predict_proba is the probability of the positive class)
y2_pred_proba_pca = rf_classifier_pca.predict_proba(X2_test)[:, 1]
fpr_pca, tpr_pca, thresholds_pca = roc_curve(y2_test, y2_pred_proba_pca)
roc_auc_pca = auc(fpr_pca, tpr_pca)

plt.figure(figsize=(8, 6))
plt.plot(fpr_pca, tpr_pca, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_pca:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve (PCA)')
plt.legend(loc="lower right")
plt.show()

# Precision-Recall Curve
# NOTE: from here on precision_pca / recall_pca hold the curve arrays, not the
# scalar scores printed above.
precision_pca, recall_pca, _ = precision_recall_curve(y2_test, y2_pred_proba_pca)
plt.figure(figsize=(8, 6))
plt.plot(recall_pca, precision_pca, lw=2, color='blue')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve (PCA)')
plt.show()


# Implementing CFS to improve RFC model Performance

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Objective 2 - Random Forest Classifier after univariate feature scoring.
# Reads X2_scaled and y2 produced by the preceding cell.

# Split data first, so the selector only sees training rows and labels
X2_train_scaled, X2_test_scaled, y2_train, y2_test = train_test_split(X2_scaled, y2, test_size=0.2, random_state=42)

# Apply CFS
# NOTE: k='all' keeps every feature, so this step scores the features but
# does not remove any; lower k to actually select a subset.
selector = SelectKBest(score_func=f_classif, k='all')
X2_train = selector.fit_transform(X2_train_scaled, y2_train)
X2_test = selector.transform(X2_test_scaled)
# Full selected matrix, kept under its original name for any later use
X2_selected = selector.transform(X2_scaled)

# Train Random Forest Classifier
rf_classifier_cfs = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_cfs.fit(X2_train, y2_train)

# Predict and evaluate on the held-out test split
y2_pred_cfs = rf_classifier_cfs.predict(X2_test)
accuracy_cfs = accuracy_score(y2_test, y2_pred_cfs)
precision_cfs = precision_score(y2_test, y2_pred_cfs)
recall_cfs = recall_score(y2_test, y2_pred_cfs)
f1_cfs = f1_score(y2_test, y2_pred_cfs)
conf_matrix_cfs = confusion_matrix(y2_test, y2_pred_cfs)

print(f'CFS - Accuracy: {accuracy_cfs}')
print(f'CFS - Precision: {precision_cfs}')
print(f'CFS - Recall: {recall_cfs}')
print(f'CFS - F1 Score: {f1_cfs}')
print(f'CFS - Confusion Matrix:\n{conf_matrix_cfs}')
print(classification_report(y2_test, y2_pred_cfs))

# Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_cfs, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix (CFS)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC Curve (column 1 of predict_proba is the probability of the positive class)
y2_pred_proba_cfs = rf_classifier_cfs.predict_proba(X2_test)[:, 1]
fpr_cfs, tpr_cfs, thresholds_cfs = roc_curve(y2_test, y2_pred_proba_cfs)
roc_auc_cfs = auc(fpr_cfs, tpr_cfs)

plt.figure(figsize=(8, 6))
plt.plot(fpr_cfs, tpr_cfs, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_cfs:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve (CFS)')
plt.legend(loc="lower right")
plt.show()

# Precision-Recall Curve
# NOTE: from here on precision_cfs / recall_cfs hold the curve arrays, not the
# scalar scores printed above.
precision_cfs, recall_cfs, _ = precision_recall_curve(y2_test, y2_pred_proba_cfs)
plt.figure(figsize=(8, 6))
plt.plot(recall_cfs, precision_cfs, lw=2, color='blue')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve (CFS)')
plt.show()


# Applying PCA for Objective 2 with Random Forest Classifier

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve
import seaborn as sns

# Objective 2 - Random Forest Classifier on PCA-reduced features.
# NOTE: this cell repeats an earlier Random Forest + PCA cell of this notebook
# and rebinds the same variable names.
# Reloads the raw CSV, labels a row as a billing error when
# (Total Charges - Total Costs) exceeds 100, projects the scaled features onto
# two principal components and trains the classifier.

# Load the dataset
file_path = '/content/Hospital_Inpatient_Discharges__SPARCS_De-Identified___2017_20240120 (1).csv'
data = pd.read_csv(file_path)

# Data Preprocessing: coerce the numeric columns (unparseable values become
# NaN) and drop every row where one of them is missing.
data['Length of Stay'] = pd.to_numeric(data['Length of Stay'], errors='coerce')
data['Total Charges'] = pd.to_numeric(data['Total Charges'], errors='coerce')
data['Total Costs'] = pd.to_numeric(data['Total Costs'], errors='coerce')
# .copy() so the new column below is added to an independent frame rather
# than to a slice of `data` (avoids pandas' SettingWithCopyWarning).
cleaned_data = data.dropna(subset=['Length of Stay', 'Total Charges', 'Total Costs']).copy()

# Create Billing Discrepancy column
cleaned_data['Billing Discrepancy'] = cleaned_data['Total Charges'] - cleaned_data['Total Costs']

# Features and target for Objective 2
# NOTE(review): the target is derived from 'Total Charges' and 'Total Costs',
# which are also input features, so the label is computable from the inputs;
# confirm this is intended.
features_obj2 = ['APR Severity of Illness Code', 'Total Charges', 'Total Costs', 'Payment Typology 1']
X2 = cleaned_data[features_obj2].copy()
y2 = cleaned_data['Billing Discrepancy'] > 100  # Example threshold for billing error

# Encode categorical features
encoder = LabelEncoder()
X2['Payment Typology 1'] = encoder.fit_transform(X2['Payment Typology 1'])

# Split data first, so the scaler and PCA are fitted on training rows only
# and no test-set statistics leak into the preprocessing.
X2_train_raw, X2_test_raw, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Standardize features (statistics learned from the training rows)
scaler = StandardScaler()
X2_train_scaled = scaler.fit_transform(X2_train_raw)
X2_test_scaled = scaler.transform(X2_test_raw)
# Full scaled matrix; kept because the feature-selection cell that follows reads X2_scaled
X2_scaled = scaler.transform(X2)

# Apply PCA (components learned from the training rows)
pca = PCA(n_components=2)  # Adjust the number of components as needed
X2_train = pca.fit_transform(X2_train_scaled)
X2_test = pca.transform(X2_test_scaled)
X2_pca = pca.transform(X2_scaled)

# Train Random Forest Classifier
rf_classifier_pca = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_pca.fit(X2_train, y2_train)

# Predict and evaluate on the held-out test split
y2_pred_pca = rf_classifier_pca.predict(X2_test)
accuracy_pca = accuracy_score(y2_test, y2_pred_pca)
precision_pca = precision_score(y2_test, y2_pred_pca)
recall_pca = recall_score(y2_test, y2_pred_pca)
f1_pca = f1_score(y2_test, y2_pred_pca)
conf_matrix_pca = confusion_matrix(y2_test, y2_pred_pca)

print(f'PCA - Accuracy: {accuracy_pca}')
print(f'PCA - Precision: {precision_pca}')
print(f'PCA - Recall: {recall_pca}')
print(f'PCA - F1 Score: {f1_pca}')
print(f'PCA - Confusion Matrix:\n{conf_matrix_pca}')
print(classification_report(y2_test, y2_pred_pca))

# Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_pca, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix (PCA)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC Curve (column 1 of predict_proba is the probability of the positive class)
y2_pred_proba_pca = rf_classifier_pca.predict_proba(X2_test)[:, 1]
fpr_pca, tpr_pca, thresholds_pca = roc_curve(y2_test, y2_pred_proba_pca)
roc_auc_pca = auc(fpr_pca, tpr_pca)

plt.figure(figsize=(8, 6))
plt.plot(fpr_pca, tpr_pca, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_pca:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve (PCA)')
plt.legend(loc="lower right")
plt.show()

# Precision-Recall Curve
# NOTE: from here on precision_pca / recall_pca hold the curve arrays, not the
# scalar scores printed above.
precision_pca, recall_pca, _ = precision_recall_curve(y2_test, y2_pred_proba_pca)
plt.figure(figsize=(8, 6))
plt.plot(recall_pca, precision_pca, lw=2, color='blue')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve (PCA)')
plt.show()


# Applying CFS for Objective 2 with Random Forest Classifier

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Objective 2 - Random Forest Classifier after univariate feature scoring.
# NOTE: this cell repeats an earlier Random Forest + feature-selection cell of
# this notebook and rebinds the same variable names.
# Reads X2_scaled and y2 produced by the preceding cell.

# Split data first, so the selector only sees training rows and labels
X2_train_scaled, X2_test_scaled, y2_train, y2_test = train_test_split(X2_scaled, y2, test_size=0.2, random_state=42)

# Apply CFS
# NOTE: k='all' keeps every feature, so this step scores the features but
# does not remove any; lower k to actually select a subset.
selector = SelectKBest(score_func=f_classif, k='all')
X2_train = selector.fit_transform(X2_train_scaled, y2_train)
X2_test = selector.transform(X2_test_scaled)
# Full selected matrix, kept under its original name for any later use
X2_selected = selector.transform(X2_scaled)

# Train Random Forest Classifier
rf_classifier_cfs = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_cfs.fit(X2_train, y2_train)

# Predict and evaluate on the held-out test split
y2_pred_cfs = rf_classifier_cfs.predict(X2_test)
accuracy_cfs = accuracy_score(y2_test, y2_pred_cfs)
precision_cfs = precision_score(y2_test, y2_pred_cfs)
recall_cfs = recall_score(y2_test, y2_pred_cfs)
f1_cfs = f1_score(y2_test, y2_pred_cfs)
conf_matrix_cfs = confusion_matrix(y2_test, y2_pred_cfs)

print(f'CFS - Accuracy: {accuracy_cfs}')
print(f'CFS - Precision: {precision_cfs}')
print(f'CFS - Recall: {recall_cfs}')
print(f'CFS - F1 Score: {f1_cfs}')
print(f'CFS - Confusion Matrix:\n{conf_matrix_cfs}')
print(classification_report(y2_test, y2_pred_cfs))

# Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_cfs, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix (CFS)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC Curve (column 1 of predict_proba is the probability of the positive class)
y2_pred_proba_cfs = rf_classifier_cfs.predict_proba(X2_test)[:, 1]
fpr_cfs, tpr_cfs, thresholds_cfs = roc_curve(y2_test, y2_pred_proba_cfs)
roc_auc_cfs = auc(fpr_cfs, tpr_cfs)

plt.figure(figsize=(8, 6))
plt.plot(fpr_cfs, tpr_cfs, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_cfs:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve (CFS)')
plt.legend(loc="lower right")
plt.show()

# Precision-Recall Curve
# NOTE: from here on precision_cfs / recall_cfs hold the curve arrays, not the
# scalar scores printed above.
precision_cfs, recall_cfs, _ = precision_recall_curve(y2_test, y2_pred_proba_cfs)
plt.figure(figsize=(8, 6))
plt.plot(recall_cfs, precision_cfs, lw=2, color='blue')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve (CFS)')
plt.show()


# Linear Regression

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import seaborn as sns

# Objective 3 - ordinary least-squares baseline.
# Reloads the raw CSV, rebuilds the Objective 3 feature matrix and fits a
# LinearRegression model that predicts 'Total Charges'.

# Load the dataset
file_path = '/content/Hospital_Inpatient_Discharges__SPARCS_De-Identified___2017_20240120 (1).csv'
data = pd.read_csv(file_path)

# Data Preprocessing: coerce the numeric columns (unparseable values become
# NaN) and drop every row where one of them is missing.
data['Length of Stay'] = pd.to_numeric(data['Length of Stay'], errors='coerce')
data['Total Charges'] = pd.to_numeric(data['Total Charges'], errors='coerce')
data['Total Costs'] = pd.to_numeric(data['Total Costs'], errors='coerce')
cleaned_data = data.dropna(subset=['Length of Stay', 'Total Charges', 'Total Costs'])

# Encoder used for the categorical payment column below
encoder = LabelEncoder()

# Features and target for Objective 3
features_obj3 = ['APR Severity of Illness Code', 'Payment Typology 1', 'CCS Procedure Code']
# .copy() gives X3 its own data, so the column assignments below do not write
# into a slice of cleaned_data (avoids pandas' SettingWithCopyWarning).
X3 = cleaned_data[features_obj3].copy()
y3 = cleaned_data['Total Charges']

# Encode categorical features
X3['Payment Typology 1'] = encoder.fit_transform(X3['Payment Typology 1'])
# Missing procedure codes are replaced by 0 before the cast to int
X3['CCS Procedure Code'] = cleaned_data['CCS Procedure Code'].fillna(0).astype(int)

# Split data first, so the scaler is fitted on training rows only and no
# test-set statistics leak into the preprocessing.
X3_train_raw, X3_test_raw, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=42)

# Standardize features (statistics learned from the training rows)
scaler = StandardScaler()
X3_train = scaler.fit_transform(X3_train_raw)
X3_test = scaler.transform(X3_test_raw)
# Full scaled matrix, kept under its original name for any later use
X3_scaled = scaler.transform(X3)

# Train Linear Regression model
linear_regressor = LinearRegression()
linear_regressor.fit(X3_train, y3_train)

# Predict and evaluate on the held-out test split
y3_pred = linear_regressor.predict(X3_test)
mse = mean_squared_error(y3_test, y3_pred)
mae = mean_absolute_error(y3_test, y3_pred)
r2 = r2_score(y3_test, y3_pred)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R² Score: {r2}')

# Actual vs. predicted scatter; the red dashed diagonal marks a perfect prediction
plt.figure(figsize=(8, 6))
plt.scatter(y3_test, y3_pred)
plt.plot([y3_test.min(), y3_test.max()], [y3_test.min(), y3_test.max()], '--', color='red')
plt.xlabel('Actual Total Charges')
plt.ylabel('Predicted Total Charges')
plt.title('Actual vs Predicted Total Charges')
plt.show()

# Error Distribution (actual minus predicted)
errors = y3_test - y3_pred
plt.figure(figsize=(8, 6))
sns.histplot(errors, kde=True)
plt.xlabel('Prediction Error')
plt.title('Distribution of Prediction Errors')
plt.show()
