In [93]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sqlalchemy import create_engine
from sqlalchemy import MetaData

In [26]:
#Loading Database credentials 
username = 'postgres'
password = 'myproject'
hostname = 'database-1.cfg4ma0mq56c.us-east-2.rds.amazonaws.com'
port = '5432'
database_name = 'tx-dx'

# Create connection string
engine = create_engine(f'postgresql://{username}:{password}@{hostname}:{port}/{database_name}')

In [27]:
#Testing connection to database server 
try:
    connection = engine.connect()
    print("Connection successful!")
    connection.close()
except Exception as e:
    print(f"Connection failed with error: {e}")

Connection successful!


In [28]:

metadata = MetaData()
metadata.reflect(bind=engine)
# Get the reflected table from the metadata
reflected_employee_table = metadata.tables['claim']

In [29]:
from sqlalchemy import select 

stmt = select(reflected_employee_table)

with engine.connect() as connection: 
    results = connection.execute(stmt).fetchall()

In [30]:
df = pd.DataFrame(results, columns = reflected_employee_table.columns.keys())
df.head()

Unnamed: 0,index,Patient,Age,Age_Group,Sex,Diagnosis_Code,Diagnosis_Group,Diagnosis_Family,Diagnosis_Description,Med_Code,Med_Description,Med_Description_Simp,Quantity,Status,Amount_Billed,Amount_Paid
0,0,2112140237,37,26-45,Male,K21.9,K2,K,Gastro-esophageal reflux disease without esoph...,17381100000000.0,(SODIUM CHLORIDE : 9 MG/ML) SOLUTION FOR INFU...,SODIUM CHLORIDE,1,Paid,3.0,3.0
1,1,2002110188,38,26-45,Male,I21.3,I0,I,ST elevation (STEMI) myocardial infarction of ...,9933860000000.0,(CLOPIDOGREL (AS BESILATE) : 75 MG) FILM COAT...,CLOPIDOGREL,4,Rejected,17.44,0.0
2,2,1510110229,59,46-65,Male,B34.2,B3,B,"Coronavirus infection, unspecified",1372430000000.0,(PANTOPRAZOLE (AS SODIUM) : 40 MG) ENTERIC CO...,PANTOPRAZOLE,2,Paid,5.36,5.36
3,3,2312040128,38,26-45,Male,I69.354,I1,I,Hemiplegia and hemiparesis following cerebral ...,271792000000.0,(AMLODIPINE : 5 MG) (VALSARTAN : 160 MG) FILM...,"AMLODIPINE, VALSARTAN",7,Paid,51.66,51.66
4,4,2311110151,44,26-45,Male,J32.9,J3,J,"Chronic sinusitis, unspecified",3551200000000.0,(GENTAMICIN : 0.3%) EYE OINTMENT,GENTAMICIN,1,Paid,8.5,8.5


In [31]:
connection.close()

In [32]:
df.dtypes

index                      int64
Patient                    int64
Age                        int64
Age_Group                 object
Sex                       object
Diagnosis_Code            object
Diagnosis_Group           object
Diagnosis_Family          object
Diagnosis_Description     object
Med_Code                 float64
Med_Description           object
Med_Description_Simp      object
Quantity                   int64
Status                    object
Amount_Billed            float64
Amount_Paid              float64
dtype: object

In [49]:
df['Sex'] = df['Sex'].replace({'Male': 0, 'Female': 1})

In [50]:
df.nunique()

index                    215553
Patient                   15255
Age                          99
Age_Group                     5
Sex                           2
Diagnosis_Code             1847
Diagnosis_Group              74
Diagnosis_Family             23
Diagnosis_Description      1847
Med_Code                   1260
Med_Description            1063
Med_Description_Simp        507
Quantity                     79
Status                        2
Amount_Billed              2452
Amount_Paid                4187
dtype: int64

In [51]:
df['Status'].value_counts()

Paid        155131
Rejected     60422
Name: Status, dtype: int64

In [98]:
# Select relevant columns for prediction
features = ['Age', 'Sex', 'Diagnosis_Group']
target = 'Status'

X = df[features]
y = df[target]


In [99]:
# Create a ColumnTransformer for combined preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Age']),
        ('cat', OneHotEncoder(drop='first'), ['Diagnosis_Group'])
    ])

# Fit and transform the data
X_processed = preprocessor.fit_transform(X)

In [100]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

In [48]:
# Get feature names after transformation
feature_names = preprocessor.get_feature_names_out()

# Create a DataFrame from the processed data
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)

Unnamed: 0,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,Age_9,Age_10,Age_11,...,Diagnosis_Group_R7,Diagnosis_Group_S0,Diagnosis_Group_S2,Diagnosis_Group_S3,Diagnosis_Group_T1,Diagnosis_Group_U0,Diagnosis_Group_Z0,Diagnosis_Group_Z3,Diagnosis_Group_Z4,Diagnosis_Group_Z9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Inital trial

In [90]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [91]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.739324070422862


In [92]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

        Paid       0.74      0.99      0.85     31164
    Rejected       0.72      0.10      0.17     11947

    accuracy                           0.74     43111
   macro avg       0.73      0.54      0.51     43111
weighted avg       0.73      0.74      0.66     43111



Optomization #1: tuning hyperparameters 

In [101]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=500, max_depth= None, min_samples_split=2, random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [102]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7956206072696064


In [103]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

        Paid       0.81      0.94      0.87     31164
    Rejected       0.72      0.43      0.54     11947

    accuracy                           0.80     43111
   macro avg       0.77      0.68      0.70     43111
weighted avg       0.79      0.80      0.78     43111



Optomization #2: Using class_weight='balanced' to adjust the weight inversely proportional to class freuqnecies.

In [69]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators= 250, max_depth= None , min_samples_split= 2, random_state=42, class_weight='balanced')
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [70]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6325067848113011


In [71]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

        Paid       0.80      0.66      0.72     31164
    Rejected       0.39      0.56      0.46     11947

    accuracy                           0.63     43111
   macro avg       0.59      0.61      0.59     43111
weighted avg       0.68      0.63      0.65     43111



Optomization #3: Using SMOTE to oversample the Rejected class

In [72]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [73]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators= 250, max_depth= None , min_samples_split= 2, random_state=42, class_weight='balanced')
rf_classifier.fit(X_resampled, y_resampled)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [74]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.634084108464197


In [75]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

        Paid       0.80      0.66      0.72     31164
    Rejected       0.39      0.57      0.46     11947

    accuracy                           0.63     43111
   macro avg       0.59      0.61      0.59     43111
weighted avg       0.69      0.63      0.65     43111



Optomization #4: manually setting the weights

In [104]:
# Initialize and train the Random Forest classifier

weights = {'Paid': 1, 'Rejected': 2}
rf_classifier = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=4, random_state=42, class_weight=weights)
rf_classifier.fit(X_train, y_train)

In [105]:
# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [106]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7659530050335182


In [107]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

        Paid       0.85      0.82      0.84     31164
    Rejected       0.57      0.62      0.60     11947

    accuracy                           0.77     43111
   macro avg       0.71      0.72      0.72     43111
weighted avg       0.77      0.77      0.77     43111



Optomization #5: Combining over (SMOT) and under (TomekLinks) sampling techniques 

In [76]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(random_state=42)
X_resampled, y_resampled = smt.fit_resample(X_train, y_train)

In [77]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=250, max_depth=None, min_samples_split=6, random_state=42)
rf_classifier.fit(X_resampled, y_resampled)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [78]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.634084108464197


In [79]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

        Paid       0.80      0.66      0.72     31164
    Rejected       0.39      0.57      0.46     11947

    accuracy                           0.63     43111
   macro avg       0.59      0.61      0.59     43111
weighted avg       0.69      0.63      0.65     43111



Optimization #6: using Diagnosis_famiy as one of the feautres

In [80]:
# Select relevant columns for prediction
features = ['Age', 'Sex', 'Diagnosis_Family']
target = 'Status'

X = df[features]
y = df[target]

In [83]:
# Perform one-hot encoding for categorical variables
encoder = OneHotEncoder(drop='first')
X_encoded = encoder.fit_transform(X[['Diagnosis_Family']])

# Create a scaler object 
scaler = StandardScaler()

scaled_data = scaler.fit_transform(X[['Age','Sex']], X_encoded)


In [84]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [85]:
# Initialize and train the Random Forest classifier

weights = {'Paid': 1, 'Rejected': 2}
rf_classifier = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=10, random_state=42, class_weight=weights)
rf_classifier.fit(X_train, y_train)

In [86]:
# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [87]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7071049152188537


In [88]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

        Paid       0.76      0.88      0.81     31164
    Rejected       0.45      0.26      0.33     11947

    accuracy                           0.71     43111
   macro avg       0.60      0.57      0.57     43111
weighted avg       0.67      0.71      0.68     43111

