In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sqlalchemy import create_engine
from sqlalchemy import MetaData

In [2]:
#Loading Database credentials 
username = 'postgres'
password = 'myproject'
hostname = 'database-1.cfg4ma0mq56c.us-east-2.rds.amazonaws.com'
port = '5432'
database_name = 'tx-dx'

# Create connection string
engine = create_engine(f'postgresql://{username}:{password}@{hostname}:{port}/{database_name}')

In [3]:
#Testing connection to database server 
try:
    connection = engine.connect()
    print("Connection successful!")
    connection.close()
except Exception as e:
    print(f"Connection failed with error: {e}")

Connection successful!


In [4]:

metadata = MetaData()
metadata.reflect(bind=engine)
# Get the reflected table from the metadata
reflected_employee_table = metadata.tables['claim']

In [9]:
from sqlalchemy import select 

stmt = select(reflected_employee_table)

with engine.connect() as connection: 
    results = connection.execute(stmt).fetchall()

In [10]:
df = pd.DataFrame(results, columns = reflected_employee_table.columns.keys())
df.head()

Unnamed: 0,index,Patient,Age,Age_Group,Sex,Diagnosis_Code,Diagnosis_Group,Diagnosis_Family,Diagnosis_Description,Med_Code,Med_Description,Med_Description_Simp,Quantity,Status,Amount_Billed,Amount_Paid
0,0,2112140237,37,26-45,Male,K21.9,K2,K,Gastro-esophageal reflux disease without esoph...,17381100000000.0,(SODIUM CHLORIDE : 9 MG/ML) SOLUTION FOR INFU...,SODIUM CHLORIDE,1,Paid,3.0,3.0
1,1,2002110188,38,26-45,Male,I21.3,I0,I,ST elevation (STEMI) myocardial infarction of ...,9933860000000.0,(CLOPIDOGREL (AS BESILATE) : 75 MG) FILM COAT...,CLOPIDOGREL,4,Rejected,17.44,0.0
2,2,1510110229,59,46-65,Male,B34.2,B3,B,"Coronavirus infection, unspecified",1372430000000.0,(PANTOPRAZOLE (AS SODIUM) : 40 MG) ENTERIC CO...,PANTOPRAZOLE,2,Paid,5.36,5.36
3,3,2312040128,38,26-45,Male,I69.354,I1,I,Hemiplegia and hemiparesis following cerebral ...,271792000000.0,(AMLODIPINE : 5 MG) (VALSARTAN : 160 MG) FILM...,"AMLODIPINE, VALSARTAN",7,Paid,51.66,51.66
4,4,2311110151,44,26-45,Male,J32.9,J3,J,"Chronic sinusitis, unspecified",3551200000000.0,(GENTAMICIN : 0.3%) EYE OINTMENT,GENTAMICIN,1,Paid,8.5,8.5


In [None]:
connection.close()

In [3]:
df.dtypes

Patient                    int64
Age                        int64
Age_Group                 object
Sex                       object
Diagnosis_Code            object
Diagnosis_Group           object
Diagnosis_Family          object
Diagnosis_Description     object
Med_Code                   int64
Med_Description           object
Med_Description_Simp      object
Quantity                   int64
Status                    object
Amount_Billed            float64
Amount_Paid              float64
dtype: object

In [4]:
df.nunique()

Patient                  15255
Age                         99
Age_Group                    5
Sex                          2
Diagnosis_Code            1847
Diagnosis_Group             74
Diagnosis_Family            23
Diagnosis_Description     1847
Med_Code                  1612
Med_Description           1063
Med_Description_Simp       478
Quantity                    79
Status                       2
Amount_Billed             2452
Amount_Paid               4187
dtype: int64

In [5]:
df['Status'].value_counts()

Paid        155131
Rejected     60422
Name: Status, dtype: int64

In [6]:
# Select relevant columns for prediction
features = ['Age', 'Sex', 'Diagnosis_Group']
target = 'Status'

X = df[features]
y = df[target]


In [7]:
# Perform one-hot encoding for categorical variables
encoder = OneHotEncoder(drop='first')
X_encoded = encoder.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


Inital trial

In [9]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [10]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.8157314838440305


In [11]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

        Paid       0.83      0.94      0.88     31132
    Rejected       0.76      0.49      0.60     11979

    accuracy                           0.82     43111
   macro avg       0.79      0.72      0.74     43111
weighted avg       0.81      0.82      0.80     43111



Optomization #1: tuning hyperparameters 

In [11]:
# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 250, 500],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10]
# }

# # Initialize the grid search model
# grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)

# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)

# # Get the best parameters
# best_params = grid_search.best_params_
# print(f"Best parameters: {best_params}")

#This took almost 50 minutes and didn't run, may try later

In [12]:
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint

# # Define the parameter distribution
# param_dist = {
#     'n_estimators': randint(50, 200),
#     'max_depth': [None] + list(randint(1, 20).rvs(size=10)),
#     'min_samples_split': randint(2, 10)
# }

# # Initialize the randomized search model
# random_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_dist, n_iter=100, cv=3, scoring='accuracy', verbose=2, random_state=42, n_jobs=-1)

# # Fit the randomized search to the data
# random_search.fit(X_train, y_train)

# # Get the best parameters
# best_params = random_search.best_params_
# print(f"Best parameters: {best_params}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 

In [12]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=500, max_depth= None, min_samples_split=2, random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [13]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8156155041636706


In [14]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

        Paid       0.83      0.94      0.88     31132
    Rejected       0.76      0.49      0.60     11979

    accuracy                           0.82     43111
   macro avg       0.79      0.72      0.74     43111
weighted avg       0.81      0.82      0.80     43111



Optomization #2: Using class_weight='balanced' to adjust the weight inversely proportional to class freuqnecies.

In [15]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators= 250, max_depth= None , min_samples_split= 2, random_state=42, class_weight='balanced')
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [16]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.751501936860662


In [17]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

        Paid       0.89      0.75      0.81     31132
    Rejected       0.54      0.77      0.63     11979

    accuracy                           0.75     43111
   macro avg       0.71      0.76      0.72     43111
weighted avg       0.79      0.75      0.76     43111



Optomization #3: Using SMOTE to oversample the Rejected class

In [18]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [19]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators= 250, max_depth= None , min_samples_split= 2, random_state=42, class_weight='balanced')
rf_classifier.fit(X_resampled, y_resampled)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [20]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.74765141147271


In [21]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

        Paid       0.90      0.74      0.81     31132
    Rejected       0.53      0.78      0.63     11979

    accuracy                           0.75     43111
   macro avg       0.71      0.76      0.72     43111
weighted avg       0.79      0.75      0.76     43111



Optomization #4: manually setting the weights

In [26]:
# Initialize and train the Random Forest classifier

weights = {'Paid': 1, 'Rejected': 2}
rf_classifier = RandomForestClassifier(n_estimators=250, max_depth=None, min_samples_split=4, random_state=42, class_weight=weights)
rf_classifier.fit(X_train, y_train)

In [27]:
# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [28]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7798009788685022


In [29]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

        Paid       0.87      0.81      0.84     31132
    Rejected       0.59      0.69      0.64     11979

    accuracy                           0.78     43111
   macro avg       0.73      0.75      0.74     43111
weighted avg       0.79      0.78      0.78     43111



Optomization #5: Combining over (SMOT) and under (TomekLinks) sampling techniques 

In [30]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(random_state=42)
X_resampled, y_resampled = smt.fit_resample(X_train, y_train)

In [31]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=250, max_depth=None, min_samples_split=6, random_state=42)
rf_classifier.fit(X_resampled, y_resampled)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [32]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7478833708334299


In [33]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

        Paid       0.90      0.74      0.81     31132
    Rejected       0.53      0.78      0.63     11979

    accuracy                           0.75     43111
   macro avg       0.71      0.76      0.72     43111
weighted avg       0.79      0.75      0.76     43111



Optimization #6: using Diagnosis_famiy as one of the feautres

In [11]:
# Select relevant columns for prediction
features = ['Age', 'Sex', 'Diagnosis_Family']
target = 'Status'

X = df[features]
y = df[target]

In [12]:
# Perform one-hot encoding for categorical variables
encoder = OneHotEncoder(drop='first')
X_encoded = encoder.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [21]:
# Initialize and train the Random Forest classifier

weights = {'Paid': 1, 'Rejected': 2}
rf_classifier = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=10, random_state=42, class_weight=weights)
rf_classifier.fit(X_train, y_train)

In [22]:
# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [23]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7635406276820301


In [24]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

        Paid       0.84      0.83      0.83     31164
    Rejected       0.57      0.60      0.58     11947

    accuracy                           0.76     43111
   macro avg       0.71      0.71      0.71     43111
weighted avg       0.77      0.76      0.77     43111

