In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sqlalchemy import create_engine
from sqlalchemy import MetaData
import json
import pickle

In [2]:
#Loading Database credentials 
username = 'postgres'
password = 'myproject'
hostname = 'database-1.cfg4ma0mq56c.us-east-2.rds.amazonaws.com'
port = '5432'
database_name = 'tx-dx'

# Create connection string
engine = create_engine(f'postgresql://{username}:{password}@{hostname}:{port}/{database_name}')

In [3]:
#Testing connection to database server 
try:
    connection = engine.connect()
    print("Connection successful!")
    connection.close()
except Exception as e:
    print(f"Connection failed with error: {e}")

Connection successful!


In [4]:

metadata = MetaData()
metadata.reflect(bind=engine)
# Get the reflected table from the metadata
reflected_employee_table = metadata.tables['claim']

In [5]:
from sqlalchemy import select 

stmt = select(reflected_employee_table)

with engine.connect() as connection: 
    results = connection.execute(stmt).fetchall()

In [6]:
df = pd.DataFrame(results, columns = reflected_employee_table.columns.keys())
df.head()

Unnamed: 0,index,Patient,Age,Age_Group,Sex,Diagnosis_Code,Diagnosis_Group,Diagnosis_Family,Diagnosis_Description,Med_Code,Med_Description,Med_Description_Simp,Quantity,Status,Amount_Billed,Amount_Paid
0,0,2112140237,37,26-45,Male,K21.9,K2,K,Gastro-esophageal reflux disease without esoph...,17381100000000.0,(SODIUM CHLORIDE : 9 MG/ML) SOLUTION FOR INFU...,SODIUM CHLORIDE,1,Paid,3.0,3.0
1,1,2002110188,38,26-45,Male,I21.3,I0,I,ST elevation (STEMI) myocardial infarction of ...,9933860000000.0,(CLOPIDOGREL (AS BESILATE) : 75 MG) FILM COAT...,CLOPIDOGREL,4,Rejected,17.44,0.0
2,2,1510110229,59,46-65,Male,B34.2,B3,B,"Coronavirus infection, unspecified",1372430000000.0,(PANTOPRAZOLE (AS SODIUM) : 40 MG) ENTERIC CO...,PANTOPRAZOLE,2,Paid,5.36,5.36
3,3,2312040128,38,26-45,Male,I69.354,I1,I,Hemiplegia and hemiparesis following cerebral ...,271792000000.0,(AMLODIPINE : 5 MG) (VALSARTAN : 160 MG) FILM...,"AMLODIPINE, VALSARTAN",7,Paid,51.66,51.66
4,4,2311110151,44,26-45,Male,J32.9,J3,J,"Chronic sinusitis, unspecified",3551200000000.0,(GENTAMICIN : 0.3%) EYE OINTMENT,GENTAMICIN,1,Paid,8.5,8.5


In [7]:
connection.close()

In [8]:
df.dtypes

index                      int64
Patient                    int64
Age                        int64
Age_Group                 object
Sex                       object
Diagnosis_Code            object
Diagnosis_Group           object
Diagnosis_Family          object
Diagnosis_Description     object
Med_Code                 float64
Med_Description           object
Med_Description_Simp      object
Quantity                   int64
Status                    object
Amount_Billed            float64
Amount_Paid              float64
dtype: object

In [9]:
df['Sex'] = df['Sex'].replace({'Male': 0, 'Female': 1})

In [10]:
df.nunique()

index                    215553
Patient                   15255
Age                          99
Age_Group                     5
Sex                           2
Diagnosis_Code             1847
Diagnosis_Group              74
Diagnosis_Family             23
Diagnosis_Description      1847
Med_Code                   1260
Med_Description            1063
Med_Description_Simp        507
Quantity                     79
Status                        2
Amount_Billed              2452
Amount_Paid                4187
dtype: int64

In [11]:
df['Status'].value_counts()

Paid        155131
Rejected     60422
Name: Status, dtype: int64

In [12]:
age_scaled = StandardScaler().fit_transform(df[["Age"]])

# Diplay the first five rows of the scaled data
print(age_scaled)

#Creating a new dataframe withs scaled age 
df['Age_Scaled'] = pd.DataFrame(age_scaled)

[[-0.35537409]
 [-0.29912292]
 [ 0.88215179]
 ...
 [-0.13036939]
 [-0.29912292]
 [-0.69288115]]


In [13]:
df_age_map= pd.concat([df['Age'], df['Age_Scaled']], axis = 1)
df_age_map.head(50)

Unnamed: 0,Age,Age_Scaled
0,37,-0.355374
1,38,-0.299123
2,59,0.882152
3,38,-0.299123
4,44,0.038384
5,5,-2.155412
6,37,-0.355374
7,46,0.150886
8,27,-0.917886
9,64,1.163408


In [14]:
# Create a dictionary with age as key and age_scaled as value
age_dict = dict(zip(df_age_map['Age'], df_age_map['Age_Scaled']))

# Ensure no duplicate keys
unique_age_dict = {k: v for k, v in age_dict.items() if list(age_dict.keys()).count(k) == 1}

print(unique_age_dict)

{37: -0.35537409358990435, 38: -0.2991229171951276, 59: 0.8821517870951836, 44: 0.03838414117353271, 5: -2.1554117382227593, 46: 0.15088649396308615, 27: -0.9178858575376716, 64: 1.1634076690690671, 74: 1.7259194330168344, 36: -0.4116252699846811, 51: 0.4321423759369698, 34: -0.5241276227742345, 61: 0.994654139884737, 55: 0.6571470815160767, 39: -0.2428717408003509, 26: -0.9741370339324483, 31: -0.6928811519585647, 35: -0.4678764463794578, 62: 1.0509053162795137, 22: -1.1991417395115551, 86: 2.4009335497541553, 32: -0.636629975563788, 47: 0.20713767035786287, 41: -0.13036938801079745, 29: -0.8053835047481182, 52: 0.4883935523317465, 67: 1.3321611982533974, 24: -1.0866393867220017, 28: -0.8616346811428949, 60: 0.9384029634899603, 50: 0.3758911995421931, 68: 1.388412374648174, 48: 0.2633888467526396, 43: -0.017867035221244013, 25: -1.030388210327225, 53: 0.5446447287265233, 72: 1.613417080227281, 4: -2.2116629146175364, 65: 1.219658845463844, 21: -1.255392915906332, 10: -1.87415585624887

In [15]:
with open('../Deployment/age_map.json', 'w') as json_file:
    json.dump(unique_age_dict, json_file)

In [16]:
#Creating Dummies for Diagnosis_Group
diag_dummies = pd.get_dummies(df["Diagnosis_Group"])
diag_dummies.head(50)

Unnamed: 0,A0,A1,A4,A5,A8,A9,B3,B4,B5,B9,...,R7,S0,S2,S3,T1,U0,Z0,Z3,Z4,Z9
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
#Creating Diagnosis Grup encoing mapping 
diag_dummies_copy = diag_dummies.copy()

# Create a new DataFrame with 'Index_Column' as the index
diag_dummies_copy.set_index(df['Diagnosis_Group'], inplace=True)
diag_dummies_copy.head()

Unnamed: 0_level_0,A0,A1,A4,A5,A8,A9,B3,B4,B5,B9,...,R7,S0,S2,S3,T1,U0,Z0,Z3,Z4,Z9
Diagnosis_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
K2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
I0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
I1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
J3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
#Creating index dictoanry 
# Initialize an empty dictionary 
encoding_dict = {}

# Iterate over DataFrame rows
for index, row in diag_dummies_copy.iterrows():
    #Check if key already exsists 
    if index not in encoding_dict:  
        encoding_dict[index] = row.tolist()

print(encoding_dict)

{'K2': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'I0': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'B3': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'I1': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'J3': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [19]:
# File path to save the JSON file
file_path = '../Deployment/output.json'

# Convert dictionary to JSON and write to file
with open(file_path, 'w') as json_file:
    json.dump(encoding_dict, json_file)

print("Dictionary converted to JSON and saved to", file_path)

Dictionary converted to JSON and saved to ../Deployment/output.json


In [20]:
#Combining all dataframe into one for processing
df_combined = pd.concat([df['Age_Scaled'], df['Sex'], diag_dummies, df['Status']], axis = 1)
df_combined.head(50)

Unnamed: 0,Age_Scaled,Sex,A0,A1,A4,A5,A8,A9,B3,B4,...,S0,S2,S3,T1,U0,Z0,Z3,Z4,Z9,Status
0,-0.355374,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Paid
1,-0.299123,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Rejected
2,0.882152,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Paid
3,-0.299123,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Paid
4,0.038384,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Paid
5,-2.155412,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Paid
6,-0.355374,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Paid
7,0.150886,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Paid
8,-0.917886,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Paid
9,1.163408,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Paid


In [21]:
X = df_combined.drop(columns=['Status'])
y = df_combined['Status']

In [22]:
df_combined.dtypes

Age_Scaled    float64
Sex             int64
A0              uint8
A1              uint8
A4              uint8
               ...   
Z0              uint8
Z3              uint8
Z4              uint8
Z9              uint8
Status         object
Length: 77, dtype: object

In [23]:
df_combined = df_combined.astype(object)

In [24]:
df_combined['Age_Scaled'] = df_combined['Age_Scaled'].astype(float)

In [25]:
df_combined.dtypes

Age_Scaled    float64
Sex            object
A0             object
A1             object
A4             object
               ...   
Z0             object
Z3             object
Z4             object
Z9             object
Status         object
Length: 77, dtype: object

In [26]:
df_combined.columns

Index(['Age_Scaled', 'Sex', 'A0', 'A1', 'A4', 'A5', 'A8', 'A9', 'B3', 'B4',
       'B5', 'B9', 'C0', 'C4', 'C5', 'C6', 'C7', 'C8', 'D0', 'D1', 'D2', 'D3',
       'D4', 'D5', 'E0', 'E1', 'E2', 'E5', 'E7', 'E8', 'F1', 'F2', 'G0', 'G2',
       'G3', 'G8', 'H0', 'H6', 'I0', 'I1', 'I8', 'I9', 'J0', 'J3', 'K0', 'K2',
       'K4', 'K5', 'K7', 'K9', 'L0', 'M0', 'M3', 'M4', 'M6', 'M7', 'M8', 'N1',
       'N4', 'N6', 'O0', 'O6', 'P0', 'Q0', 'R0', 'R1', 'R7', 'S0', 'S2', 'S3',
       'T1', 'U0', 'Z0', 'Z3', 'Z4', 'Z9', 'Status'],
      dtype='object')

In [33]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

Inital trial

In [34]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
X.columns = X.columns.astype(str)
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [None]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Optomization #1: tuning hyperparameters 

In [None]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=500, max_depth= None, min_samples_split=2, random_state=42)
X.columns = X.columns.astype(str)
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [None]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Optomization #2: Using class_weight='balanced' to adjust the weight inversely proportional to class freuqnecies.

In [None]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators= 250, max_depth= None , min_samples_split= 2, random_state=42, class_weight='balanced')
X.columns = X.columns.astype(str)
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [None]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Optomization #3: Using SMOTE to oversample the Rejected class

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators= 250, max_depth= None , min_samples_split= 2, random_state=42, class_weight='balanced')
X.columns = X.columns.astype(str)
rf_classifier.fit(X_resampled, y_resampled)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [None]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Optomization #4: manually setting the weights

In [35]:
# Initialize and train the Random Forest classifier
weights = {'Paid': 1, 'Rejected': 2}
rf_classifier = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=4, random_state=42, class_weight=weights)

X.columns = X.columns.astype(str)
rf_classifier.fit(X_train, y_train)

In [36]:
# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [37]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7802148861548739


In [38]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

        Paid       0.87      0.82      0.84     38935
    Rejected       0.59      0.68      0.63     14954

    accuracy                           0.78     53889
   macro avg       0.73      0.75      0.74     53889
weighted avg       0.79      0.78      0.78     53889



In [39]:
#Saving Pickle File 
with open('../Deployment/model.pkl', 'wb') as f:
    pickle.dump(rf_classifier, f)

Optomization #5: Combining over (SMOT) and under (TomekLinks) sampling techniques 

In [None]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(random_state=42)
X_resampled, y_resampled = smt.fit_resample(X_train, y_train)

In [None]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=250, max_depth=None, min_samples_split=6, random_state=42)
X.columns = X.columns.astype(str)
rf_classifier.fit(X_resampled, y_resampled)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [None]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Optimization #6: using Diagnosis_famiy as one of the feautres

In [None]:
# Select relevant columns for prediction
features = ['Age', 'Sex', 'Diagnosis_Family']
target = 'Status'

X = df[features]
y = df[target]

In [None]:
# Perform one-hot encoding for categorical variables
encoder = OneHotEncoder(drop='first')
X_encoded = encoder.fit_transform(X[['Diagnosis_Family']])

# Create a scaler object 
scaler = StandardScaler()

scaled_data = scaler.fit_transform(X[['Age','Sex']], X_encoded)


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [None]:
# Initialize and train the Random Forest classifier

weights = {'Paid': 1, 'Rejected': 2}
rf_classifier = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=10, random_state=42, class_weight=weights)
rf_classifier.fit(X_train, y_train)

In [None]:
# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

In [None]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)