In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score



In [2]:
medication_df = pd.read_csv("/kaggle/input/high-cost-diabetes-patients-prediction/Softec_Medication_Data_Kaggle_V1.csv")
patient_data_training_df = pd.read_csv("/kaggle/input/high-cost-diabetes-patients-prediction/Softec_Patient_Data_Training_Kaggle_V1.csv")
socio_economic_df = pd.read_csv("/kaggle/input/high-cost-diabetes-patients-prediction/Softec_Socio_Economics_Data_Kaggle_V1.csv")
patient_prediction_data_df = pd.read_csv("/kaggle/input/high-cost-diabetes-patients-prediction/Softec_Patient_Data_Testing_Kaggle_V1.csv")
patient_testing_data_df = pd.read_csv("/kaggle/input/high-cost-diabetes-patients-prediction/Softec_Patient_Data_Testing_Features.csv")

**Merging Dataset**

In [3]:
# merge the datasets on a common column
merged_df_train = pd.merge(patient_data_training_df, medication_df, on='ID')
merged_df_train = pd.merge(merged_df_train, socio_economic_df, on='ID')

In [4]:
merged_df_train.head()

Unnamed: 0,ID,Age,Race,Sex,State,Zip Code,MSA,Enrollment Type Categorized,Enrollment Months,HCC Score,...,pioglitazone,sitagliptin and metformin hydrochloride,Avg ADI,Max ADI,Min ADI,SVI1,SVI3,SVI4,SVI,MDI
0,68802021,68,White,Male,Michigan,48176,11460,10,12.0,1.33,...,0,0,30.157895,67.0,8.0,-27.707059,-18.227551,-27.590337,-27.727856,14.16
1,3422021,60,Black,Male,Michigan,49224,12980,20,12.0,4.8,...,0,0,80.2,100.0,52.0,-21.266317,0.272289,-21.27153,-21.282509,15.39
2,7892017,43,White,Male,Michigan,49016,12980,20,12.0,0.25,...,0,0,82.545455,97.0,63.0,0.529493,0.376685,0.616187,0.55524,15.39
3,7892020,46,White,Male,Michigan,49016,12980,20,12.0,0.5,...,0,0,82.545455,97.0,63.0,0.529493,0.376685,0.616187,0.55524,15.39
4,42792020,84,Black,Female,Michigan,49224,12980,10,12.0,0.43,...,0,0,80.2,100.0,52.0,-21.266317,0.272289,-21.27153,-21.282509,14.16


In [5]:
merged_df_test = pd.merge(patient_testing_data_df, medication_df, on='ID')
merged_df_test = pd.merge(merged_df_test, socio_economic_df, on='ID')

In [6]:
merged_df_test.head()

Unnamed: 0,ID,Age,Race,Sex,State,Zip Code,MSA,Enrollment Type Categorized,Enrollment Months,HCC Score,...,pioglitazone,sitagliptin and metformin hydrochloride,Avg ADI,Max ADI,Min ADI,SVI1,SVI3,SVI4,SVI,MDI
0,44782019,71,White,Male,Michigan,48848,99022,10,12.0,1.2,...,0,0,51.428571,66.0,36.0,0.233687,0.196252,0.383752,0.252361,14.16
1,44842017,75,White,Male,Michigan,49765,99022,10,12.0,2.02,...,0,0,77.5,98.0,60.0,0.520989,0.158867,0.426078,0.414422,14.16
2,44842018,76,White,Male,Michigan,49765,99022,10,12.0,1.69,...,0,0,77.5,98.0,60.0,0.520989,0.158867,0.426078,0.414422,14.16
3,44862019,73,White,Male,Michigan,48617,99022,10,12.0,1.03,...,0,0,75.090909,95.0,54.0,0.63055,0.2751,0.625072,0.496167,14.16
4,44862018,72,White,Male,Michigan,48617,99022,10,12.0,1.66,...,0,0,75.090909,95.0,54.0,0.63055,0.2751,0.625072,0.496167,14.16


**Spliting Patient id and Year**

In [7]:
patient_id = merged_df_train["ID"]
temp = list(patient_id)
ids = []
year = []
for i in range (len(temp)):
    mystr = str(temp[i])
    year.append(mystr[-4:])
    ids.append(mystr[:-4])

merged_df_train['patient_id'] = ids
merged_df_train['year'] = year

# patient_data_training_df.drop(['ID'], inplace = True, axis =1)

cols = merged_df_train.columns.tolist()
cols = cols[-1:] + cols[:-1]
merged_df_train = merged_df_train[cols]
cols = cols[-1:] + cols[:-1]
merged_df_train = merged_df_train[cols]

In [8]:
merged_df_train.head()

Unnamed: 0,patient_id,year,ID,Age,Race,Sex,State,Zip Code,MSA,Enrollment Type Categorized,...,pioglitazone,sitagliptin and metformin hydrochloride,Avg ADI,Max ADI,Min ADI,SVI1,SVI3,SVI4,SVI,MDI
0,6880,2021,68802021,68,White,Male,Michigan,48176,11460,10,...,0,0,30.157895,67.0,8.0,-27.707059,-18.227551,-27.590337,-27.727856,14.16
1,342,2021,3422021,60,Black,Male,Michigan,49224,12980,20,...,0,0,80.2,100.0,52.0,-21.266317,0.272289,-21.27153,-21.282509,15.39
2,789,2017,7892017,43,White,Male,Michigan,49016,12980,20,...,0,0,82.545455,97.0,63.0,0.529493,0.376685,0.616187,0.55524,15.39
3,789,2020,7892020,46,White,Male,Michigan,49016,12980,20,...,0,0,82.545455,97.0,63.0,0.529493,0.376685,0.616187,0.55524,15.39
4,4279,2020,42792020,84,Black,Female,Michigan,49224,12980,10,...,0,0,80.2,100.0,52.0,-21.266317,0.272289,-21.27153,-21.282509,14.16


In [9]:
merged_df_train.columns

Index(['patient_id', 'year', 'ID', 'Age', 'Race', 'Sex', 'State', 'Zip Code',
       'MSA', 'Enrollment Type Categorized', 'Enrollment Months', 'HCC Score',
       'Avg. LOS', 'Diagnosis', '# Total Claims', '# Hospital OP',
       '# All Physician OP', '# Inpatient', '# Short Term Stay Hospital',
       '# Long Term Stay Hospital', '# Rehabilitation Hospital',
       '# Psychiatric Hospital', '# Readmissions', '% Readmissions', '# ER',
       '# ER Admissions', '# SNF', '# Non Swing Bed SNF Claim',
       '# Swing Bed SNF Claim', '# Home Health', '# Hospice', '# Labs',
       '# Part B Labs', '# Imaging', '# Part B Imaging', '# Part B E&M',
       '# Part B Drugs', '# Part B Ambulance', '# Dialysis',
       '# Part-B Dialysis', '# Rx Claims', '# DME', '# Miscellaneous',
       'TotalCost_Y_Actual', 'TotalCost_Y_Expected', 'dapagliflozin',
       'exenatide', 'glimepiride', 'glyburide and metformin hydrochloride',
       'insulin', 'metformin hydrochloride', 'pioglitazone',
       'sita

**Performing EDA**

In [10]:
merged_df_train.describe()

Unnamed: 0,ID,Age,Zip Code,Enrollment Type Categorized,Enrollment Months,HCC Score,Avg. LOS,Diagnosis,# Total Claims,# Hospital OP,...,pioglitazone,sitagliptin and metformin hydrochloride,Avg ADI,Max ADI,Min ADI,SVI1,SVI3,SVI4,SVI,MDI
count,16328.0,16328.0,16328.0,16328.0,16328.0,16254.0,16328.0,16328.0,16328.0,16328.0,...,16328.0,16328.0,16323.0,16323.0,16323.0,16314.0,16314.0,16314.0,16314.0,16180.0
mean,40590220.0,70.639331,48602.02707,11.762494,11.884248,1.174375,2.039874,38.517455,82.280132,8.812837,...,0.065715,0.020823,66.17949,91.121117,39.964835,-11.894648,-7.893884,-21.578732,-22.054581,14.397443
std,26260240.0,9.761983,1903.882326,3.831102,0.833668,0.843247,7.454327,23.534057,49.355265,8.110442,...,0.247791,0.142796,14.020581,10.637588,16.691933,16.136713,10.598677,33.762651,33.822063,0.545182
min,12019.0,20.0,14211.0,10.0,0.9996,0.14,0.0,1.0,4.0,0.0,...,0.0,0.0,14.181818,22.0,1.0,-199.382,-132.804447,-199.339873,-199.37642,10.22
25%,18379520.0,66.0,48429.0,10.0,12.0,0.61,0.0,21.0,49.0,4.0,...,0.0,0.0,55.857143,88.0,26.0,-20.122837,-16.095146,-24.285136,-28.376737,14.16
50%,37532020.0,71.0,48708.0,10.0,12.0,0.93,0.0,33.0,71.0,7.0,...,0.0,0.0,67.625,95.0,38.0,0.233687,0.0942,0.383752,0.252361,14.16
75%,60894520.0,77.0,48895.0,10.0,12.0,1.4375,0.0,50.0,102.0,11.0,...,0.0,0.0,75.578947,99.0,52.0,0.468933,0.202412,0.508021,0.414422,14.16
max,97262020.0,102.0,93725.0,31.0,12.0,8.71,96.0,203.0,629.0,173.0,...,1.0,1.0,97.8,100.0,97.0,0.740583,0.903254,0.775071,0.692433,24.58


In [11]:
missing_values = merged_df_train.isnull().sum()
print((missing_values))

patient_id      0
year            0
ID              0
Age             0
Race            0
             ... 
SVI1           14
SVI3           14
SVI4           14
SVI            14
MDI           148
Length: 61, dtype: int64


In [12]:
fig = px.bar(missing_values, x=missing_values.index, y=missing_values.values, labels={'y': 'Number of Missing Values'})
fig.show()

In [13]:
numeric_columns = ['Age', 'HCC Score', 'TotalCost_Y_Actual', 'TotalCost_Y_Expected', 'Avg ADI', 'Max ADI', 'Min ADI']

correlation_matrix = merged_df_train[numeric_columns].corr()

fig = ff.create_annotated_heatmap(
    z=correlation_matrix.values,
    x=list(correlation_matrix.columns),
    y=list(correlation_matrix.index),
    annotation_text=correlation_matrix.round(2).values,
    showscale=True,
    colorscale='Viridis'
)

fig.update_layout(title='Correlation Matrix Heatmap')
fig.show()

**Bar chart for Average Actual Cost and Expected Cost average per State**

In [14]:
state_avg = merged_df_train.groupby('State').agg({'TotalCost_Y_Actual': 'mean', 'TotalCost_Y_Expected': 'mean'}).reset_index()

# Create the plot
fig = px.bar(state_avg, x='State', y=['TotalCost_Y_Actual', 'TotalCost_Y_Expected'], barmode='group',
             labels={'value': 'Average Cost', 'variable': 'Cost Type'})

# Display the plot
fig.show()

In [15]:
grouped_actual = merged_df_train.groupby(['State', 'year'])['TotalCost_Y_Actual'].mean().reset_index()
grouped_expected = merged_df_train.groupby(['State', 'year'])['TotalCost_Y_Expected'].mean().reset_index()

# Merge the two grouped datasets
merged_grouped = grouped_actual.merge(grouped_expected, on=['State', 'year'])

# Melt the dataset to have a single cost column
melted_grouped = merged_grouped.melt(id_vars=['State', 'year'], value_vars=['TotalCost_Y_Actual', 'TotalCost_Y_Expected'], var_name='Cost Type', value_name='Cost')

# Create the bar plot
fig = px.bar(melted_grouped, x='State', y='Cost', color='year', text='Cost', facet_col='Cost Type', labels={'Cost': 'Average Cost'})

# Customize the plot
fig.update_layout(title='Average Actual and Expected Cost per State by Year',
                  xaxis_title='State',
                  yaxis_title='Average Cost')

# Show the plot
fig.show()


In [16]:
# Prepare data for the grouped bar plot
actual_costs = merged_df_train.groupby(['State', 'year'])['TotalCost_Y_Actual'].mean().reset_index()
expected_costs = merged_df_train.groupby(['State', 'year'])['TotalCost_Y_Expected'].mean().reset_index()

# Merge actual and expected costs into a single DataFrame
actual_costs['Type'] = 'Actual'
expected_costs['Type'] = 'Expected'
actual_costs.rename(columns={'TotalCost_Y_Actual': 'Cost'}, inplace=True)
expected_costs.rename(columns={'TotalCost_Y_Expected': 'Cost'}, inplace=True)

combined_data = pd.concat([actual_costs, expected_costs], ignore_index=True)

# Create the grouped bar plot
fig = px.bar(combined_data, x='State', y='Cost', color='Type', barmode='group', text='Cost', facet_col='year',
             labels={'Cost': 'Average Cost'})

# Customize the plot
fig.update_layout(title='Average Actual and Expected Cost per State by Year',
                  xaxis_title='State',
                  yaxis_title='Average Cost')

# Show the plot
fig.show()


In [17]:
df_grouped = merged_df_train.groupby(['year', 'State']).agg({
    'TotalCost_Y_Actual': 'mean',
    'TotalCost_Y_Expected': 'mean'
}).reset_index()

fig = px.scatter(df_grouped, x="State", y="TotalCost_Y_Actual", color="year", 
                 size="TotalCost_Y_Actual", hover_data=["TotalCost_Y_Actual"])
fig.add_scatter(x=df_grouped['State'], y=df_grouped['TotalCost_Y_Expected'], 
                mode='markers', name='Expected Cost', marker=dict(size=8))

fig.update_layout(
    title="Actual and Expected Cost Average per State Every Year",
    xaxis_title="State",
    yaxis_title="Cost ($)",
    legend_title="Year",
    height=600,
    hoverlabel=dict(
        bgcolor="white",
        font_size=12,
        font_family="Rockwell"
    )
)

fig.show()


In [18]:
fig = px.histogram(merged_df_train, x='Sex', title='Distribution of Patients by Sex')
fig.update_xaxes(title='Sex')
fig.update_yaxes(title='Count')
fig.show()

In [19]:
# Group the data by sex and calculate the average of the actual and expected cost
sex_avg = merged_df_train.groupby('Sex').agg({'TotalCost_Y_Actual': 'mean', 'TotalCost_Y_Expected': 'mean'}).reset_index()

# Create the plot
fig = px.bar(sex_avg, x='Sex', y=['TotalCost_Y_Actual', 'TotalCost_Y_Expected'], barmode='group',
             labels={'value': 'Average Cost', 'variable': 'Cost Type'})

# Display the plot
fig.show()

In [20]:
fig = px.histogram(merged_df_train, x='Race', title='Distribution of Patients by Race')
fig.update_xaxes(title='Race')
fig.update_yaxes(title='Count')
fig.show()

In [21]:
# Group the data by race and calculate the average of the actual and expected cost
race_avg = merged_df_train.groupby('Race').agg({'TotalCost_Y_Actual': 'mean', 'TotalCost_Y_Expected': 'mean'}).reset_index()

# Create the plot
fig = px.bar(race_avg, x='Race', y=['TotalCost_Y_Actual', 'TotalCost_Y_Expected'], barmode='group',
             labels={'value': 'Average Cost', 'variable': 'Cost Type'})

# Display the plot
fig.show()

In [22]:
avg_cost_by_race_sex = merged_df_train.groupby(['Race', 'Sex'])['TotalCost_Y_Actual'].mean().reset_index()

fig = px.bar(avg_cost_by_race_sex, x='Race', y='TotalCost_Y_Actual', color='Sex', title='Average Total Cost (Actual) by Race and Sex')
fig.update_xaxes(title='Race')
fig.update_yaxes(title='Average Total Cost (Actual)')
fig.show()


**average of the actual and expected cost by age group**

In [23]:
bins=[-float('inf'), 64, float('inf')]
labels = ['<64', '65>'] # Define the age group labels
merged_df_train['Age Group'] = pd.cut(merged_df_train['Age'], bins=bins, labels=labels, include_lowest=True) # Group the data by age bins
age_avg = merged_df_train.groupby('Age Group').agg({'TotalCost_Y_Actual': 'mean', 'TotalCost_Y_Expected': 'mean'}).reset_index()

# Create the plot
fig = px.bar(age_avg, x='Age Group', y=['TotalCost_Y_Actual', 'TotalCost_Y_Expected'], barmode='group',
             labels={'value': 'Average Cost', 'variable': 'Cost Type'})

# Display the plot
fig.show()

In [24]:
fig = px.histogram(merged_df_train, x='Age', nbins=20, title='Distribution of Age Among Patients')
fig.update_xaxes(title='Age')
fig.update_yaxes(title='Count')
fig.show()

In [25]:
fig = px.histogram(merged_df_train, x='TotalCost_Y_Actual', nbins=20, title='Distribution of Total Cost (Actual) Among Patients')
fig.update_xaxes(title='Total Cost (Actual)')
fig.update_yaxes(title='Count')
fig.show()

In [26]:
fig = px.scatter(merged_df_train, x='HCC Score', y='TotalCost_Y_Actual', title='Correlation Between HCC Score and Total Cost (Actual)')
fig.update_xaxes(title='HCC Score')
fig.update_yaxes(title='Total Cost (Actual)')
fig.show()


In [27]:
medications = ['dapagliflozin', 'exenatide', 'glimepiride', 'glyburide and metformin hydrochloride', 'insulin', 'metformin hydrochloride', 'pioglitazone', 'sitagliptin and metformin hydrochloride']
state_medication = merged_df_train.groupby('State')[medications].sum().reset_index()

fig = px.bar(state_medication, x='State', y=medications, title='Medication Usage Across States')
fig.show()


In [28]:
medication_actual_cost = merged_df_train.groupby('year')[medications + ['TotalCost_Y_Actual']].sum().reset_index()

fig = px.line(medication_actual_cost, x='year', y=medications + ['TotalCost_Y_Actual'], title='Medication Usage and Total Actual Cost Over Time')
fig.show()


In [29]:
# Select medication columns and the 'year' column
medication_columns = ['dapagliflozin', 'exenatide', 'glimepiride', 'glyburide and metformin hydrochloride', 'insulin', 'metformin hydrochloride', 'pioglitazone', 'sitagliptin and metformin hydrochloride']
selected_data = merged_df_train[['year'] + medication_columns]

# Melt medication columns into a single 'Medication' column and a 'Usage' column
melted_data = selected_data.melt(id_vars=['year'], var_name='Medication', value_name='Usage')

# Group data by 'year' and 'Medication', summing the 'Usage' column
grouped_data = melted_data.groupby(['year', 'Medication'])['Usage'].sum().reset_index()

# Select top 5 medications for each year based on usage
top5_medications_per_year = grouped_data.groupby('year').apply(lambda x: x.nlargest(5, 'Usage')).reset_index(drop=True)

# Create a bar chart for the usage of the top 5 medications per year
fig = px.bar(top5_medications_per_year, x='year', y='Usage', color='Medication', text='Medication', labels={'Usage': 'Medication Usage'})
fig.update_layout(title='Usage of Top Medications by Patient per Year', xaxis_title='Year', yaxis_title='Usage')
fig.show()

In [30]:
top_5_medications = ['dapagliflozin', 'exenatide', 'glimepiride', 'glyburide and metformin hydrochloride', 'insulin']
merged_df_train['Top Medications'] = merged_df_train[top_5_medications].idxmax(axis=1)

avg_cost_by_medication = merged_df_train.groupby('Top Medications')['TotalCost_Y_Actual'].mean().reset_index()

fig = px.bar(avg_cost_by_medication, x='Top Medications', y='TotalCost_Y_Actual', title='Average Total Cost (Actual) by Top 5 Prescribed Medications')
fig.update_xaxes(title='Top Prescribed Medications')
fig.update_yaxes(title='Average Total Cost (Actual)')
fig.show()

In [31]:
avg_hcc_by_medication = merged_df_train.groupby('Top Medications')['HCC Score'].mean().reset_index()

fig = px.bar(avg_hcc_by_medication, x='Top Medications', y='HCC Score', title='Average HCC Score by Medication')
fig.update_xaxes(title='Top Prescribed Medications')
fig.update_yaxes(title='Average HCC Score')
fig.show()


In [32]:
avg_readmissions_by_medication = merged_df_train.groupby('Top Medications')['% Readmissions'].mean().reset_index()

fig = px.bar(avg_readmissions_by_medication, x='Top Medications', y='% Readmissions', title='Average Percentage of Readmissions by Medication')
fig.update_xaxes(title='Top Prescribed Medications')
fig.update_yaxes(title='Average Percentage of Readmissions')
fig.show()

In [33]:
avg_cost_by_readmissions = merged_df_train.groupby('# Readmissions')['TotalCost_Y_Actual'].mean().reset_index()

fig = px.line(avg_cost_by_readmissions, x='# Readmissions', y='TotalCost_Y_Actual', title='Average Total Cost (Actual) by Number of Readmissions')
fig.update_xaxes(title='Number of Readmissions')
fig.update_yaxes(title='Average Total Cost (Actual)')
fig.show()


In [34]:
avg_cost_by_er_visits = merged_df_train.groupby('# ER')['TotalCost_Y_Actual'].mean().reset_index()

fig = px.line(avg_cost_by_er_visits, x='# ER', y='TotalCost_Y_Actual', title='Average Total Cost (Actual) by Number of ER Visits')
fig.update_xaxes(title='Number of ER Visits')
fig.update_yaxes(title='Average Total Cost (Actual)')
fig.show()


In [35]:
fig = px.histogram(merged_df_train, x='Avg ADI', title='Distribution of Average Area Deprivation Index (ADI)')
fig.update_xaxes(title='Average ADI')
fig.update_yaxes(title='Count')
fig.show()

In [36]:
merged_df_train['ADI_bin'] = pd.qcut(merged_df_train['Avg ADI'], 5, labels=False)
avg_cost_by_ADI = merged_df_train.groupby('ADI_bin')['TotalCost_Y_Actual'].mean().reset_index()

fig = px.bar(avg_cost_by_ADI, x='ADI_bin', y='TotalCost_Y_Actual', title='Average Total Cost (Actual) by Average ADI (Binned)')
fig.update_xaxes(title='Average ADI (Binned)')
fig.update_yaxes(title='Average Total Cost (Actual)')
fig.show()

In [37]:
avg_cost_by_ADI_medication = merged_df_train.groupby(['ADI_bin', 'Top Medications'])['TotalCost_Y_Actual'].mean().reset_index()

fig = px.bar(avg_cost_by_ADI_medication, x='ADI_bin', y='TotalCost_Y_Actual', color='Top Medications', title='Average Total Cost (Actual) by Average ADI (Binned) and Medication')
fig.update_xaxes(title='Average ADI (Binned)')
fig.update_yaxes(title='Average Total Cost (Actual)')
fig.show()


In [38]:
avg_cost_by_ADI_state = merged_df_train.groupby(['ADI_bin', 'State'])['TotalCost_Y_Actual'].mean().reset_index()

fig = px.bar(avg_cost_by_ADI_state, x='ADI_bin', y='TotalCost_Y_Actual', color='State', title='Average Total Cost (Actual) by Average ADI (Binned) and State')
fig.update_xaxes(title='Average ADI (Binned)')
fig.update_yaxes(title='Average Total Cost (Actual)')
fig.show()


In [39]:
avg_er_by_ADI_year_medication = merged_df_train.groupby(['ADI_bin', 'year', 'Top Medications'])['# ER'].mean().reset_index()

fig = px.bar(avg_er_by_ADI_year_medication, x='ADI_bin', y='# ER', color='Top Medications', facet_col='year', title='Average ER Visits by Average ADI (Binned), Year, and Medication')
fig.update_xaxes(title='Average ADI (Binned)')
fig.update_yaxes(title='Average ER Visits')
fig.show()

In [40]:
fig = make_subplots(rows=2, cols=2, subplot_titles=("Age Distribution", "HCC Score Distribution", "Avg ADI Distribution", "TotalCost_Y_Actual Distribution"))

fig.add_trace(go.Histogram(x=merged_df_train['Age'], nbinsx=20, name='Age'), row=1, col=1)
fig.add_trace(go.Histogram(x=merged_df_train['HCC Score'], nbinsx=20, name='HCC Score'), row=1, col=2)
fig.add_trace(go.Histogram(x=merged_df_train['Avg ADI'], nbinsx=20, name='Avg ADI'), row=2, col=1)
fig.add_trace(go.Histogram(x=merged_df_train['TotalCost_Y_Actual'], nbinsx=20, name='TotalCost_Y_Actual'), row=2, col=2)

fig.update_layout(title="Univariate Analysis: Age, HCC Score, Avg ADI, and TotalCost_Y_Actual", showlegend=False)
fig.show()


In [41]:
fig = make_subplots(rows=1, cols=3, subplot_titles=("TotalCost_Y_Actual vs Age", "TotalCost_Y_Actual vs HCC Score", "TotalCost_Y_Actual vs Avg ADI"))

fig.add_trace(go.Scatter(x=merged_df_train['Age'], y=merged_df_train['TotalCost_Y_Actual'], mode='markers', name='Age'), row=1, col=1)
fig.add_trace(go.Scatter(x=merged_df_train['HCC Score'], y=merged_df_train['TotalCost_Y_Actual'], mode='markers', name='HCC Score'), row=1, col=2)
fig.add_trace(go.Scatter(x=merged_df_train['Avg ADI'], y=merged_df_train['TotalCost_Y_Actual'], mode='markers', name='Avg ADI'), row=1, col=3)

fig.update_layout(title="Bivariate Analysis: TotalCost_Y_Actual vs. Age, HCC Score, and Avg ADI", showlegend=False)
fig.show()


In [42]:
fig = px.scatter_3d(merged_df_train, x='Age', y='HCC Score', z='TotalCost_Y_Actual', color='Avg ADI', size='TotalCost_Y_Expected', hover_name='State', opacity=0.7)

fig.update_layout(title="Multivariate Analysis: TotalCost_Y_Actual vs. Age, HCC Score, and Avg ADI (Color: Avg ADI, Size: TotalCost_Y_Expected)")

fig.show()


In [43]:
adi_by_state_race = merged_df_train.groupby(['State', 'Race'])['Avg ADI'].mean().reset_index()
fig = px.bar(adi_by_state_race, x='State', y='Avg ADI', color='Race', title='Average ADI by State and Race')
fig.show()


**Reading Data**

In [44]:
medication_df = pd.read_csv("/kaggle/input/high-cost-diabetes-patients-prediction/Softec_Medication_Data_Kaggle_V1.csv")
patient_data_training_df = pd.read_csv("/kaggle/input/high-cost-diabetes-patients-prediction/Softec_Patient_Data_Training_Kaggle_V1.csv")
socio_economic_df = pd.read_csv("/kaggle/input/high-cost-diabetes-patients-prediction/Softec_Socio_Economics_Data_Kaggle_V1.csv")
patient_prediction_data_df = pd.read_csv("/kaggle/input/high-cost-diabetes-patients-prediction/Softec_Patient_Data_Testing_Kaggle_V1.csv")
patient_testing_data_df = pd.read_csv("/kaggle/input/high-cost-diabetes-patients-prediction/Softec_Patient_Data_Testing_Features.csv")

In [45]:
# merge the datasets on a common column
merged_df_train = pd.merge(patient_data_training_df, medication_df, on='ID')
merged_df_train = pd.merge(merged_df_train, socio_economic_df, on='ID')

In [46]:
merged_df_test = pd.merge(patient_testing_data_df, medication_df, on='ID')
merged_df_test = pd.merge(merged_df_test, socio_economic_df, on='ID')

In [47]:
null_counts = merged_df_train.isnull().sum()
print(null_counts)

ID                                           0
Age                                          0
Race                                         0
Sex                                          0
State                                        0
Zip Code                                     0
MSA                                          0
Enrollment Type Categorized                  0
Enrollment Months                            0
HCC Score                                   74
Avg. LOS                                     0
Diagnosis                                    0
# Total Claims                               0
# Hospital OP                                0
# All Physician OP                           0
# Inpatient                                381
# Short Term Stay Hospital                   0
# Long Term Stay Hospital                    0
# Rehabilitation Hospital                    0
# Psychiatric Hospital                       0
# Readmissions                             272
% Readmission

In [48]:
# Compare the two columns and get a boolean mask for rows where the values are different
different_rows_mask = merged_df_train['TotalCost_Y_Actual'] != merged_df_train['TotalCost_Y_Expected']

# Create a new DataFrame containing only the rows where the values are different
new_df = merged_df_train[different_rows_mask]

In [49]:
new_df

Unnamed: 0,ID,Age,Race,Sex,State,Zip Code,MSA,Enrollment Type Categorized,Enrollment Months,HCC Score,...,pioglitazone,sitagliptin and metformin hydrochloride,Avg ADI,Max ADI,Min ADI,SVI1,SVI3,SVI4,SVI,MDI
51,4062018,55,White,Male,Michigan,48708,13020,20,12.0,0.83,...,0,0,86.428571,100.0,63.0,0.488592,0.451766,0.481356,0.491430,15.39
95,7202021,78,White,Male,Michigan,48706,13020,10,12.0,1.07,...,0,0,75.354839,100.0,39.0,0.518867,0.202722,0.501356,0.463585,14.16
240,16152021,60,White,Female,Michigan,48706,13020,21,12.0,1.24,...,0,0,75.354839,100.0,39.0,0.518867,0.202722,0.501356,0.463585,15.39
269,17822020,81,White,Male,Michigan,48706,13020,10,12.0,1.02,...,0,0,75.354839,100.0,39.0,0.518867,0.202722,0.501356,0.463585,14.16
334,21322017,91,White,Female,Michigan,48650,13020,10,12.0,2.32,...,0,0,72.777778,84.0,42.0,0.528620,0.091240,0.626820,0.515980,14.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16080,21262018,79,White,Female,Michigan,48893,99022,10,12.0,1.97,...,0,0,67.500000,88.0,44.0,0.630550,0.275100,0.625072,0.496167,14.16
16117,24102019,73,White,Male,Michigan,48656,99022,10,12.0,1.95,...,0,0,87.625000,98.0,66.0,0.553754,0.088992,0.502515,0.507277,14.16
16120,24192021,69,White,Male,Michigan,48418,99022,10,12.0,1.25,...,0,0,51.500000,66.0,39.0,0.190317,0.098542,0.251535,0.151908,14.16
16220,35192018,84,White,Female,Michigan,48622,99022,10,12.0,1.55,...,0,0,78.666667,95.0,54.0,0.740583,0.088717,0.628092,0.640567,14.16


In [50]:
new_df = new_df.copy()  # Create an explicit copy of the DataFrame

le = LabelEncoder()

# Use .loc[] to assign new values and avoid the SettingWithCopyWarning
new_df['Race'] = le.fit_transform(new_df['Race'])
new_df['Sex'] = le.fit_transform(new_df['Sex'])
new_df['State'] = le.fit_transform(new_df['State'])

In [51]:
null_counts = new_df.isnull().sum()
print(null_counts)

ID                                          0
Age                                         0
Race                                        0
Sex                                         0
State                                       0
Zip Code                                    0
MSA                                         0
Enrollment Type Categorized                 0
Enrollment Months                           0
HCC Score                                   3
Avg. LOS                                    0
Diagnosis                                   0
# Total Claims                              0
# Hospital OP                               0
# All Physician OP                          0
# Inpatient                                15
# Short Term Stay Hospital                  0
# Long Term Stay Hospital                   0
# Rehabilitation Hospital                   0
# Psychiatric Hospital                      0
# Readmissions                              6
% Readmissions                    

In [52]:
imputer = KNNImputer(n_neighbors=7)
new_df = pd.DataFrame(imputer.fit_transform(new_df), columns=new_df.columns)

# patient_data_training_df = patient_data_training_df.dropna(axis=0)

new_df['MSA']= pd.to_numeric(new_df['MSA'], errors='coerce').astype(int)
new_df['Zip Code']= pd.to_numeric(new_df['Zip Code'], errors='coerce').astype(int)
new_df['Diagnosis']= pd.to_numeric(new_df['Diagnosis'], errors='coerce').astype(int)
new_df['# Home Health']= pd.to_numeric(new_df['# Home Health'], errors='coerce').astype(int)
new_df['# Labs']= pd.to_numeric(new_df['# Labs'], errors='coerce').astype(int)
new_df['HCC Score']= pd.to_numeric(new_df['HCC Score'], errors='coerce').astype(int)

In [53]:
# create a new column Y based on TotalCost_Y_Actual or TotalCost_Y_Expected
new_df['TotalCost_Y_Actual'] = pd.cut(new_df['TotalCost_Y_Actual'], bins=[-float('inf'), 11000, float('inf')], labels=[0, 1])
new_df['TotalCost_Y_Expected'] = pd.cut(new_df['TotalCost_Y_Expected'], bins=[-float('inf'), 11000, float('inf')], labels=[0, 1])

# categorize Diagnosis, # Home health, #Labs columns
new_df['Diagnosis'] = pd.cut(new_df['Diagnosis'], bins=[-float('inf'), 0, 1, 2, 3, 6, 13, float('inf')], labels=[0, 1, 2, 3, 4, 5, 7])
new_df['# Home health'] = pd.cut(new_df['# Home Health'], bins=[-float('inf'), 0, 1, 2, 3, 6, 13, float('inf')], labels=[0, 1, 2, 3, 4, 5, 7])
new_df['# Labs'] = pd.cut(new_df['# Labs'], bins=[-float('inf'), 0, 1, 2, 3, 6, 13, float('inf')], labels=[0, 1, 2, 3, 4, 5, 7])

# categorize HCC Score, Age, and other demographic features
new_df['HCC Score'] = pd.cut(new_df['HCC Score'], bins=[-float('inf'), 1, float('inf')], labels=[0, 1])
new_df['Age'] = pd.cut(new_df['Age'], bins=[-float('inf'), 64, float('inf')], labels=[0, 1])

In [54]:
new_df['TotalCost_Y_Actual'] = pd.to_numeric(new_df['TotalCost_Y_Actual'], errors='coerce')
new_df['TotalCost_Y_Expected'] = pd.to_numeric(new_df['TotalCost_Y_Expected'], errors='coerce')

In [55]:
# Calculate the correlation matrix
correlation_matrix = new_df.corr()

# Get the correlations of 'TotalCost_Y_Actual' and 'TotalCost_Y_Expected' with all the other features
total_cost_actual_correlations = correlation_matrix['TotalCost_Y_Actual']
total_cost_expected_correlations = correlation_matrix['TotalCost_Y_Expected']
# Combine the two correlation Series into a new DataFrame
correlations_df = pd.concat([total_cost_actual_correlations, total_cost_expected_correlations], axis=1)
correlations_df.columns = ['TotalCost_Y_Actual', 'TotalCost_Y_Expected']
sorted_df = correlations_df.abs().sort_values(by='TotalCost_Y_Actual', ascending=False)

# Display the correlation DataFrame
print((sorted_df))

                                         TotalCost_Y_Actual  \
TotalCost_Y_Actual                                 1.000000   
TotalCost_Y_Expected                               0.817287   
# Part B Imaging                                   0.174183   
# Part B E&M                                       0.172832   
# All Physician OP                                 0.148313   
# Part B Labs                                      0.123819   
# Imaging                                          0.104365   
# DME                                              0.099432   
SVI4                                               0.094210   
# Inpatient                                        0.093190   
SVI                                                0.091556   
# Hospital OP                                      0.088681   
insulin                                            0.079527   
# ER                                               0.075269   
Min ADI                                            0.07





In [56]:
higher_actual = (sorted_df['TotalCost_Y_Actual'] > sorted_df['TotalCost_Y_Expected']).sum()
higher_expected = (sorted_df['TotalCost_Y_Expected'] > sorted_df['TotalCost_Y_Actual']).sum()

print(f"TotalCost_Y_Actual has {higher_actual} higher values.")
print(f"TotalCost_Y_Expected has {higher_expected} higher values.")

if higher_actual > higher_expected:
    print("TotalCost_Y_Actual has more higher values.")
elif higher_expected > higher_actual:
    print("TotalCost_Y_Expected has more higher values.")
else:
    print("Both columns have an equal number of higher values.")

TotalCost_Y_Actual has 33 higher values.
TotalCost_Y_Expected has 20 higher values.
TotalCost_Y_Actual has more higher values.


In [57]:
patient_id = merged_df_train["ID"]
temp = list(patient_id)
ids = []
year = []
for i in range (len(temp)):
    mystr = str(temp[i])
    year.append(mystr[-4:])
    ids.append(mystr[:-4])

merged_df_train['patient_id'] = ids
merged_df_train['year'] = year

# patient_data_training_df.drop(['ID'], inplace = True, axis =1)

cols = merged_df_train.columns.tolist()
cols = cols[-1:] + cols[:-1]
merged_df_train = merged_df_train[cols]
cols = cols[-1:] + cols[:-1]
merged_df_train = merged_df_train[cols]

In [58]:
merged_df_train.drop(['TotalCost_Y_Expected'], inplace = True, axis =1)

In [59]:
merged_df_train.head()

Unnamed: 0,patient_id,year,ID,Age,Race,Sex,State,Zip Code,MSA,Enrollment Type Categorized,...,pioglitazone,sitagliptin and metformin hydrochloride,Avg ADI,Max ADI,Min ADI,SVI1,SVI3,SVI4,SVI,MDI
0,6880,2021,68802021,68,White,Male,Michigan,48176,11460,10,...,0,0,30.157895,67.0,8.0,-27.707059,-18.227551,-27.590337,-27.727856,14.16
1,342,2021,3422021,60,Black,Male,Michigan,49224,12980,20,...,0,0,80.2,100.0,52.0,-21.266317,0.272289,-21.27153,-21.282509,15.39
2,789,2017,7892017,43,White,Male,Michigan,49016,12980,20,...,0,0,82.545455,97.0,63.0,0.529493,0.376685,0.616187,0.55524,15.39
3,789,2020,7892020,46,White,Male,Michigan,49016,12980,20,...,0,0,82.545455,97.0,63.0,0.529493,0.376685,0.616187,0.55524,15.39
4,4279,2020,42792020,84,Black,Female,Michigan,49224,12980,10,...,0,0,80.2,100.0,52.0,-21.266317,0.272289,-21.27153,-21.282509,14.16


In [60]:
null_counts = merged_df_train.isnull().sum()
print(null_counts)

patient_id                                   0
year                                         0
ID                                           0
Age                                          0
Race                                         0
Sex                                          0
State                                        0
Zip Code                                     0
MSA                                          0
Enrollment Type Categorized                  0
Enrollment Months                            0
HCC Score                                   74
Avg. LOS                                     0
Diagnosis                                    0
# Total Claims                               0
# Hospital OP                                0
# All Physician OP                           0
# Inpatient                                381
# Short Term Stay Hospital                   0
# Long Term Stay Hospital                    0
# Rehabilitation Hospital                    0
# Psychiatric

In [61]:
def remove_outliers_zscore(df, threshold=2.7, ignore_cols=[]):
    # Iterate over each column in the DataFrame
    for column in df.select_dtypes(include=np.number).columns:
        # Ignore columns in the ignore_cols list
        if column in ignore_cols:
            continue
        
        # Calculate the z-score of the column
        z_scores = np.abs((df[column] - df[column].mean()) / df[column].std())
        
        # Identify outliers based on the threshold
        outliers = z_scores > threshold
        
        # Remove outliers from the column
        df = df[~outliers]
    
    return df

In [62]:
clean_data = remove_outliers_zscore(merged_df_train, ignore_cols=['ID'])
merged_df_train=clean_data
clean_data

Unnamed: 0,patient_id,year,ID,Age,Race,Sex,State,Zip Code,MSA,Enrollment Type Categorized,...,pioglitazone,sitagliptin and metformin hydrochloride,Avg ADI,Max ADI,Min ADI,SVI1,SVI3,SVI4,SVI,MDI
4,4279,2020,42792020,84,Black,Female,Michigan,49224,12980,10,...,0,0,80.200000,100.0,52.0,-21.266317,0.272289,-21.271530,-21.282509,14.16
5,4279,2018,42792018,82,Black,Female,Michigan,49224,12980,10,...,0,0,80.200000,100.0,52.0,-21.266317,0.272289,-21.271530,-21.282509,14.16
6,4279,2019,42792019,83,Black,Female,Michigan,49224,12980,10,...,0,0,80.200000,100.0,52.0,-21.266317,0.272289,-21.271530,-21.282509,14.16
7,4279,2017,42792017,81,Black,Female,Michigan,49224,12980,10,...,0,0,80.200000,100.0,52.0,-21.266317,0.272289,-21.271530,-21.282509,14.16
8,6867,2019,68672019,76,White,Male,Michigan,49068,12980,10,...,0,0,64.000000,85.0,41.0,0.529493,0.376685,0.616187,0.555240,14.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16315,4372,2019,43722019,70,White,Female,Michigan,48858,99022,10,...,0,0,73.607143,93.0,51.0,0.320495,0.160248,0.459319,0.325162,14.16
16319,4389,2017,43892017,73,White,Female,Michigan,48893,99022,10,...,0,0,67.500000,88.0,44.0,0.630550,0.275100,0.625072,0.496167,14.16
16320,4389,2018,43892018,74,White,Female,Michigan,48893,99022,10,...,0,0,67.500000,88.0,44.0,0.630550,0.275100,0.625072,0.496167,14.16
16323,4434,2019,44342019,65,White,Male,Michigan,48858,99022,10,...,0,0,73.607143,93.0,51.0,0.320495,0.160248,0.459319,0.325162,14.16


In [63]:
le = LabelEncoder()

merged_df_train['Race'] = le.fit_transform(merged_df_train['Race'])
merged_df_train['Sex'] = le.fit_transform(merged_df_train['Sex'])
merged_df_train['State'] = le.fit_transform(merged_df_train['State'])
merged_df_train['MSA'] = merged_df_train['MSA'].replace('Unknown', np.nan)
merged_df_train['MSA']= pd.to_numeric(merged_df_train['MSA'], errors='coerce').fillna(0).astype(int)
merged_df_train['Zip Code']= pd.to_numeric(merged_df_train['Zip Code'], errors='coerce').fillna(0).astype(int)

In [64]:
merged_df_train = merged_df_train.apply(lambda col: col.fillna(col.mean()), axis=0)

In [65]:
null_counts = merged_df_train.isnull().sum()
print(null_counts)

patient_id                                 0
year                                       0
ID                                         0
Age                                        0
Race                                       0
Sex                                        0
State                                      0
Zip Code                                   0
MSA                                        0
Enrollment Type Categorized                0
Enrollment Months                          0
HCC Score                                  0
Avg. LOS                                   0
Diagnosis                                  0
# Total Claims                             0
# Hospital OP                              0
# All Physician OP                         0
# Inpatient                                0
# Short Term Stay Hospital                 0
# Long Term Stay Hospital                  0
# Rehabilitation Hospital                  0
# Psychiatric Hospital                     0
# Readmiss

In [66]:
merged_df_train['TotalCost_Y_Actual'] = pd.cut(merged_df_train['TotalCost_Y_Actual'], bins=[-float('inf'), 11000, float('inf')], labels=[0, 1])

In [67]:
merged_df_train['TotalCost_Y_Actual'] = pd.to_numeric(merged_df_train['TotalCost_Y_Actual'], errors='coerce')

In [68]:
# Calculate the correlation matrix
correlation_matrix = merged_df_train.corr()

# Get the correlations of 'TotalCost_Y_Actual' and 'TotalCost_Y_Expected' with all the other features
total_cost_actual_correlations = correlation_matrix['TotalCost_Y_Actual']

# Combine the two correlation Series into a new DataFrame
correlations_df = pd.concat([total_cost_actual_correlations], axis=1)
correlations_df.columns = ['TotalCost_Y_Actual']

# Filter correlations below cutoff
cutoff = 0.07
correlations_df = correlations_df[(correlations_df.abs() >= cutoff).any(axis=1)]

# Sort correlations by absolute value of 'TotalCost_Y_Actual'
sorted_df = correlations_df.abs().sort_values(by='TotalCost_Y_Actual', ascending=False)

# Display the correlation DataFrame
print((sorted_df))

                    TotalCost_Y_Actual
TotalCost_Y_Actual            1.000000
HCC Score                     0.166027
Diagnosis                     0.139961
# Total Claims                0.122930
# All Physician OP            0.109542
# Hospital OP                 0.095042
# Part B E&M                  0.085884
# Part B Drugs                0.085828
# Part B Imaging              0.085590
# Part B Labs                 0.079772
Age                           0.076935
# Rx Claims                   0.076830
# Labs                        0.073563






In [69]:
top_features = sorted_df['TotalCost_Y_Actual'].nlargest(28).index.tolist()[2:]
X = merged_df_train[top_features]
X.columns

Index(['Diagnosis', '# Total Claims', '# All Physician OP', '# Hospital OP',
       '# Part B E&M', '# Part B Drugs', '# Part B Imaging', '# Part B Labs',
       'Age', '# Rx Claims', '# Labs'],
      dtype='object')

In [70]:
y= merged_df_train['TotalCost_Y_Actual']
y

4        0
5        0
6        0
7        0
8        0
        ..
16315    0
16319    0
16320    1
16323    0
16324    1
Name: TotalCost_Y_Actual, Length: 8087, dtype: int64

**Normalizing data**

In [71]:
scaler = StandardScaler()
# keep our unscaled features just in case we need to process them alternatively
features_scaled = X
X_scaled = scaler.fit_transform(features_scaled)

scaler = MinMaxScaler()
# keep our unscaled features just in case we need to process them alternatively
features_minmax = X
X_minmax = scaler.fit_transform(features_minmax)

**Splitting the data**

In [72]:
X_train_scaled,X_test_scaled , y_train, y_test = train_test_split(
    X_scaled, 
    y, 
    test_size=.05, 
    random_state=69
)

**Traing and Testing our models**

In [73]:
from sklearn.metrics import precision_score, recall_score

classification_models = [
    KNeighborsClassifier(),
    SVC(kernel='linear', probability=True),
    SVC(kernel='rbf', probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    xgb.XGBClassifier(),
    LogisticRegression(max_iter=1000, penalty='l2', C=0.3),
    SVC(kernel='poly', probability=True),
    GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=7, random_state=30)
]

scores = []
for model in classification_models:
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = model.score(X_test_scaled, y_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted')

    model_name = type(model).__name__
    if model_name == 'SVC' and model.kernel == 'rbf':
        model_name += ' RBF kernel'
    if model_name == 'SVC' and model.kernel == 'linear':
        model_name += ' linear'

    scores.append((model_name, f'{100 * accuracy:.2f}%', f'{100 * f1:.2f}%', f'{100 * precision:.2f}%', f'{100 * recall:.2f}%'))

# Make it pretty
scores_df = pd.DataFrame(scores, columns=['Classifier', 'Accuracy Score', 'F1 Score', 'Precision Score', 'Recall Score'])
scores_df.sort_values(by='Accuracy Score', axis=0, ascending=False)


Unnamed: 0,Classifier,Accuracy Score,F1 Score,Precision Score,Recall Score
1,SVC linear,84.94%,78.02%,72.15%,84.94%
2,SVC RBF kernel,84.94%,78.02%,72.15%,84.94%
4,RandomForestClassifier,84.94%,78.02%,72.15%,84.94%
5,AdaBoostClassifier,84.94%,78.02%,72.15%,84.94%
9,LogisticRegression,84.94%,78.02%,72.15%,84.94%
10,SVC,84.94%,78.02%,72.15%,84.94%
11,GradientBoostingClassifier,84.20%,78.50%,76.65%,84.20%
8,XGBClassifier,83.70%,77.83%,74.29%,83.70%
0,KNeighborsClassifier,83.46%,77.28%,71.95%,83.46%
7,QuadraticDiscriminantAnalysis,80.99%,77.33%,74.74%,80.99%


In [74]:
from sklearn.metrics import roc_curve, auc

# Initialize a Plotly figure
fig = go.Figure()

# Iterate over classification models
for model in classification_models:
    
    # Predict probabilities
    y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]
    
    # Calculate ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    
    # Calculate AUC
    roc_auc = auc(fpr, tpr)
    
    # Get the model name
    model_name = type(model).__name__
    if model_name == 'SVC' and model.kernel == 'rbf':
        model_name += ' RBF kernel'
    if model_name == 'SVC' and model.kernel == 'linear':
        model_name += ' linear'
    
    # Add ROC curve to the plot
    fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'{model_name} (AUC = {roc_auc:.2f})'))

# Add a diagonal line to represent a random classifier
fig.add_shape(type='line', x0=0, x1=1, y0=0, y1=1, yref='y', xref='x', line=dict(color='gray', dash='dash'))

# Update plot layout
fig.update_layout(
    title='ROC Curves',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    showlegend=True
)

# Show the plot
fig.show()

In [75]:
# Create subplots for accuracy and F1 scores
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'bar'}, {'type': 'bar'}]], subplot_titles=['Accuracy Score', 'F1 Score'])

# Add bar chart for accuracy scores
fig.add_trace(go.Bar(x=scores_df['Classifier'], y=scores_df['Accuracy Score'], name='Accuracy Score'), row=1, col=1)

# Add bar chart for F1 scores
fig.add_trace(go.Bar(x=scores_df['Classifier'], y=scores_df['F1 Score'], name='F1 Score'), row=1, col=2)

# Update layout
fig.update_layout(title='Classifier Performance', showlegend=False, yaxis_tickformat='%')
fig.update_yaxes(title_text='Score', row=1, col=1)
fig.update_yaxes(title_text='Score', row=1, col=2)

# Show the plot
fig.show()

In [76]:
# Create subplots for accuracy, F1 scores, precision, and recall
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'bar'}, {'type': 'bar'}]], subplot_titles=['Precision Score', 'Recall Score'])

# Add bar chart for precision scores
fig.add_trace(go.Bar(x=scores_df['Classifier'], y=scores_df['Precision Score'], name='Precision Score'), row=1, col=1)

# Add bar chart for recall scores
fig.add_trace(go.Bar(x=scores_df['Classifier'], y=scores_df['Recall Score'], name='Recall Score'), row=1, col=2)

# Update layout
fig.update_layout(title='Classifier Performance', showlegend=False, yaxis_tickformat='%')
fig.update_yaxes(title_text='Score', row=1, col=1)
fig.update_yaxes(title_text='Score', row=1, col=2)

# Show the plot
fig.show()

**Smoothing the data**

In [77]:
from imblearn.over_sampling import ADASYN
# Apply the ADASYN sampling
adasyn = ADASYN(sampling_strategy='minority', random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train_scaled, y_train)
# Check the class distribution before and after applying ADASYN
print("Class distribution before ADASYN:", pd.Series(y_train).value_counts())
print("Class distribution after ADASYN:", pd.Series(y_resampled).value_counts())

Class distribution before ADASYN: 0    6554
1    1128
Name: TotalCost_Y_Actual, dtype: int64
Class distribution after ADASYN: 1    6702
0    6554
Name: TotalCost_Y_Actual, dtype: int64


In [78]:
from sklearn.metrics import precision_score, recall_score

classification_models = [
    KNeighborsClassifier(),
    SVC(kernel='linear', probability=True),
    SVC(kernel='rbf', probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    xgb.XGBClassifier(),
    LogisticRegression(max_iter=1000, penalty='l2', C=0.3),
    SVC(kernel='poly', probability=True),
    GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=7, random_state=30)
]

scores = []
for model in classification_models:
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test_scaled)
    accuracy = model.score(X_test_scaled, y_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted')

    model_name = type(model).__name__
    if model_name == 'SVC' and model.kernel == 'rbf':
        model_name += ' RBF kernel'
    if model_name == 'SVC' and model.kernel == 'linear':
        model_name += ' linear'

    scores.append((model_name, f'{100 * accuracy:.2f}%', f'{100 * f1:.2f}%', f'{100 * precision:.2f}%', f'{100 * recall:.2f}%'))

# Make it pretty
scores_df = pd.DataFrame(scores, columns=['Classifier', 'Accuracy Score', 'F1 Score', 'Precision Score', 'Recall Score'])
scores_df.sort_values(by='Accuracy Score', axis=0, ascending=False)


Unnamed: 0,Classifier,Accuracy Score,F1 Score,Precision Score,Recall Score
11,GradientBoostingClassifier,84.94%,79.31%,80.12%,84.94%
4,RandomForestClassifier,83.21%,78.65%,76.54%,83.21%
8,XGBClassifier,82.96%,77.81%,74.70%,82.96%
10,SVC,77.04%,77.92%,78.95%,77.04%
5,AdaBoostClassifier,75.56%,77.37%,79.85%,75.56%
3,DecisionTreeClassifier,73.33%,74.56%,75.96%,73.33%
6,GaussianNB,65.68%,70.34%,79.91%,65.68%
1,SVC linear,62.96%,68.19%,80.27%,62.96%
0,KNeighborsClassifier,61.98%,67.12%,76.51%,61.98%
2,SVC RBF kernel,60.74%,66.23%,77.46%,60.74%


In [79]:
from sklearn.metrics import roc_curve, auc

# Initialize a Plotly figure
fig = go.Figure()

# Iterate over classification models
for model in classification_models:
    model.fit(X_resampled, y_resampled)
    
    # Predict probabilities
    y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]
    
    # Calculate ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    
    # Calculate AUC
    roc_auc = auc(fpr, tpr)
    
    # Get the model name
    model_name = type(model).__name__
    if model_name == 'SVC' and model.kernel == 'rbf':
        model_name += ' RBF kernel'
    if model_name == 'SVC' and model.kernel == 'linear':
        model_name += ' linear'
    
    # Add ROC curve to the plot
    fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'{model_name} (AUC = {roc_auc:.2f})'))

# Add a diagonal line to represent a random classifier
fig.add_shape(type='line', x0=0, x1=1, y0=0, y1=1, yref='y', xref='x', line=dict(color='gray', dash='dash'))

# Update plot layout
fig.update_layout(
    title='ROC Curves',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    showlegend=True
)

# Show the plot
fig.show()

In [80]:
# Create subplots for accuracy and F1 scores
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'bar'}, {'type': 'bar'}]], subplot_titles=['Accuracy Score', 'F1 Score'])

# Add bar chart for accuracy scores
fig.add_trace(go.Bar(x=scores_df['Classifier'], y=scores_df['Accuracy Score'], name='Accuracy Score'), row=1, col=1)

# Add bar chart for F1 scores
fig.add_trace(go.Bar(x=scores_df['Classifier'], y=scores_df['F1 Score'], name='F1 Score'), row=1, col=2)

# Update layout
fig.update_layout(title='Classifier Performance', showlegend=False, yaxis_tickformat='%')
fig.update_yaxes(title_text='Score', row=1, col=1)
fig.update_yaxes(title_text='Score', row=1, col=2)

# Show the plot
fig.show()

In [81]:
# Create subplots for accuracy, F1 scores, precision, and recall
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'bar'}, {'type': 'bar'}]], subplot_titles=['Precision Score', 'Recall Score'])

# Add bar chart for precision scores
fig.add_trace(go.Bar(x=scores_df['Classifier'], y=scores_df['Precision Score'], name='Precision Score'), row=1, col=1)

# Add bar chart for recall scores
fig.add_trace(go.Bar(x=scores_df['Classifier'], y=scores_df['Recall Score'], name='Recall Score'), row=1, col=2)

# Update layout
fig.update_layout(title='Classifier Performance', showlegend=False, yaxis_tickformat='%')
fig.update_yaxes(title_text='Score', row=1, col=1)
fig.update_yaxes(title_text='Score', row=1, col=2)

# Show the plot
fig.show()