## Part 1: Building up a basic predictive model

### Data Visualization

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Read 'cleaned_data.csv' (path is relative to the current working directory)
# into the DataFrame that every later cell plots from.
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

# Display the first five rows as a quick check that the load worked.
df.head(n=5)

Unnamed: 0,patient_nbr,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,...,number_diagnoses,metformin,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
0,20377854,Female,[60-70),0.2,0.0,0.0,0.153846,MC,Nephrology,0.644444,...,0.333333,No,No,No,No,No,Steady,No,Yes,0
1,20408121,Female,[90-100),0.0,0.0,0.285714,0.230769,MC,Emergency/Trauma,0.611111,...,0.333333,No,No,No,No,No,No,No,Yes,0
2,20542797,Male,[70-80),0.0,0.071429,0.285714,0.692308,MC,InternalMedicine,0.744444,...,0.333333,Steady,No,No,No,No,Steady,Ch,Yes,0
3,7239654,Female,[70-80),0.0,0.142857,0.238095,0.846154,UN,InternalMedicine,0.844444,...,0.266667,No,No,No,No,No,Steady,No,Yes,0
4,15466212,Male,[70-80),0.0,0.142857,0.238095,0.846154,MC,InternalMedicine,0.655556,...,0.266667,No,No,No,No,No,No,No,No,0


#### Plot the distribution of unique classes of the target variable, i.e., readmitted.  

In [23]:
import plotly.express as px

# Count how many rows fall into each class of the target column 'readmitted'.
# Renaming by position works whatever column names reset_index() produces.
readmitted_counts = df['readmitted'].value_counts().reset_index()
readmitted_counts.columns = ['Readmission Status', 'Count']

# The df.head() preview shows 'readmitted' stored as numbers. Plotly Express
# treats a numeric `color` column as continuous, which replaces the per-class
# colours with a gradient and silently ignores `color_discrete_map`.
# Casting the class labels to strings makes every class a discrete category.
readmitted_counts['Readmission Status'] = readmitted_counts['Readmission Status'].astype(str)

# Colour per class label. The original string labels are kept so the map still
# applies if the column holds them; '0' / '1' cover the numeric encoding.
# NOTE(review): assumes the numeric encoding is 0/1 - confirm against the
# cleaning step. Labels missing from this map fall back to Plotly's defaults.
readmission_colors = {
    'NO': 'lightblue', '>30': 'lightgreen', '<30': 'lightcoral',
    '0': 'lightblue', '1': 'lightcoral',
}

# Create an interactive bar chart using Plotly
fig = px.bar(readmitted_counts, x='Readmission Status', y='Count', color='Readmission Status',
             title='Distribution of Unique Classes of "readmitted"',
             color_discrete_map=readmission_colors)

# Numeric-looking strings would otherwise be auto-detected as a linear axis.
fig.update_xaxes(type='category')

fig.show()


#### Plot the count of readmitted cases against age.

In [24]:
# Group the data by age and readmitted status and count the occurrences
grouped_df = df.groupby(['age', 'readmitted']).size().reset_index(name='count')

# The df.head() preview shows 'readmitted' stored as numbers. With a numeric
# `color` column px.bar draws a continuous colour scale and ignores
# `color_discrete_map`, so plot from a copy whose class labels are strings.
# `grouped_df` itself keeps its original dtypes.
plot_df = grouped_df.assign(readmitted=grouped_df['readmitted'].astype(str))

# Colour per class label. The original string labels are kept so the map still
# applies if the column holds them; '0' / '1' cover the numeric encoding.
# NOTE(review): assumes the numeric encoding is 0/1 - confirm against the
# cleaning step. Labels missing from this map fall back to Plotly's defaults.
readmission_colors = {
    'NO': 'lightblue', '>30': 'lightgreen', '<30': 'lightcoral',
    '0': 'lightblue', '1': 'lightcoral',
}

# Create an interactive bar plot using Plotly Express (one bar segment per class)
fig = px.bar(plot_df, x='age', y='count', color='readmitted',
             title='Count of Readmitted Cases against Age',
             labels={'count': 'Count', 'age': 'Age'},
             color_discrete_map=readmission_colors)

fig.update_layout(xaxis_title='Age', yaxis_title='Count')
fig.show()

#### Plot a graph that displays the count of target variable against the number of medications. 

In [25]:
# Group the data by 'num_medications' and 'readmitted', and count occurrences
grouped_df = df.groupby(['num_medications', 'readmitted']).size().reset_index(name='count')

# The df.head() preview shows 'readmitted' stored as numbers. With a numeric
# `color` column px.bar draws a continuous colour scale and ignores
# `color_discrete_map`, so plot from a copy whose class labels are strings.
# `grouped_df` itself keeps its original dtypes.
plot_df = grouped_df.assign(readmitted=grouped_df['readmitted'].astype(str))

# Colour per class label. The original string labels are kept so the map still
# applies if the column holds them; '0' / '1' cover the numeric encoding.
# NOTE(review): assumes the numeric encoding is 0/1 - confirm against the
# cleaning step. Labels missing from this map fall back to Plotly's defaults.
readmission_colors = {
    'NO': 'lightblue', '>30': 'lightgreen', '<30': 'lightcoral',
    '0': 'lightblue', '1': 'lightcoral',
}

# Create an interactive bar plot using Plotly Express (one bar segment per class)
fig = px.bar(plot_df, x='num_medications', y='count', color='readmitted',
             title='Count of Readmitted Cases against Number of Medications',
             labels={'count': 'Count', 'num_medications': 'Number of Medications'},
             color_discrete_map=readmission_colors)

fig.update_layout(xaxis_title='Number of Medications', yaxis_title='Count')
fig.show()

#### Show the scatter matrix plot and the correlation matrix. This will be a very large matrix, so you might find it difficult to analyse. Which pairs of features are highly correlated?

In [26]:
# Select every numeric column (any int/float width) for the correlation matrix.
# NOTE(review): the df.head() preview shows identifier-like columns
# ('Unnamed: 0', 'patient_nbr') that are numeric and therefore included here;
# consider excluding them if they carry no predictive meaning.
numeric_columns = df.select_dtypes(include='number')

# Calculate the pairwise (Pearson, the pandas default) correlation matrix
correlation_matrix = numeric_columns.corr()

# Plot the correlation matrix using Plotly Express.
# px.imshow labels image columns with `x` and image rows with `y`, so `x` takes
# the frame's columns and `y` its index. Correlations are bounded by [-1, 1];
# fixing zmin/zmax with a diverging scale keeps 0 at the neutral colour.
fig = px.imshow(correlation_matrix,
                labels=dict(x="Features", y="Features", color="Correlation"),
                x=correlation_matrix.columns,
                y=correlation_matrix.index,
                zmin=-1, zmax=1,
                color_continuous_scale='RdBu_r',
                title="Correlation Matrix")
fig.show()

# The heatmap is hard to read at this size, so also rank the feature pairs.
# Keep only the strict upper triangle so each pair appears once and the
# self-correlations on the diagonal are excluded.
upper_triangle_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
correlated_pairs = (
    correlation_matrix.where(upper_triangle_mask)
    .stack()
    .dropna()
    .sort_values(key=lambda values: values.abs(), ascending=False)
)

# Ten most strongly correlated pairs (by absolute value)
correlated_pairs.head(10)

#### Additional Plots

#### 1) Distribution of Age

In [27]:
# 'age' holds string bins such as '[60-70)'. Plotly orders categories by first
# appearance in the data, which scrambles the age axis, so pass an explicit
# order. A plain string sort is correct for the decade labels shown in the
# preview ('[0-10)' ... '[90-100)').
age_order = sorted(df['age'].dropna().unique())

fig = px.histogram(df, x='age', title='Distribution of Age', nbins=20,
                   category_orders={'age': age_order})
fig.update_layout(xaxis_title='Age', yaxis_title='Count')
fig.show()

#### 2) Number of Medications by Age

In [28]:
# 'age' holds string bins such as '[60-70)'. Plotly orders categories by first
# appearance in the data, which scrambles the age axis, so pass an explicit
# order. A plain string sort is correct for the decade labels shown in the
# preview ('[0-10)' ... '[90-100)').
age_order = sorted(df['age'].dropna().unique())

# One box per age bin showing the spread of 'num_medications'
fig = px.box(df, x='age', y='num_medications', title='Number of Medications by Age',
             category_orders={'age': age_order})
fig.update_layout(xaxis_title='Age', yaxis_title='Number of Medications')
fig.show()

#### 3) Readmission Status by Age and Gender

In [29]:
# 'age' holds string bins such as '[60-70)'. Plotly orders categories by first
# appearance in the data, which scrambles the age axis, so pass an explicit
# order. A plain string sort is correct for the decade labels shown in the
# preview ('[0-10)' ... '[90-100)').
age_order = sorted(df['age'].dropna().unique())

# One facet per gender, with bars grouped by readmission class within each age bin
fig = px.histogram(df, x='age', color='readmitted', facet_col='gender', barmode='group',
                   title='Readmission Status by Age and Gender',
                   category_orders={'age': age_order})
fig.update_layout(xaxis_title='Age', yaxis_title='Count', legend_title='Readmission Status')

# `xaxis_title` above only reaches the first facet; title every facet's x axis
# so the remaining panels do not keep the raw column name.
fig.update_xaxes(title_text='Age')
fig.show()