In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [6]:
filePath = 'C2T1_Train.csv'
 
rawDF=pd.read_csv(filePath)

rawDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90766 entries, 0 to 90765
Data columns (total 50 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   encounter_id2             90766 non-null  int64 
 1   patient_nbr2              90766 non-null  int64 
 2   race                      90766 non-null  object
 3   gender                    90766 non-null  object
 4   age                       90766 non-null  object
 5   weight                    90766 non-null  object
 6   admission_type_id         90766 non-null  int64 
 7   discharge_disposition_id  90766 non-null  int64 
 8   admission_source_id       90766 non-null  int64 
 9   time_in_hospital          90766 non-null  int64 
 10  payer_code                90766 non-null  object
 11  medical_specialty         90766 non-null  object
 12  num_lab_procedures        90766 non-null  int64 
 13  num_procedures            90766 non-null  int64 
 14  num_medications       

Weight Column has almost all '?' values renderring it useless for the model to train on.

In [7]:
rawDF.drop('weight', axis=1, inplace=True)

In [8]:
rawDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90766 entries, 0 to 90765
Data columns (total 49 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   encounter_id2             90766 non-null  int64 
 1   patient_nbr2              90766 non-null  int64 
 2   race                      90766 non-null  object
 3   gender                    90766 non-null  object
 4   age                       90766 non-null  object
 5   admission_type_id         90766 non-null  int64 
 6   discharge_disposition_id  90766 non-null  int64 
 7   admission_source_id       90766 non-null  int64 
 8   time_in_hospital          90766 non-null  int64 
 9   payer_code                90766 non-null  object
 10  medical_specialty         90766 non-null  object
 11  num_lab_procedures        90766 non-null  int64 
 12  num_procedures            90766 non-null  int64 
 13  num_medications           90766 non-null  int64 
 14  number_outpatient     

In [9]:
rawDF.isnull().sum()

encounter_id2                   0
patient_nbr2                    0
race                            0
gender                          0
age                             0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               85581
A1Cresult                   75927
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide     

Drop max_glu_serum & A1Cresult as it is mostly NuLL values.

In [14]:
rawDF.drop('max_glu_serum', axis=1, inplace=True)


KeyError: "['max_glu_serum'] not found in axis"

In [15]:
rawDF.drop('A1Cresult', axis=1, inplace=True)

In [16]:
rawDF.describe(include='all')

Unnamed: 0,encounter_id2,patient_nbr2,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
count,90766.0,90766.0,90766,90766,90766,90766.0,90766.0,90766.0,90766.0,90766,...,90766,90766,90766,90766,90766,90766,90766,90766,90766,90766
unique,,,6,3,10,,,,,18,...,1,4,4,2,2,2,2,2,2,3
top,,,Caucasian,Female,[70-80),,,,,?,...,No,No,No,No,No,No,No,No,Yes,NO
freq,,,67515,48748,23309,,,,,38730,...,90766,42793,90199,90758,90765,90764,90765,49655,69573,49361
mean,162824400.0,52918680.0,,,,2.025406,3.775092,5.847267,4.408942,,...,,,,,,,,,,
std,108379200.0,39544960.0,,,,1.454453,5.359248,4.147569,2.995873,,...,,,,,,,,,,
min,5283.0,5.0,,,,1.0,1.0,1.0,1.0,,...,,,,,,,,,,
25%,77940630.0,22206770.0,,,,1.0,1.0,1.0,2.0,,...,,,,,,,,,,
50%,140253400.0,43613970.0,,,,1.0,1.0,7.0,4.0,,...,,,,,,,,,,
75%,242196800.0,86811600.0,,,,3.0,4.0,7.0,6.0,,...,,,,,,,,,,


In [6]:

df = pd.read_csv('C2T1_Train.csv')

# Replace '?' with NaN
df = df.replace('?', pd.NA)

# Define columns again, ensuring we're working with the correct data types
num_columns = [col for col in df.columns if df[col].dtype in ['int64', 'float64'] and col not in ['encounter_id2', 'patient_nbr2']]
cat_columns = [col for col in df.columns if df[col].dtype == 'object']

# Impute missing values
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent', add_indicator=True)

df[num_columns] = num_imputer.fit_transform(df[num_columns])
# The categorical imputation needs to be handled differently to avoid the error.
# Manually impute missing values for categorical columns
for col in cat_columns:
    # If a column is categorical, fill missing values with the mode (most frequent value)
    if df[col].dtype == 'object':
        df[col] = df[col].fillna(df[col].mode()[0])

# Proceed with OneHotEncoder for categorical variables
onehot_encoder = OneHotEncoder(sparse=False, drop='first', handle_unknown='ignore')
encoded_features = onehot_encoder.fit_transform(df[cat_columns])
encoded_features_df = pd.DataFrame(encoded_features, columns=onehot_encoder.get_feature_names_out(cat_columns))
encoded_features_df.index = df.index  # Ensure alignment of indices

# Remove original categorical columns and merge the encoded features
df = df.drop(cat_columns, axis=1)
df = pd.concat([df, encoded_features_df], axis=1)

# Standardize numerical features (excluding identifiers)
scaler = StandardScaler()
df[num_columns] = scaler.fit_transform(df[num_columns])

# Attempt to display the first few rows of the corrected DataFrame
df.head()

df.to_csv('cleandata.csv', index=False)



In [8]:
df = pd.read_csv('cleandata.csv')

df.head()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90766 entries, 0 to 90765
Columns: 2377 entries, encounter_id2 to readmitted_NO
dtypes: float64(2375), int64(2)
memory usage: 1.6 GB


In [10]:
df = pd.read_csv('cleandata.csv')

# Remove columns where all values are 0
df = df.loc[:, (df != 0).any(axis=0)]

# Optionally, save the cleaned DataFrame back to a new CSV file
df.to_csv('Cleaned_Trainfixed.csv', index=False)