In [43]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder

# Let wide frames render up to 100 columns before pandas truncates the view
pd.options.display.max_columns = 100

In [44]:
# Read the training split first, then the test split
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s3e22/train.csv',
        '/kaggle/input/playground-series-s3e22/test.csv',
    )
)

# Preview the top rows of the training frame
train_data.head()

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,more_3_sec,depressed,absent,slight,slight,less_1_liter,6.5,decreased,distend_small,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,more_3_sec,mild_pain,absent,moderate,none,more_1_liter,2.0,absent,distend_small,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,less_3_sec,extreme_pain,hypomotile,moderate,slight,none,3.5,,distend_large,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,more_3_sec,mild_pain,hypomotile,moderate,slight,more_1_liter,2.0,decreased,distend_small,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,less_3_sec,alert,hypomotile,none,slight,less_1_liter,7.0,normal,normal,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


In [45]:
def null_values(a):
    """Print a per-column summary of missing values in ``a``.

    For every column the report lists the number of null entries
    ('Null Values') and the percentage of rows they represent
    ('Null Values Percent'), ordered from the highest percentage down.

    Parameters
    ----------
    a : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The table is written to stdout with ``print``.
    """
    # Build the boolean mask once; both statistics are derived from it
    missing_mask = a.isnull()
    summary = pd.DataFrame({
        'Null Values': missing_mask.sum(),
        'Null Values Percent': missing_mask.mean() * 100,
    })
    print(summary.sort_values(by='Null Values Percent', ascending=False))
    
# Report missing values per column of the test frame
null_values(a=test_data)

                       Null Values  Null Values Percent
abdomen                        154            18.689320
rectal_exam_feces              125            15.169903
nasogastric_tube                64             7.766990
peripheral_pulse                47             5.703883
temp_of_extremities             35             4.247573
abdomo_appearance               31             3.762136
pain                            29             3.519417
abdominal_distention            22             2.669903
peristalsis                     19             2.305825
nasogastric_reflux              14             1.699029
mucous_membrane                 13             1.577670
capillary_refill_time            6             0.728155
lesion_2                         0             0.000000
lesion_1                         0             0.000000
surgical_lesion                  0             0.000000
abdomo_protein                   0             0.000000
packed_cell_volume               0             0

In [46]:
# Split the training frame into the label column and the remaining columns
y_train = train_data['outcome']
X_train = train_data.drop(columns=['outcome'])

In [47]:
# Identify categorical and numerical feature columns.
# Both lists are derived from X_train -- the frame the pipeline is fitted on --
# so they always describe the same set of columns (previously one list was read
# from test_data and the other from train_data, which still holds 'outcome').
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
# 'id' appears to be a row identifier (it is the key of the submission file),
# not a measurement, so it is left out of the feature lists. Columns that are
# listed in neither group are dropped by ColumnTransformer's default
# remainder='drop'.
numerical_cols = (
    X_train.select_dtypes(exclude=['object'])
    .columns.drop('id', errors='ignore')
    .tolist()
)

In [48]:
# Numerical columns: fill missing entries with the column mean
numerical_transformer = SimpleImputer(
    strategy='mean',
)

In [49]:
# Categorical columns: fill missing entries with the most frequent value,
# then one-hot encode; categories not seen during fit are ignored at transform
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

In [50]:
# Route each column group through its own transformer:
# 'num' handles numerical_cols, 'cat' handles categorical_cols
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
)

In [51]:
# Random forest classifier with 100 trees; the fixed seed keeps runs repeatable
model = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
)

In [52]:
# Chain the column preprocessing and the classifier into a single estimator
clf = make_pipeline(
    preprocessor,
    model,
)

In [53]:
# Fit the preprocessing steps and the classifier on the training data
clf.fit(X=X_train, y=y_train)

In [54]:
# Run the fitted pipeline on the test frame to obtain predicted outcomes
predictions = clf.predict(X=test_data)

In [55]:
# Pair every test-row id with its predicted outcome
output = pd.DataFrame(
    {
        'id': test_data['id'],
        'outcome': predictions,
    }
)

In [56]:
# Write the predictions to disk without the DataFrame index column
output.to_csv(path_or_buf="predicted_submission.csv", index=False)