In [59]:
import pandas as pd
import zipfile
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score

In [2]:
zf = zipfile.ZipFile("data\playground-series-s3e22.zip")
train_df = pd.read_csv(zf.open("train.csv"))
test_df = pd.read_csv(zf.open("test.csv"))


In [3]:
train_df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
id,0,1,2,3,4
surgery,yes,yes,yes,yes,no
age,adult,adult,adult,adult,adult
hospital_number,530001,533836,529812,5262541,5299629
rectal_temp,38.1,37.5,38.3,37.1,38.0
pulse,132.0,88.0,120.0,72.0,52.0
respiratory_rate,24.0,12.0,28.0,30.0,48.0
temp_of_extremities,cool,cool,cool,cold,normal
peripheral_pulse,reduced,normal,reduced,reduced,normal
mucous_membrane,dark_cyanotic,pale_cyanotic,pale_pink,pale_pink,normal_pink


In [15]:

dummy_train_df = pd.get_dummies(train_df, drop_first= False, dtype = float)

In [16]:
dummy_train_df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
id,0.0,1.0,2.0,3.0,4.0
hospital_number,530001.0,533836.0,529812.0,5262541.0,5299629.0
rectal_temp,38.1,37.5,38.3,37.1,38.0
pulse,132.0,88.0,120.0,72.0,52.0
respiratory_rate,24.0,12.0,28.0,30.0,48.0
...,...,...,...,...,...
cp_data_no,1.0,1.0,1.0,0.0,0.0
cp_data_yes,0.0,0.0,0.0,1.0,1.0
outcome_died,1.0,0.0,0.0,0.0,0.0
outcome_euthanized,0.0,1.0,0.0,0.0,0.0


In [48]:
#Features to be used in training model
x_feat = dummy_train_df.drop(["id" , "outcome_died","outcome_lived","outcome_euthanized", "hospital_number", "nasogastric_reflux_slight",
"pain_slight",
"peristalsis_distend_small",
"rectal_exam_feces_serosanguious" ], axis = 1)

#label for training
y_label = dummy_train_df[["outcome_died", "outcome_euthanized", "outcome_lived"]]

In [49]:
x_train, x_test, y_train, y_test = train_test_split(x_feat,y_label,test_size = 0.1, random_state = 250)

rf_model = RandomForestClassifier(random_state = 250)
rf_model.fit(x_train, y_train)
y_pred = rf_model.predict(x_test)

print(f1_score(y_test,y_pred, average='micro'))
print(mean_squared_error(y_test, y_pred,squared=False))
print(rf_model.score(x_train,y_train))

0.7586206896551724
0.3868188740122162
1.0


In [52]:
dummy_test_df = pd.get_dummies(test_df, drop_first= False, dtype = float)
predict_test_df = dummy_test_df.drop(["id", "hospital_number","pain_moderate" ], axis = 1)

prediction = rf_model.predict(predict_test_df)
prediction

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [53]:
# Create a function to map the values to outcomes
def map_to_outcome(row):
    if row[0] == 1:
        return "died"
    elif row[1] == 1:
        return "euthanized"
    elif row[2] == 1:
        return "lived"
    else:
        return None  # Handle cases where no outcome is selected

In [None]:
outcome_labels = np.apply_along_axis(map_to_outcome, axis=1, arr=prediction)
df = pd.DataFrame({'Outcome': outcome_labels})
df

In [69]:
#Create Submission File
sub_file = pd.DataFrame({"id":test_df.id, "Outcome":df.Outcome})


In [58]:
sub_file.to_csv('Submission.csv', index=False)

In [68]:
#Start of KNN

knn_model = KNeighborsClassifier(n_neighbors=3 )
x_train_np = x_train.to_numpy()
x_test_np = x_test.to_numpy()


knn_model.fit(x_train_np, y_train)
y_pred = knn_model.predict(x_test_np)

print(f1_score(y_test,y_pred, average='micro'))
print(mean_squared_error(y_test, y_pred,squared=False))
print(knn_model.score(x_train_np,y_train))

predict_test_np = predict_test_df.to_numpy()
prediction_knn = knn_model.predict(predict_test_np)

0.7058823529411764
0.43076901241822224
0.7470747074707471


In [70]:
outcome_labels_knn = np.apply_along_axis(map_to_outcome, axis=1, arr=prediction_knn)
df = pd.DataFrame({'Outcome': outcome_labels_knn})
df

Unnamed: 0,Outcome
0,lived
1,
2,lived
3,eutha
4,lived
...,...
819,died
820,eutha
821,lived
822,lived


In [71]:
#Create Submission File
sub_file = pd.DataFrame({"id":test_df.id, "Outcome":df.Outcome})


In [72]:
sub_file.to_csv('SubmissionKNN.csv', index=False)