<a href="https://www.kaggle.com/code/benzilla987/ps-s3-ep22-eda-modeling-submission?scriptVersionId=144876409" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score

# Importing Data
import os

# Listing every file that Kaggle mounted under the input directory
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Loading the competition tables (train, test and the sample submission)
data_dir = '/kaggle/input/playground-series-s3e22'
train = pd.read_csv(f'{data_dir}/train.csv')
test = pd.read_csv(f'{data_dir}/test.csv')
submission = pd.read_csv(f'{data_dir}/sample_submission.csv')

# Last expression of the cell: shows the first rows of the test table
test.head()

# Initial Exploratory Analysis

In [None]:
# Plotting the distribution of every column of the training data on one grid:
# count plots for the categorical columns, histograms (with KDE) for the
# numeric ones. Nothing is modified here -- this cell only visualises `train`.

# Numeric columns are detected from their dtype
numeric_columns = train.select_dtypes(include=['int64', 'float64']).columns

# List of categorical columns to plot
categorical_columns = [
    'temp_of_extremities',
    'peripheral_pulse',
    'mucous_membrane',
    'capillary_refill_time',
    'pain',
    'peristalsis',
    'abdominal_distention',
    'nasogastric_tube',
    'nasogastric_reflux',
    'rectal_exam_feces',
    'abdomen',
    'abdomo_appearance',
    'surgery',
    'surgical_lesion',
    'cp_data',
    'outcome',
    'age'
]

# Setting a custom color palette for Seaborn
custom_palette = sns.color_palette("Set2")

# Total number of plots: exactly one per plotted column, so that every unused
# panel of the grid is removed below (no stray blank axis is left behind)
num_plots = len(categorical_columns) + len(numeric_columns)

# Side of the smallest square grid that can hold all the plots
grid_size = max(1, int(np.ceil(np.sqrt(num_plots))))

# Creating a square grid of subplots; squeeze=False keeps `axes` two-dimensional
# even for a 1x1 grid, so the [row, col] indexing below always works
fig, axes = plt.subplots(grid_size, grid_size, figsize=(15, 15), squeeze=False)

# Looping through the categorical columns (grid filled row by row)
for i, column in enumerate(categorical_columns):
    ax = axes[divmod(i, grid_size)]
    sns.countplot(data=train, x=column, palette=custom_palette, ax=ax)
    ax.set_title(f'Distribution of {column}', fontsize=10)
    ax.set_xlabel(column, fontsize=8)
    ax.set_ylabel('Count', fontsize=8)
    ax.tick_params(axis='x', labelrotation=45, labelsize=6)
    ax.tick_params(axis='y', labelsize=6)

# Looping through the numeric columns, continuing right after the last
# categorical panel
for i, column in enumerate(numeric_columns, start=len(categorical_columns)):
    ax = axes[divmod(i, grid_size)]
    sns.histplot(data=train, x=column, kde=True, color=custom_palette[2], ax=ax)
    ax.set_title(f'Distribution of {column}', fontsize=10)
    ax.set_xlabel(column, fontsize=8)
    ax.set_ylabel('Frequency', fontsize=8)
    ax.tick_params(axis='x', labelsize=6)
    ax.tick_params(axis='y', labelsize=6)

# Removing the unused subplots at the end of the grid
for i in range(num_plots, grid_size ** 2):
    fig.delaxes(axes[divmod(i, grid_size)])

plt.tight_layout()
plt.show()

## Highest Score Yet
## Submission 17
## Score: 76.829

In [None]:
# Self-contained modeling cell for submission 17: reloads the data, one hot
# encodes the categorical features, fits an XGBoost classifier on the complete
# training rows and builds the (id, outcome) submission table.
import numpy as np  # needed for np.vectorize below
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier  # Import the Gradient Boosting Classifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

## Reading in the data
train = pd.read_csv('/kaggle/input/playground-series-s3e22/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s3e22/test.csv')

## lesion_2 & lesion_3 values are not useful, dropping these variables because most values are = 0.
train.drop(columns=['lesion_2', 'lesion_3'], inplace=True)
test.drop(columns=['lesion_2', 'lesion_3'], inplace=True)

# Dropping rows with NA values from train df (test rows are all kept)
train.dropna(inplace=True)

# Integer encoding of the target, and its inverse used to decode predictions
outcome_encoding = {'died': 0, 'euthanized': 1, 'lived': 2}
label_mapping = {code: label for label, code in outcome_encoding.items()}

# Defining X/Y df objects for train/test
X_train = train.drop('outcome', axis=1)
y_train = train['outcome'].map(outcome_encoding)
X_test = test

# Defining list of categorical columns
categorical_columns = [
    'temp_of_extremities',
    'peripheral_pulse',
    'mucous_membrane',
    'capillary_refill_time',
    'pain',
    'peristalsis',
    'abdominal_distention',
    'nasogastric_tube',
    'nasogastric_reflux',
    'rectal_exam_feces',
    'abdomen',
    'abdomo_appearance',
    'surgery',
    'surgical_lesion',
    'cp_data',
    'age'
]

# Separating numerical columns from categorical columns
# NOTE(review): built from train.columns, so this list also contains 'outcome';
# it is not used in this cell -- confirm whether a later cell relies on it.
numerical_columns = [col for col in train.columns if col not in categorical_columns]

# Combining data temporarily so train and test get the same dummy columns
combined_data = pd.concat([X_train, X_test], axis=0)

# One hot encoding categorical columns
combined_encoded = pd.get_dummies(combined_data, columns=categorical_columns)

# Splitting back up the training / testing data (train rows come first)
n_train_rows = len(X_train)
X_train_encoded = combined_encoded.iloc[:n_train_rows]
# .copy() makes the test part an independent frame, so adding the 'outcome'
# column below does not write into a slice of combined_encoded
X_test_encoded = combined_encoded.iloc[n_train_rows:].copy()

# Creating and training the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(n_estimators=300, random_state=42, learning_rate=0.1)
xgb_classifier.fit(X_train_encoded, y_train)

# Creating Predictions (integer class codes)
predictions = xgb_classifier.predict(X_test_encoded)

# Decoding the class codes back to the original outcome labels
predictions = np.vectorize(label_mapping.get)(predictions)

# Appending predictions back onto test df
X_test_encoded['outcome'] = predictions

# Building submission df
submission_17 = X_test_encoded[['id', 'outcome']]
