In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier

In [None]:
# Load the competition's training and test tables from the Kaggle input directory.
train_csv_path = "/kaggle/input/playground-series-s3e26/train.csv"
test_csv_path = "/kaggle/input/playground-series-s3e26/test.csv"
train, test = (pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path))

In [None]:
# Preview the first five rows of the training frame (5 is the head() default).
train.head(n=5)

In [None]:
# Preview the first five rows of the test frame (5 is the head() default).
test.head(n=5)

In [None]:
# Remove the N_Days column from both frames so they keep the same feature columns.
# Rebinding the names (instead of inplace=True) leaves the frames in the same final state.
train = train.drop(columns=['N_Days'])
test = test.drop(columns=['N_Days'])

In [None]:
# Categorical feature columns that are integer-encoded below.
columns=['Sex','Ascites','Hepatomegaly','Spiders','Edema','Drug']

# The target gets its own encoder so that its fitted mapping
# (status_encoder.classes_) is still available after the feature loop;
# previously the single shared encoder was refitted on every feature and the
# Status mapping was lost. LabelEncoder assigns codes in sorted label order.
status_encoder=LabelEncoder()
train['Status']=status_encoder.fit_transform(train['Status'])

# `le` is refitted for each feature column, so after the loop it only holds
# the mapping of the last column in `columns`.
le=LabelEncoder()
for column_name in columns:
    # Fit on the training values, then apply the very same mapping to test.
    train[column_name]=le.fit_transform(train[column_name])
    # transform() raises ValueError if test contains a label unseen in train.
    test[column_name]=le.transform(test[column_name])

In [None]:
# Numeric columns to standardise (zero mean, unit variance).
columns_to_scale=[
    'Age','Cholesterol','Albumin','Copper','Alk_Phos',
    'SGOT','Tryglicerides','Platelets','Prothrombin','Stage',
]

# Learn the scaling statistics from the training frame only, then apply the
# same fitted scaler to both frames.
# NOTE(review): the scaler is fitted before the train/validation split below,
# so validation rows contribute to the statistics.
sc=StandardScaler().fit(train[columns_to_scale])
train[columns_to_scale]=sc.transform(train[columns_to_scale])
test[columns_to_scale]=sc.transform(test[columns_to_scale])

In [None]:
# Count missing values per column in the training frame (isna is the same check as isnull).
train.isna().sum()

In [None]:
# Count missing values per column in the test frame (isna is the same check as isnull).
test.isna().sum()

In [None]:
# Yeo-Johnson power transform of the numeric columns; standardize=True is the
# library default and is spelled out here only for readability.
power_transformer = PowerTransformer(method='yeo-johnson', standardize=True)

# Fit on the training frame, then reuse the fitted transformer for test.
train_power_values = power_transformer.fit_transform(train[columns_to_scale])
test_power_values = power_transformer.transform(test[columns_to_scale])

train[columns_to_scale] = train_power_values
test[columns_to_scale] = test_power_values

In [None]:
# Separate the feature matrix from the encoded target.
# NOTE(review): only 'Status' is removed here, so any identifier column still
# present in `train` (e.g. 'id', presumably) stays in X as a feature - confirm
# this is intended.
X=train.drop(columns=['Status'])
y=train['Status']

In [None]:
# One histogram (with KDE overlay) per column of `train`, six panels per row.
num_cols = train.shape[1]
num_rows = -(-num_cols // 6)  # ceiling division: rows needed for 6 panels each

fig, axes = plt.subplots(num_rows, 6, figsize=(20, 5 * num_rows))
flat_axes = axes.flatten()

# Fill the first `num_cols` panels, one column per panel.
for column_position, ax in enumerate(flat_axes[:num_cols]):
    sns.histplot(train.iloc[:, column_position], kde=True, ax=ax)
    ax.set_title(train.columns[column_position])

# Drop the unused panels left over in the last row.
for unused_ax in flat_axes[num_cols:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of `train`.
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
correlation_matrix = train.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=corr_ax)
corr_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Pairwise scatter/histogram grid over every feature column in X.
# sns.pairplot builds a whole grid of axes, so plt.title() would only label
# the most recently created subplot. Put the title on the figure instead;
# y slightly above 1 keeps it clear of the top row of panels.
pair_grid = sns.pairplot(X)
pair_grid.figure.suptitle('Pairplot', y=1.02)
plt.show()

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random forest tuned with an exhaustive grid search.
# The forest is seeded (same seed as the train/validation split) so that the
# selected hyperparameters, the validation metrics and the submitted
# probabilities are reproducible across Restart & Run All.
rf_classifier = RandomForestClassifier(random_state=42)

# 3 x 3 x 3 x 3 = 81 candidate settings, each evaluated with 3-fold CV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Candidates are ranked by negative log loss (higher is better); n_jobs=-1
# uses every available core.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=3,
    scoring='neg_log_loss',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)

# Estimator carrying the best-scoring hyperparameters found by the search.
best_rf_model = grid_search.best_estimator_

In [None]:
# Class predictions of the tuned forest on the held-out validation rows.
y_pred = best_rf_model.predict(X=X_test)

In [None]:
# Score the validation predictions: overall accuracy plus the per-class report.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
cr = classification_report(y_true=y_test, y_pred=y_pred)

print("Accuracy:",acc)
print("Classification report:\n",cr)

In [None]:
# Predictions of the tuned forest on the unlabeled test frame:
# per-class probabilities first, then the hard class labels.
prob2 = best_rf_model.predict_proba(test)
y_test_pred = best_rf_model.predict(test)

In [None]:
# Horizontal bar chart of the tuned forest's feature importances, one bar per
# column of X.
importances = best_rf_model.feature_importances_

importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances, y=X.columns, ax=importance_ax)
importance_ax.set_title('Feature Importance')
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Feature')
plt.show()

In [None]:
# Confusion matrix of the validation predictions, drawn as an annotated heatmap
# (rows are true labels, columns are predicted labels).
conf_mat = confusion_matrix(y_test, y_pred)

confusion_fig, confusion_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, cmap='Blues', fmt='g', ax=confusion_ax)
confusion_ax.set_title('Confusion Matrix')
confusion_ax.set_xlabel('Predicted Label')
confusion_ax.set_ylabel('True Label')
plt.show()

In [None]:
# Distribution of the predicted test-set probabilities; prob2 has one column
# per class, so each class is drawn as its own series.
probability_fig, probability_ax = plt.subplots(figsize=(12, 6))
sns.histplot(prob2, kde=True, bins=30, ax=probability_ax)
probability_ax.set_title('Probability Distribution of Predictions')
probability_ax.set_xlabel('Probability')
probability_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Baseline XGBoost classifier with library-default hyperparameters, trained on
# the same training split as the random forest.
xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, y_train)

# Validation accuracy on the held-out split.
y_pred_xgb = xgb_classifier.predict(X_test)
acc_xgb = accuracy_score(y_true=y_test, y_pred=y_pred_xgb)
print("XGBoost Accuracy:", acc_xgb)

# Predictions for the unlabeled test frame: probabilities and hard labels.
prob_xgb = xgb_classifier.predict_proba(test)
y_test_pred_xgb = xgb_classifier.predict(test)

In [None]:
# Build the submission file from the random forest's test-set probabilities.
#
# predict_proba returns its columns in the order of the model's classes_,
# i.e. ascending encoded label. The target was encoded with LabelEncoder,
# which numbers labels in sorted order, so for the statuses C, CL and D the
# codes are C=0, CL=1, D=2. The probability columns must therefore be named
# C, CL, D in that order; naming them C, D, CL swaps the CL and D
# probabilities in the submitted file.
# Assumes Status takes exactly the values C, CL and D, as the submission
# column names imply.
submission=pd.DataFrame(prob2,columns=['Status_C','Status_CL','Status_D'])

# Attach the row identifiers from the test frame and put them first.
submission['id']=test['id']
submission=submission[['id','Status_C','Status_CL','Status_D']]
submission.to_csv('submission.csv',index=False)