<a href="https://colab.research.google.com/github/aliabdelmonam/Poisonous_Mushrooms/blob/main/Poisonous_MushroomsV3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'playground-series-s4e8:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F76727%2F9045607%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240829%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240829T191921Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2423d255dbec57c72656c629bead59e30a3713a2dde15dcadc64c3a2ef18af05e6a04c191ee4163c93a97e8067138b62ae3b48a005bbad873a0375b1f769ad620216e052531491934684cac1804b6a7ab296ea7461e6a52ebb71dd8086d607b463679db674414c9a8fb7af1f623bd2b5fbcee396307216c2f8e25755f0cc02d2058627459d0b32711e7d610862d5f5ef935e34af474c0e84faa22dd207c353f116631608f0aa5ba66d74d4d5d24b9479d2554028a885f4fc52259fd625034faa5857bca0159d55a87705801c168b23f7a008480cf82101c2aaaefa5687dd2efbf07be4fb63b784e5fc82f65eba1b106c8b065507dec534c43500de19d1596bc9'

# Directories that mirror Kaggle's filesystem layout inside this runtime.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): KAGGLE_SYMLINK is not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

# Start from a clean input directory: try to unmount a previous mount (errors
# are discarded via `2> /dev/null`), delete any leftovers, then (re)create the
# input and working directories.
!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the Kaggle directories as ../input and ../working so relative paths
# used by Kaggle notebooks keep resolving. A link that already exists is left
# untouched.
for link_target, link_name in (
    (KAGGLE_INPUT_PATH, 'input'),
    (KAGGLE_WORKING_PATH, 'working'),
):
  try:
    os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
  except FileExistsError:
    pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so the URL part can never be cut short.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            content_length = fileres.headers['content-length']
            # The header can be missing; without a total the progress bar
            # simply stays empty instead of crashing on int(None).
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            # Stream the body in CHUNK_SIZE pieces until read() returns b''.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` would
                # shadow the module and break the next non-zip data source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


### **Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder,LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier,IsolationForest
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import copy



## Data

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
# df=pd.read_csv('/content/sample_submission.csv')
tr=pd.read_csv("/kaggle/input/playground-series-s4e8/train.csv")


In [None]:
tr.shape

In [None]:
tr.head()

In [None]:
tr.info()

In [None]:
df_train = copy.deepcopy(tr)

# **Category**

In [None]:
def subclass_details (df):
  """Print a short summary of every 'category'-dtype column of ``df``.

  For each such column this prints the number of distinct values, the ten
  most frequent values with their counts, and a separator line. Nothing is
  returned.
  """
  categorical_part = df.select_dtypes(include='category')
  for name in categorical_part.columns:
    column = categorical_part[name]
    print('Number of unique classes:', column.nunique())
    top_values = column.value_counts().head(10)
    print(top_values)
    print('############################')

In [None]:
def remove_att(df,threshold=200):
  """Replace rare values of every object-dtype column with the label 'UNK'.

  A value is rare when it occurs fewer than ``threshold`` times in its
  column. Missing values are never counted and are left as they are.
  ``df`` is modified in place and the same object is returned.
  """
  for name in df.select_dtypes(include='object').columns:
    frequencies = df[name].value_counts()
    rare_values = list(frequencies.index[frequencies < threshold])
    is_rare = df[name].isin(rare_values)
    df.loc[is_rare, name] = 'UNK'
  return df

In [None]:
def convert_cate (df):
  """Cast every object-dtype column of ``df`` to the pandas 'category' dtype.

  The conversion happens in place; the same DataFrame is returned so the
  call can be used in an assignment.
  """
  object_columns = df.select_dtypes(include='object').columns
  for name in object_columns:
    as_category = df[name].astype('category')
    df[name] = as_category
  return df

In [None]:
# Order matters: remove_att only touches object-dtype columns, so rare values
# are collapsed to 'UNK' first; convert_cate then casts those columns to
# 'category', which is the dtype subclass_details selects for its report.
df_train = remove_att(df_train)
df_train = convert_cate(df_train)
subclass_details(df_train)

In [None]:
df_train.info()

# **Missing Value**

In [None]:
def plot_missing_feature(df):
  """Show a bar chart of the percentage of missing values per column.

  Columns are ordered from the highest to the lowest missing percentage.
  """
  missing_pct = df.isna().sum() * 100 / df.shape[0]
  missing_pct = missing_pct.sort_values(ascending=False)
  sns.barplot(x=missing_pct.index, y=missing_pct.values, palette='plasma')
  plt.title('Missing Values')
  plt.xlabel('Feature')
  plt.ylabel('Percent(%)')
  # Column names are long; rotate them so they do not overlap.
  plt.xticks(rotation=90)
  plt.show()

In [None]:
def missing_feature (df):
  """Return the percentage of missing values per column, sorted descending.

  The result is a Series indexed by column name.
  """
  row_count = df.shape[0]
  missing_pct = df.isna().sum() * 100 / row_count
  return missing_pct.sort_values(ascending=False)

In [None]:
null_df_train = missing_feature(df_train)
null_df_train

In [None]:
plot_missing_feature(df_train)

In [None]:
def columns_drop(df,threshold=4):
  """Return the names of the columns of ``df`` with too many missing values.

  Parameters
  ----------
  df : DataFrame to inspect.
  threshold : percentage (0-100); a column is selected when its share of
      missing values is strictly greater than this. Defaults to 4, the
      value that used to be hard-coded.

  Returns
  -------
  list of column names, ordered from the highest missing percentage down
  (the order produced by ``missing_feature``).
  """
  null_df=missing_feature(df)
  return [col for col,val in null_df.items() if val>threshold]

In [None]:
column_drop_train = columns_drop(df_train)
print('-----------------------------------------')
print(column_drop_train)

In [None]:
# Remove, in place, the columns flagged by columns_drop and then the 'id' column.
df_train.drop(column_drop_train,axis=1,inplace=True)
df_train.drop('id',axis=1,inplace=True)

In [None]:
df_train.isna().sum()

In [None]:
df_train.plot(kind='hist',subplots=True,sharex=True,figsize=(15,15),bins=100)

# **Splitting Data**

In [None]:
# Features: every column except the target.
x=df_train.drop('class',axis=1)
# Binary target: 0 where class is 'e', 1 for every other label. np.where does
# this in one vectorised pass instead of a Python-level loop over every row.
y=np.where(df_train['class']=='e',0,1)
# y is deliberately kept 1-D. The former `y.reshape(-1,1)` statement discarded
# its result, so it never changed y and has been removed.

In [None]:
# Stratified 70/30 split. random_state is pinned (42, like the other stochastic
# steps in this notebook) so reruns produce the same split and comparable scores.
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=.7,stratify=y,random_state=42)

# **Pipeline**

In [None]:
num_data_train_columns = x.select_dtypes(include='number').columns
cat_data_train_columns = x.select_dtypes(include='category').columns
cat_data_train_columns

In [None]:
# Numeric columns: fill missing values with the column median, then standardise.
num_pipe=Pipeline (steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])
# Categorical columns: fill missing values with the most frequent value, then
# map each category to an integer code.
cat_pipe=Pipeline (steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    # A category that was not present at fit time is encoded as -1 instead of
    # raising at transform time. The test set is bucketed independently by
    # remove_att, so it can contain values the training data never showed.
    ('encoder',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1))
])

In [None]:
# Route the numeric columns through num_pipe and the categorical columns
# through cat_pipe; columns are selected by the name lists computed from x.
df_preprocessing=ColumnTransformer(
     transformers=[
        ('num', num_pipe, num_data_train_columns),
        ('cat', cat_pipe, cat_data_train_columns)
    ]
)

In [None]:
# Preprocessing-only pipeline. The PCA and MCA steps below are disabled.
# NOTE(review): the MCA line refers to `prince`, which is not imported in the
# visible code — it would need an import before being re-enabled.
final_pipe = Pipeline(steps=[
    ('preprocessor',df_preprocessing)
    # ('PCA',PCA(n_components=.95)),
#     ('MCA',prince.MCA( n_components=2,  n_iter=3,       check_input=True, engine='auto',   random_state=42))

])

In [None]:
x_train=final_pipe.fit_transform(x_train)
x_test=final_pipe.transform(x_test)

In [None]:
# Fit an IsolationForest on the transformed training features and keep only
# the rows it does not flag, applying the same mask to the labels so x_train
# and y_train stay aligned. Only the training split is filtered here.
isolation_forest = IsolationForest(contamination=0.024, random_state=42)
x_train_labels = isolation_forest.fit_predict(x_train) # 1 = normal row, -1 = outlier (anomaly)
normal_label_boolean = x_train_labels !=-1
x_train = x_train[normal_label_boolean]
y_train = y_train[normal_label_boolean]

In [None]:
1-len(x_train)/len(x_train_labels)

# **Modeling**

In [None]:
def Bayesian_Optimization (model,search_space):
  """Wrap ``model`` in an unfitted BayesSearchCV over ``search_space``.

  The search is configured for 20 iterations, accuracy scoring, all
  available cores (n_jobs=-1) and a fixed random_state of 42. The caller
  is responsible for calling ``fit`` on the returned object.
  """
  return BayesSearchCV(
      model,
      search_space,
      n_iter=20,
      n_jobs=-1,
      scoring='accuracy',
      random_state=42,
  )

In [None]:
# Hyper-parameter ranges handed to Bayesian_Optimization for the XGBoost model.
# learning_rate is sampled on a log-uniform scale; the other Real ranges are
# sampled uniformly.
# NOTE(review): 'alpha' is presumably XGBoost's L1 regularisation term — confirm.
xgb_space = {
    'n_estimators': Integer(50, 150),
    'max_depth': Integer(2, 8),
    'learning_rate': Real(0.01, .4, 'log-uniform'),
    'subsample': Real(0.5, 1.0, 'uniform'),
    'colsample_bytree': Real(0.5, 1.0, 'uniform'),
    'alpha':Real(.1,.5,'uniform'),
    'min_child_weight':Integer(5,10)
}

In [None]:
xgb = XGBClassifier(random_state=42,use_label_encoder=False)
xgb_optimzied = Bayesian_Optimization(xgb,xgb_space)

In [None]:
xgb_optimzied.fit(x_train,y_train)

In [None]:
xgb_optimzied.best_params_

In [None]:
xgb=xgb_optimzied.best_estimator_

In [None]:
y_pred_xgb=xgb.predict(x_test)
print('XGB --> ',accuracy_score(y_test,y_pred_xgb))

In [None]:

print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb))

> ***Submission***

In [None]:
ts = pd.read_csv('/kaggle/input/playground-series-s4e8/test.csv')

In [None]:
df_test = copy.deepcopy(ts)
df_test.drop('id',axis=1,inplace=True)

In [None]:
# Drop exactly the columns that were dropped from the training data. The
# fitted pipeline selects its inputs by column name, so recomputing the list
# from the test set's own missing-value rates could remove a column the
# pipeline needs and make transform fail.
ts_col=list(column_drop_train)

In [None]:
# NOTE(review): remove_att recomputes value frequencies on the test set, so
# the values replaced by 'UNK' here can differ from those replaced in the
# training data — confirm this is intended.
df_test = remove_att(df_test)
# Remove the columns listed in ts_col so the frame can go through final_pipe.
df_test.drop(ts_col,axis=1,inplace=True)

In [None]:
df_test=final_pipe.transform(df_test)

In [None]:
final_pred = xgb.predict(df_test)

In [None]:
final_pred_trans=['e' if i==0 else 'p' for i in final_pred]

In [None]:
submession= pd.DataFrame({'id':ts['id'].values,
                          'class':final_pred_trans
                         }
                        )


In [None]:
submession.to_csv('submession.csv',index=False)