# Q-2. 
```
A chemist had two chemical flasks labeled 0 and 1 which consist of two different chemicals. He extracted 3 features from these chemicals in order to distinguish between them, you provided the results derived by the chemicals and your task is to create a model that will label chemical 0 or 1 given its three features and built-in docker and use some library to display that in frontend.
Note : Use only pyspark
```

In [263]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [264]:
df = pd.read_csv("indian_liver_patient.csv")

In [265]:
df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [266]:
df.Dataset.nunique()

2

In [267]:
df.Dataset.value_counts()

1    416
2    167
Name: Dataset, dtype: int64

In [268]:
cat_cols = ['Gender']

In [269]:
num_cols = ['Age', 'Total_Bilirubin', 'Direct_Bilirubin', 'Alkaline_Phosphotase',
       'Alamine_Aminotransferase', 'Aspartate_Aminotransferase',
       'Total_Protiens', 'Albumin', 'Albumin_and_Globulin_Ratio']

In [270]:
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder())
])

num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
    ("cat", cat_pipeline, cat_cols),
    ("num", num_pipeline, num_cols)
])

In [271]:
from sklearn.cluster import KMeans
from kneed import KneeLocator
from sklearn.metrics import silhouette_score

In [272]:
X = df.drop(["Dataset"], axis=1)
y = df["Dataset"]

In [273]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

In [274]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [275]:
from collections import Counter
from imblearn.over_sampling import SMOTE


# summarize class distribution
print("Original Class Distribution:", Counter(y_train))

# define the oversample strategies
oversample = SMOTE(sampling_strategy=1, random_state=42)

# fit and apply the transform
X_o, y_o = oversample.fit_resample(X_train_processed, y_train)

print("Class Distribution:", Counter(y_o))


Original Class Distribution: Counter({1: 288, 2: 120})
Class Distribution: Counter({1: 288, 2: 288})


In [276]:
# Basic Import
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn import svm

In [277]:
models = {
    'rfc': RandomForestClassifier(random_state=42, n_estimators=101, criterion="entropy"),
    "et": ExtraTreesClassifier(random_state=42, n_estimators=101, criterion="entropy"),
    "gbc": GradientBoostingClassifier(random_state=42, n_estimators=101),
    "cat": CatBoostClassifier(random_state=42, verbose=0, learning_rate=0.01),
    "ada": AdaBoostClassifier(random_state=42, n_estimators=101),
    "light": LGBMClassifier(random_state=42, n_estimators=101),
}

macro_average_list = []
trained_models_list = []

# looping through dictionary, create model and evaluates it
for model in list(models.values()):
    model.fit(X_o, y_o)
    y_pred = model.predict(X_test_processed)
    report = classification_report(y_test, y_pred, output_dict=True)
    print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.79      0.72      0.75       128
           2       0.38      0.47      0.42        47

    accuracy                           0.65       175
   macro avg       0.58      0.59      0.59       175
weighted avg       0.68      0.65      0.66       175

              precision    recall  f1-score   support

           1       0.83      0.78      0.81       128
           2       0.49      0.57      0.53        47

    accuracy                           0.73       175
   macro avg       0.66      0.68      0.67       175
weighted avg       0.74      0.73      0.73       175

              precision    recall  f1-score   support

           1       0.80      0.69      0.74       128
           2       0.38      0.53      0.45        47

    accuracy                           0.65       175
   macro avg       0.59      0.61      0.59       175
weighted avg       0.69      0.65      0.66       175

              preci