In [206]:
import pandas as pd
import numpy as np
import itertools

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (precision_score, recall_score, matthews_corrcoef , f1_score, classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay)
from sklearn.model_selection import (learning_curve, validation_curve, train_test_split, KFold, StratifiedKFold, 
                                    cross_val_score, GridSearchCV, RandomizedSearchCV, cross_validate, RepeatedStratifiedKFold)

from sklearn.linear_model import Perceptron, LogisticRegression, SGDClassifier, PassiveAggressiveClassifier, RidgeClassifier, RidgeClassifierCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

from scipy.stats import loguniform, beta, uniform

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline as IMBPipeline

import missingno as msno

import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

PIPELINE: a chain of several transformers, e.g. an imputer for NaN values followed by an encoder or a scaler.
SIMPLE IMPUTER: if a column has missing data, fill it with the most frequent value.

ORDINAL ENCODER: like one-hot encoding, but each category is mapped to an ordered integer - columns: gender (male - female), tartar (Y - N), oral (=> can be dropped)

MINMAX SCALER OR STANDARD SCALER: for numerical data. All numerical columns use the standard scaler, except age, which uses min-max scaling.


In [None]:
import os
from io import StringIO

import requests

# Credentials must never be hardcoded in a notebook: the token is read from the
# GITHUB_TOKEN environment variable instead (export it before starting the
# kernel if the repository needs authentication).
# NOTE: any token that was ever committed in plain text should be revoked.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")

file_name = "dataset_1.csv"
file_url = f"https://raw.githubusercontent.com/brusati04/smoking_ml_project/main/{file_name}"
# Send the Authorization header only when a token is actually available.
headers = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}

# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(file_url, headers=headers, timeout=30)

if response.status_code == 200:
    csv_dataset = StringIO(response.text)
    dataset = pd.read_csv(csv_dataset)
else:
    # Fail loudly: continuing would either raise a NameError on `dataset` or,
    # worse, silently reuse a stale `dataset` left over from a previous run.
    raise RuntimeError(f"Download failed (HTTP {response.status_code}): {file_url}")

dataset.info()

In [208]:
# if input("do you want to finish the project quickly? (y/n): ") == "y":
#     dataset.dropna(axis=1)
#     dataset.info()
#     print("well done, now 30L")
# else:
#     pass

In [None]:

# Check how balanced the target variable is (relative class frequencies).
sm = dataset["smoking"].value_counts(normalize=True)
print(f"Distribuzione della variabile target: {sm}")

# Report the number of missing values of every column.
missing_per_column = dataset.isna().sum()
for col, Nan in missing_per_column.items():
    print(f"missing values in {col}: {Nan}")


In [None]:
# Plot the missing-value matrix of the dataset with missingno to see where the NaNs are.
msno.matrix(dataset)

In [None]:
# Maximum number of NaN values tolerated in a single row.
n = 2
# Count the NaNs of every row, then tally the rows that exceed the threshold.
nan_per_row = dataset.isna().sum(axis=1)
num_rows = nan_per_row.gt(n).sum()
print(f"Number of rows with more than {n} NaN values: {num_rows}")

In [212]:
# Keep only the rows with at most two missing values: `thresh` is the minimum
# number of non-NaN entries a row needs in order to be kept.
min_non_nan = dataset.shape[1] - 2
dataset = dataset.dropna(axis=0, thresh=min_non_nan)

The columns not reported in the figure will be discarded.

For features age and fare the pipeline is composed by two transformers:

KNNImputer: both features contain missing values, so we have to apply an imputation strategy. In this case the strategy is based on the idea of $k$-nearest neighbors.
StandardScaler: both features are numerical
For features pclass:

An OrdinalEncoder transforms the strings '3','2' and '1' corresponding to the ticket classes into the numerical values 3,2 and 1.
For features sex and embarked, we apply:

SimpleImputer: feature embarked contains two missing values, while the column sex will be untouched. As a strategy we use 'most_frequent' since both features are categorical
OneHotEncoder: features are categorical.
For features sibsp and parch we define a custom transformer that builds a new feature is_alone, indicating whether the passenger travelled alone or not. More details about how to code custom transformers are given in the following optional section.

For feature name we define a further custom transformer to infer the title (Mr, Miss, Doc, Captain, etc.) from the full name.

ID: drop
gender: ordinal encoding
age: min-max scaling
oral: ordinal encoding
dental caries: none (left unchanged)
tartar: ordinal encoding
all other columns (the 20 body-signal features): standardization


In [213]:
def _impute_then_encode(categories):
    """Build a pipeline that fills NaNs with the mode, then ordinal-encodes.

    `categories` is the ordered list of labels handed to the OrdinalEncoder
    for the single column the pipeline is applied to.
    """
    return Pipeline(steps=[
        ("pipe_sim", SimpleImputer(strategy="most_frequent")),
        ("pipe_ord", OrdinalEncoder(categories=[categories])),
    ])


# age: min-max scaling only (no imputation step in front of it).
minmax_age = MinMaxScaler()

# Two-valued categorical columns, each with its own independent pipeline.
oe_oral = _impute_then_encode(["N", "Y"])
oe_tartar = _impute_then_encode(["N", "Y"])
oe_gender = _impute_then_encode(["F", "M"])

# Numerical body signals: most-frequent imputation followed by standardization.
std_body_signals = Pipeline(steps=[
    ("pipe_sim", SimpleImputer(strategy="most_frequent")),
    ("pipe_std", StandardScaler()),
])

In [None]:
# COLUMN TRANSFORMATION
# Numerical measurements that all go through the impute + standardize pipeline.
body_signals = [
    "height(cm)", "weight(kg)", "waist(cm)",
    "eyesight(left)", "eyesight(right)",
    "hearing(left)", "hearing(right)",
    "systolic", "relaxation", "fasting blood sugar",
    "Cholesterol", "triglyceride", "HDL", "LDL",
    "hemoglobin", "Urine protein", "serum creatinine",
    "AST", "ALT", "Gtp",
]

# (name, transformer, columns) triplets consumed by the ColumnTransformer.
column_steps = [
    ("id", "drop", ["ID"]),
    ("gender", oe_gender, ["gender"]),
    ("age", minmax_age, ["age"]),
    ("body_signals", std_body_signals, body_signals),
    ("oral", oe_oral, ["oral"]),
    ("tartar", oe_tartar, ["tartar"]),
]

smoking_tr = ColumnTransformer(
    transformers=column_steps,
    remainder="passthrough",  # columns not listed above are kept unchanged
    sparse_threshold=1,
)

In [215]:
# Separate the target column from the feature columns.
y = dataset["smoking"]
X = dataset.drop(columns=["smoking"])

# Shuffled, stratified split: half of the rows for training, half for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [None]:
# Share of positive (smoking == 1) samples in each split; the two values
# should be close because the split is stratified on the target.
train_positive_share = sum(y_train == 1) / len(y_train)
test_positive_share = sum(y_test == 1) / len(y_test)
train_positive_share, test_positive_share

In [217]:
# End-to-end pipeline: column preprocessing -> SMOTE oversampling ->
# PCA -> Perceptron classifier.
model_pipeline = IMBPipeline([
    ('trans', smoking_tr),
    # SMOTE generates synthetic samples at random: seed it like the rest of the
    # notebook (random_state=42) so that a re-run reproduces the same result.
    ('sampler', SMOTE(random_state=42)),
    # A float n_components asks PCA for that fraction (80%) of explained variance.
    ('dim_reduction', PCA(n_components=0.8)),
    ('classifier', Perceptron())
])

In [None]:
# Fit the whole pipeline (all of its steps, in order) on the training split.
model_pipeline.fit(X_train,y_train)

In [None]:
# Predict the labels of the held-out test split with the fitted pipeline.
model_pipeline.predict(X_test)

In [220]:
# Learn the preprocessing (imputation values, scaling statistics, encodings)
# from the training split only, then apply the already fitted transformer to
# the test split. Calling fit_transform on the test split would re-fit the
# transformer on test data, so train and test would be transformed with
# different parameters and information from the test set would leak in.
# NOTE: this cell overwrites X_train / X_test with their transformed versions,
# so it is meant to be run once, right after the split cell.
X_train = smoking_tr.fit_transform(X_train)
X_test = smoking_tr.transform(X_test)


# Single seed shared by every stochastic estimator below.
seed = 42

# Tree-based models.
model_rf = RandomForestClassifier(
    n_estimators=200, class_weight="balanced", random_state=seed
)
model_tree = DecisionTreeClassifier(class_weight="balanced", random_state=seed)

# Linear models.
model_perceptron = Perceptron(max_iter=1000, eta0=0.01, random_state=seed)
model_logistic = LogisticRegression(class_weight="balanced", random_state=seed)
model_sgd = SGDClassifier(class_weight="balanced", random_state=seed)
model_ridge = RidgeClassifier(class_weight="balanced", random_state=seed)
model_pa = PassiveAggressiveClassifier(class_weight="balanced", random_state=seed)

# Instance-based and discriminant-analysis models.
model_knn = KNeighborsClassifier(n_neighbors=3)
model_lda = LDA()

# Collect all the models defined above in a single list.
models = [
    model_rf,
    model_perceptron,
    model_knn,
    model_logistic,
    model_tree,
    model_lda,
    model_sgd,
    model_ridge,
    model_pa,
]

In [221]:
# pd.DataFrame(X_test).to_csv("X_test.csv", index=False, header=True)

In [None]:
# Fit every candidate model on the training split and report its metrics on
# the test split, together with the test confusion matrix.
for model in models:
    print(f"MODEL: {model}")
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)

    # The matrix is built from test predictions; the title tells which model
    # each figure belongs to.
    test_confusion_matrix = ConfusionMatrixDisplay.from_predictions(
        y_test, y_test_pred, cmap=plt.cm.Blues
    )
    test_confusion_matrix.ax_.set_title(type(model).__name__)

    print(f"accuracy: {accuracy_score(y_test, y_test_pred)}")
    print(f"precision: {precision_score(y_test, y_test_pred)}")
    print(f"recall: {recall_score(y_test, y_test_pred)}")
    print(f"f1-score: {f1_score(y_test, y_test_pred)}")

    plt.show()


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=77cb4bcc-2fc9-439b-9d91-de217bfdb267' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>

In [223]:
# Save the test set (features followed by the target column).
# pd.concat(axis=1) aligns rows on their index labels. y_test still carries the
# row labels of the original dataset, while a DataFrame built from transformed
# features gets a fresh 0..n-1 index, so both indexes are reset to pair row i
# of the features with row i of the target by position.
test_features = pd.DataFrame(X_test).reset_index(drop=True)
test_target = pd.DataFrame(y_test).reset_index(drop=True)
pd.concat([test_features, test_target], axis=1).to_csv("test.csv", index=False)