# **Modelling and Tuning**

In [2]:
import pandas as pd
import numpy as np

In [3]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignore ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [40]:
import pickle

# Load train/val/test split data from notebook3
with open(r'Nata_Files\\train_test_split.pkl', 'rb') as f:
    train_test_split_data = pickle.load(f)

# Core datasets
X = train_test_split_data.get('X')
y = train_test_split_data.get('y')

# Splits and processed feature sets
X_train = train_test_split_data.get('X_train')
y_train = train_test_split_data.get('y_train')
X_val = train_test_split_data.get('X_val')
y_val = train_test_split_data.get('y_val')
X_test = train_test_split_data.get('X_test')
y_test = train_test_split_data.get('y_test')
X_train_val = train_test_split_data.get('X_train_val')
y_train_val = train_test_split_data.get('y_train_val')

numeric_cols = train_test_split_data.get('numeric_cols')
kf = train_test_split_data.get('kf')
rkf = train_test_split_data.get('rkf')

print("Train/Val/Test split data loaded successfully!")
if X is not None and y is not None:
    print(f"Full dataset X shape: {X.shape} | y shape: {y.shape}")
if X_train is not None:
    print(f"X_train shape: {X_train.shape}")
if X_val is not None:
    print(f"X_val shape: {X_val.shape}")
if X_test is not None:
    print(f"X_test shape: {X_test.shape}")
if kf is not None:
    try:
        print(f"kf splits: {kf.get_n_splits()}")
    except Exception:
        print("kf loaded (object), get_n_splits() unavailable for this object")
if rkf is not None:
    try:
        print(f"rkf splits: {rkf.get_n_splits()}")
    except Exception:
        print("rkf loaded (object), get_n_splits() unavailable for this object")

Train/Val/Test split data loaded successfully!
Full dataset X shape: (5196, 14) | y shape: (5196,)
X_train shape: (3117, 14)
X_val shape: (1039, 14)
X_test shape: (1040, 14)
kf splits: 10
rkf splits: 14


In [7]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import r2_score, accuracy_score

In [8]:
def fit_model(model, X, y):
    return model.fit(X, y)


In [9]:
def eval_clf_model(model, X, y):
    return model.score(X,y)

In [44]:
def avg_score_clf(method,X,y,model):
    score_train = []
    score_val = []
    for train_index, val_index in method.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        trained_model = fit_model(model, X_train, y_train)
        value_train = eval_clf_model(trained_model, X_train, y_train)
        value_val = eval_clf_model(trained_model, X_val, y_val)
        score_train.append(value_train)
        score_val.append(value_val)

    print('Train:', np.mean(score_train))
    print('Validation:', np.mean(score_val))

## **Model Selection**

Que modelos podemos usar?
Isto é um classification problem então podemos descartar logo alguns.
Podemos testar :
- Logistic Regression
- Decision Trees
- Naive Bayes
- Random Forest
- K nearest Classifier

In [11]:
logr = LogisticRegression()

In [29]:
logr_fit = fit_model(logr, X_train, y_train)

In [39]:
logr_pred_tr = eval_clf_model(logr_fit, X_train, y_train)
logr_pred_val = eval_clf_model(logr_fit, X_val, y_val)

display(logr_pred_tr)
display(logr_pred_val)

0.741738851459737

0.7507218479307026

In [45]:
avg_score_clf(kf, X, y, logr)

Train: 0.7413181197396282
Validation: 0.7403768341485104


### **Decision Tree Classifier model**


In [22]:
dtr = DecisionTreeClassifier()

In [35]:
dtr_fit = fit_model(dtr, X_train, y_train)


dtr_pred_tr = eval_clf_model(dtr_fit, X_train, y_train)
dtr_pred_val = eval_clf_model(dtr_fit, X_val, y_val)



### **K-Nearest Classifier**

In [26]:
from sklearn.neighbors import KNeighborsClassifier

In [27]:
modelKNN = KNeighborsClassifier()

In [36]:
KNN_fit = fit_model(modelKNN, X_train, y_train)


KNN_pred_tr = eval_clf_model(KNN_fit, X_train, y_train)
KNN_pred_val = eval_clf_model(KNN_fit, X_val, y_val)

In [32]:
#comparing accuracy scores in training and validation
display(accuracy_score(y_train, logr_pred_tr))

display(accuracy_score(y_val, logr_pred_val))

0.741738851459737

0.7507218479307026

In [33]:
logr_pred_test = logr.predict(X_test)

display(accuracy_score(y_test, logr_pred_test))

0.7423076923076923