In [1]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
print(model.get_params())

{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


In [2]:
from sklearn.svm import SVC
model = SVC()
print(model.get_params())

{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [3]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
print(model.get_params())

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False, 'tol': 1e-06}


In [4]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
print(model.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


## **Choosing the Right Hyperparameters to Tune**
Each model has specific hyperparameters that significantly impact performance:

| Model                  | Key Hyperparameters to Tune |
|------------------------|---------------------------|
| **Logistic Regression** | `C`, `penalty`, `solver` |
| **Decision Tree**       | `max_depth`, `min_samples_split`, `criterion` |
| **Random Forest**       | `n_estimators`, `max_depth`, `min_samples_split`, `bootstrap` |
| **SVM**                | `C`, `kernel`, `gamma` |
| **k-NN**               | `n_neighbors`, `weights`, `metric` |
| **Neural Networks (MLPClassifier)** | `hidden_layer_sizes`, `activation`, `solver`, `alpha` |


# Logistic Regression Fine Tuning

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns

# Load dataset
df = sns.load_dataset("titanic")
X = df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']]
y = df['survived']

# Convert categorical variable 'sex' to numerical using one-hot encoding
X = pd.get_dummies(X, columns=['sex'], drop_first=True)  # Avoid dummy variable trap

# Fill missing values in 'age' with mean age
X['age'] = X['age'].fillna(df['age'].mean())

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the model
model = LogisticRegression()

# Define hyperparameter grid for tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Regularization type
    'solver': ['liblinear', 'saga'] , # Suitable solvers for l1 and l2
    'warm_start': [True,False]
}

# Perform GridSearchCV
grid_model = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_model.fit(X_train, y_train)

# Print best parameters
print("Best Parameters:", grid_model.best_params_)

# Evaluate on test set
test_accuracy = grid_model.best_estimator_.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)


Best Parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'saga', 'warm_start': True}
Test Accuracy: 0.7932960893854749


# K-NN Fine Tuning

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Load dataset
df = sns.load_dataset("titanic")
X = df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']]
y = df['survived']

# Convert categorical variable 'sex' to numerical using one-hot encoding
X = pd.get_dummies(X, columns=['sex'], drop_first=True)  # Avoid dummy variable trap

# Fill missing values in 'age' with mean age
X['age'] = X['age'].fillna(df['age'].mean())
X['fare'] = X['fare'].fillna(df['fare'].mean())


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features (important for K-NN)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the K-NN model
model = KNeighborsClassifier()

# Define hyperparameter grid for tuning
param_grid = {
    'n_neighbors': np.arange(1,30),  # Different K values
    'weights': ['uniform', 'distance'],  # Weighting strategies
    'metric': ['euclidean', 'manhattan'],  # Distance metrics
    'p':[1,2,4,5,6,7]
}

# Perform GridSearchCV
grid_model = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_model.fit(X_train, y_train)

# Print best parameters
print("Best Parameters:", grid_model.best_params_)

# Evaluate on test set
test_accuracy = grid_model.best_estimator_.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)


Best Parameters: {'metric': 'manhattan', 'n_neighbors': np.int64(15), 'p': 1, 'weights': 'uniform'}
Test Accuracy: 0.7877094972067039


# Best Model with Tuning

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset
df = sns.load_dataset("titanic")

# Select relevant features
X = df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']]
y = df['survived']

# Convert categorical variable 'sex' to numerical using one-hot encoding
X = pd.get_dummies(X, columns=['sex'], drop_first=True)  # Avoid dummy variable trap

# Fill missing values
X['age'] = X['age'].fillna(df['age'].mean())
X['fare'] = X['fare'].fillna(df['fare'].mean())  # Fix missing 'fare' values

# Import necessary libraries from sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define models and hyperparameter grids
models_with_params = {
    "Logistic Regression": (LogisticRegression(), {
        'C': [0.1, 1, 10],
        'max_iter': [100, 200]
    }),
    "k-Nearest Neighbors": (KNeighborsClassifier(), {
        'n_neighbors': list(range(1, 10)),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    }),
    "Decision Tree": (DecisionTreeClassifier(), {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30]
    }),
    "Random Forest": (RandomForestClassifier(), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'criterion': ['gini', 'entropy']
    }),
    "Support Vector Machine": (SVC(probability=True), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly']
    })
}

# Train and tune models
best_models = []
for name, (model, param_grid) in models_with_params.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    best_score = grid_search.best_score_
    best_params = grid_search.best_params_
    
    best_models.append((name, best_model, best_score, best_params))

# Sort models by accuracy
sorted_models = sorted(best_models, key=lambda x: x[2], reverse=True)

# Print best model and its parameters
print("\nBest Model:")
print(f"Model: {sorted_models[0][0]}")
print(f"Cross-Validation Accuracy: {sorted_models[0][2]:.2f}")
print(f"Best Parameters: {sorted_models[0][3]}")


# Evaluate the best model on the test set
best_model = sorted_models[0][1]
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.2f}\n")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits

Best Model:
Model: Random Forest
Cross-Validation Accuracy: 0.83
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'n_estimators': 200}
Test Accuracy: 0.81

              precision    recall  f1-score   support

           0       0.80      0.90      0.85       105
           1       0.82      0.69      0.75        74

    accuracy                           0.81       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179

