<a href="https://colab.research.google.com/github/ashish78905/OPTICONNECT_CALLL_CENTER_ANALYSIS-ASSIGNMENT/blob/main/TILL_SUPERVISED_ML.IPYNB" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ========== 1. DATA LOADING ==========
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes

In [None]:
# Fetch the bundled diabetes dataset and print the description text it ships with
diabetes = load_diabetes()
print(diabetes["DESCR"])

In [None]:
# Working table: one column per feature, with the regression target appended last
data = pd.DataFrame(diabetes.data, columns=diabetes.feature_names).assign(
    target=diabetes.target
)

In [None]:
# Scatter of the bmi feature against the target
fig, ax = plt.subplots()
ax.scatter(data["bmi"], data["target"])
ax.set_xlabel("bmi")
ax.set_ylabel("target")
plt.show()

In [None]:
# Response is the 'target' column; predictors are every other column
y = data['target']
X = data.drop(columns='target')

In [None]:
# ========== 3. TRAIN-TEST SPLIT ==========
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Shapes are printed in the order: train X, train y, test X, test y
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
# ========== 4. SCALING (STANDARDIZATION) ==========
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply them to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Column 0 of the scaled matrix plotted against the training target
fig, ax = plt.subplots()
ax.scatter(X_train[:, 0], y_train)
ax.set_xlabel("Scaled Age")
ax.set_ylabel("Target")
plt.show()

In [None]:
# ========== 5. MODEL TRAINING ==========
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the scaled training data (fit() returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Learned weights and bias
print("Coefficient:", model.coef_)
print("Intercept:", model.intercept_)

In [None]:
# ========== 6. MODEL PREDICTION ==========
y_pred = model.predict(X_test)

# On training data
# The model is fitted on every feature column, so its predictions are not a
# function of column 0 alone. Joining them with a line in row order produces a
# tangle; drawing them as points keeps the figure readable.
y_pred_train = model.predict(X_train)
plt.scatter(X_train[:, 0], y_train, label="Actual")
plt.scatter(X_train[:, 0], y_pred_train, color='r', marker='.', label="Predicted")
plt.xlabel("Scaled Age")
plt.ylabel("Target")
plt.legend()
plt.show()

In [None]:
# On testing data
y_pred_test = model.predict(X_test)

# Predictions come from all feature columns, so they are drawn as points rather
# than as a line connected in row order (which would zig-zag across the plot).
plt.scatter(X_test[:, 0], y_test, label="Actual")
plt.scatter(X_test[:, 0], y_pred_test, color='r', marker='.', label="Predicted")
plt.xlabel("Scaled Age")
plt.ylabel("Target")
plt.legend()
plt.show()

In [None]:
# ========== 7. PERFORMANCE METRICS ==========
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Error metrics on the held-out predictions; RMSE is the square root of MSE
mse = mean_squared_error(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)
rmse = np.sqrt(mse)

for metric_name, metric_value in (("MSE:", mse), ("MAE:", mae), ("RMSE:", rmse)):
    print(metric_name, metric_value)

score = r2_score(y_test, y_pred_test)
print("R2 Score:", score)

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
# n = number of test rows, p = number of feature columns
n_obs = len(y_test)
n_features = X_test.shape[1]
adj_r2 = 1 - (1 - score) * (n_obs - 1) / (n_obs - n_features - 1)
print("Adjusted R2:", adj_r2)

In [None]:
# ========== 8. VISUALIZATION ==========
# The fitted model uses every feature, so there is no single "regression line"
# over column 0. Predictions are shown as points next to the actual values
# instead of being joined in row order.
plt.scatter(X_test[:, 0], y_test, color='black', label='Actual data')
plt.scatter(X_test[:, 0], y_pred_test, color='blue', marker='.', label="Model predictions")
plt.xlabel("Scaled Age")
plt.ylabel("One year progression-target")
plt.title("Linear regression on diabetes data")
plt.legend()
plt.show()

In [None]:
# ========== 9. MODEL ASSUMPTIONS & RESIDUALS ==========
# Actual vs. predicted: points close to the diagonal indicate good predictions
plt.scatter(y_test, y_pred_test)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Actual vs. predicted")
plt.show()

# Residual = actual - predicted, one value per test row
error = y_test - y_pred_test

# Residuals vs. predicted: the dashed line marks zero error
plt.scatter(y_pred_test, error)
plt.axhline(0, color='r', linestyle='--')
plt.xlabel("Predicted")
plt.ylabel("Residual (actual - predicted)")
plt.title("Residuals plot")
plt.show()

print("Residual errors:\n", error)

# MULTIPLE LINEAR REGRESSION

In [None]:
# ========== 1. DATA LOADING ==========
from sklearn.datasets import load_diabetes
import pandas as pd

# Load the dataset and print its bundled description
diabetes = load_diabetes()
print(diabetes.DESCR)

# One column per feature, plus the regression target as the last column
data = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
data['target'] = diabetes.target

# Preview the first rows (a bare expression is only rendered when it ends the cell)
data.head()

In [None]:
# ========== 2. EDA & DATA PREPARATION ==========
import seaborn as sns

# Response is the 'target' column; the independent features are all remaining columns
y = data['target']
X = data.drop(columns='target')

# --- Exploratory overview of the full DataFrame ---
print("Data Info:")
data.info()
print("\nData Types:")
print(data.dtypes)

# Head, tail and a 3-row random sample, each under its own caption
previews = (
    ("\nHead of Data:", data.head()),
    ("\nTail of Data:", data.tail()),
    ("\nSample of Data:", data.sample(3)),
)
for caption, preview in previews:
    print(caption)
    display(preview)

print("\nMissing Values:")
print(data.isnull().sum())
print("\nData Description:")
display(data.describe())

# The correlation matrix is computed once and reused for the table and the heatmap
corr_matrix = data.corr()
print("\nCorrelation Matrix:")
display(corr_matrix)

# Heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# ========== 3. TRAIN-TEST SPLIT ==========
from sklearn.model_selection import train_test_split
# 70/30 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1
)
# Cell output: shapes of (X_train, X_test, y_train, y_test)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# ========== 4. SCALING ==========
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only; the test rows reuse those statistics
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Show the first rows of each scaled matrix instead of dumping whole arrays
# (only the final expression of a cell is rendered, so both go in one tuple).
X_train[:5], X_test[:5]

In [None]:
# ========== 5. MODEL TRAINING ==========
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the scaled training data
model = LinearRegression()
model.fit(X_train, y_train)

# Sanity check: one learned coefficient per feature column
assert len(model.coef_) == X_train.shape[1]

# Print explicitly: bare expressions in the middle of a cell are not displayed
print("Training matrix shape:", X_train.shape)
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

In [None]:
# ========== 6. MODEL PREDICTION ==========
# Predictions of the fitted model for the (scaled) held-out rows
y_pred = model.predict(X_test)

In [None]:
# ========== 7. PERFORMANCE METRICS ==========
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# MSE, MAE and RMSE on the held-out predictions (RMSE reuses the MSE value)
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)
print(mean_absolute_error(y_test, y_pred))
print(np.sqrt(test_mse))

score = r2_score(y_test, y_pred)

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1); shown as the cell output
n_obs, n_features = len(y_test), X_test.shape[1]
1 - (1 - score) * (n_obs - 1) / (n_obs - n_features - 1)

In [None]:
# ========== 8. MODEL ASSUMPTIONS & RESIDUALS ==========
# Each diagnostic gets its own figure; without plt.show() between them all
# three would be drawn on top of each other on the same axes.

# Actual vs. predicted
plt.scatter(y_test, y_pred)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Actual vs. predicted")
plt.show()

# Residual = actual - predicted
error = y_test - y_pred

# Distribution of the residuals (histplot replaces the deprecated distplot)
sns.histplot(error, kde=True)
plt.xlabel("Residual")
plt.title("Residual distribution")
plt.show()

# Residuals vs. predicted; the dashed line marks zero error
plt.scatter(y_pred, error)
plt.axhline(0, color='r', linestyle='--')
plt.xlabel("Predicted")
plt.ylabel("Residual")
plt.title("Residuals vs. predicted")
plt.show()

# PICKLING MODEL

In [None]:
import pickle   # Importing pickle module for saving/loading Python objects

# Saving the trained model into a file named "model.pkl"
# pickle.dump(object, file)
# object = model → the trained LinearRegression model
# open("model.pkl", "wb") → open file in write-binary mode to store bytes
# The `with` block closes the file as soon as the dump finishes, so the bytes
# are flushed to disk and the handle is not leaked.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# Explanation:
# The Python object (here: model), along with its attributes (coefficients,
# intercept, etc.), is converted into a byte stream and saved into a file.
# Later, we can load this model back using pickle.load() without retraining.


# Loading the saved model from the file "model.pkl"
# pickle.load(file) → reads the byte stream and reconstructs the original Python object
# open("model.pkl", "rb") → open file in read-binary mode
# NOTE: unpickling can execute arbitrary code, so only load pickle files you
# created yourself or otherwise trust.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Now 'model' is the reconstructed LinearRegression model we saved above.
# We can directly use it for predictions without retraining.


# Using the loaded model to make predictions on the test set
# model.predict(X_test) → applies the learned coefficients & intercept
# to the features in X_test and returns predicted values for the target variable
y_pred = model.predict(X_test)

# Now y_pred contains the predictions made by the trained (or loaded) model


# POLYNOMIAL REGRESSION

In [None]:
# ========== 1. DATA GENERATION ==========
import numpy as np
import matplotlib.pyplot as plt

# Seeded synthetic data: 100 points with X drawn from [0, 2) and
# y = 4 + 3*X + 1.5*X^2 plus standard-normal noise
np.random.seed(1)
X = 2 * np.random.rand(100, 1)
noise = np.random.randn(100, 1)
y = 4 + 3 * X + 1.5 * X ** 2 + noise

# Cell output: the generated target values
y

In [None]:
# ========== 2. TRAIN-TEST SPLIT ==========
from sklearn.model_selection import train_test_split
# 80/20 split of the synthetic data with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Cell output: shapes of the two feature matrices
tuple(part.shape for part in (X_train, X_test))


In [None]:
# ========== 3. POLYNOMIAL TRANSFORMATION ==========
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Single source of truth for the polynomial degree: the plot label further
# down reads this variable, so the transformer must read it too.
degree = 2
poly_features = PolynomialFeatures(degree=degree, include_bias=False)

# Fit the expansion on the training inputs and transform them in one step
X_poly_train = poly_features.fit_transform(X_train)

In [None]:
# ========== 4. MODEL TRAINING ==========
# Linear regression on the polynomial-expanded training features
poly_reg = LinearRegression()
poly_reg.fit(X_poly_train, y_train)

# Show both learned parameters; only the final expression of a cell is
# rendered, so they are returned together as one tuple.
poly_reg.coef_, poly_reg.intercept_

In [None]:
# ========== 5. MODEL PREDICTION ==========
# Predictions for the polynomial-expanded training inputs (not the held-out split)
y_poly_predict = poly_reg.predict(X_poly_train)

In [None]:
# ========== 6. PERFORMANCE EVALUATION ==========
from sklearn.metrics import mean_squared_error
# Training error
mse_train = mean_squared_error(y_train, y_poly_predict)
print(f'Mean Squared Error on Training Data: {mse_train}')

# Held-out error: expand the test inputs with the already-fitted transformer
# (transform only, no refit) and score the model on data it has not seen.
X_poly_test = poly_features.transform(X_test)
mse_test = mean_squared_error(y_test, poly_reg.predict(X_poly_test))
print(f'Mean Squared Error on Test Data: {mse_test}')

In [None]:
# ========== 7. VISUALIZATION ==========
fig, ax = plt.subplots()
ax.scatter(X_train, y_train, label="training data")

# Evaluate the fitted curve on 100 evenly spaced inputs between 0 and 2
X_range = np.linspace(0, 2, 100)[:, np.newaxis]
X_range_poly = poly_features.transform(X_range)
fitted_curve = poly_reg.predict(X_range_poly)
ax.plot(X_range, fitted_curve, color='red',
        label=f'Polynomial Regression (Degree {degree})')

ax.set_xlabel('X')
ax.set_ylabel('y')
ax.set_title('Polynomial Regression')
ax.legend()
plt.show()

# MULTICOLLINEARITY

In [None]:
# ========== 1. LOADING DATASET ==========
from sklearn.datasets import fetch_california_housing
import pandas as pd

# California housing features, with the target appended as the last column 'Price'
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(Price=data.target)

In [None]:
# ========== 2. MULTICOLLINEARITY ANALYSIS ==========
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations, computed once and reused by both plots
corr_matrix = df.corr()

# Annotated correlation heatmap on its own explicitly created axes
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, vmin=-1, vmax=1, ax=ax)
ax.set_title("Correlation heatmap")
plt.show()

# clustermap always builds its own figure, so a preceding plt.figure(...) only
# leaves an empty figure behind. Pass figsize= to clustermap to resize it.
sns.clustermap(corr_matrix, vmin=-1, vmax=1, annot=True)
plt.show()

In [None]:
# ========== 3. VARIANCE INFLATION FACTOR (VIF) ==========
from statsmodels.stats.outliers_influence import variance_inflation_factor

def compute_vif(features):
    """Return a DataFrame with one variance inflation factor per column.

    Parameters
    ----------
    features : pandas.DataFrame
        Explanatory variables only (the regression target must not be included).

    Returns
    -------
    pandas.DataFrame
        Columns "Feature" (column name) and "VIF" (its inflation factor).
    """
    design = features.values
    return pd.DataFrame({
        "Feature": features.columns,
        "VIF": [variance_inflation_factor(design, i) for i in range(design.shape[1])],
    })


# Work on a copy without "Longitude"; `df` itself stays untouched
df1 = df.drop(columns="Longitude")

# VIF measures collinearity among the predictors, so the target column
# 'Price' is excluded from the matrix passed to the computation.
# NOTE(review): no intercept column is added to the design matrix here;
# confirm whether a constant term should be included before computing VIF.
vif = compute_vif(df1.drop(columns="Price"))
vif


In [None]:
# Drop "AveRooms" without mutating in place; errors="ignore" lets this cell be
# re-run after the column is already gone instead of raising KeyError.
df1 = df1.drop(columns="AveRooms", errors="ignore")

# VIF is computed on the predictors only: the target 'Price' is left out
vif_features = df1.drop(columns="Price")
vif = pd.DataFrame({
    "Feature": vif_features.columns,
    "VIF": [
        variance_inflation_factor(vif_features.values, i)
        for i in range(vif_features.shape[1])
    ],
})
vif

In [None]:
# Drop "Latitude" without mutating in place; errors="ignore" lets this cell be
# re-run after the column is already gone instead of raising KeyError.
df1 = df1.drop(columns="Latitude", errors="ignore")

# VIF is computed on the predictors only: the target 'Price' is left out
vif_features = df1.drop(columns="Price")
vif = pd.DataFrame({
    "Feature": vif_features.columns,
    "VIF": [
        variance_inflation_factor(vif_features.values, i)
        for i in range(vif_features.shape[1])
    ],
})
vif

In [None]:
# ========== 4. FEATURE SELECTION (X & y) ==========
# Last column of df1 is the target; everything before it is a feature.
# NOTE(review): the next cell reassigns X and y from `df`, so this selection
# is not what the RFE step actually uses — confirm which one is intended.
feature_columns = df1.columns[:-1]
target_column = df1.columns[-1]
X = df1[feature_columns]
y = df1[target_column]

In [None]:
# ========== 5. RFE (RECURSIVE FEATURE ELIMINATION) ==========
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# RFE starts from the full feature set of `df` (all columns except the last,
# which is the target), not from the VIF-reduced df1.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Recursively drop features until five remain.
# NOTE(review): the features are passed unscaled here; confirm whether they
# should be standardized first, since the elimination relies on the fitted
# linear model's coefficients.
rfe = RFE(estimator=LinearRegression(), n_features_to_select=5)
rfe.fit(X, y)

# support_: boolean mask of kept features; ranking_: 1 marks a selected feature
print(rfe.support_)
print(rfe.ranking_)

selected_features = X.columns[rfe.support_]
print("Selected Features:", selected_features.tolist())

# LASSO, RIDGE AND ELASTIC NET IMPLEMENTATION

In [None]:
# -------------------- LOAD DATASET --------------------
import seaborn as sns
import pandas as pd

# Load the mpg dataset and drop the free-text car name, which is not a feature
df = sns.load_dataset('mpg').drop(columns="name")

# Fill the missing horsepower values with the column median
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())
assert df.isna().sum().sum() == 0, "unexpected missing values remain"

# 'origin' is a nominal category (usa / japan / europe). Mapping it to 1/2/3
# would impose an order and equal spacing that the linear models below would
# treat as real, so it is one-hot encoded instead (first level dropped to
# avoid a redundant column).
print(df['origin'].value_counts())
df = pd.get_dummies(df, columns=['origin'], drop_first=True, dtype=int)

# Column overview after cleaning
df.info()

In [None]:
# -------------------- FEATURE & TARGET SPLIT --------------------
# Target is fuel economy ('mpg'); every other column is a predictor
y = df['mpg']
X = df.drop(columns='mpg')

# Cell output: the target series
y

In [None]:
# -------------------- TRAIN TEST SPLIT --------------------
from sklearn.model_selection import train_test_split
# 70/30 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)
# Cell output: shapes of the two feature matrices
tuple(part.shape for part in (X_train, X_test))


In [None]:
# -------------------- LINEAR REGRESSION --------------------
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Unregularized baseline fitted on the training split
regression_model = LinearRegression().fit(X_train, y_train)

# Coefficients are listed in the same order as the training columns
for col_name, weight in zip(X_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {weight}")

# R2 on the held-out split
y_pred_linear = regression_model.predict(X_test)
r2_linear = r2_score(y_test, y_pred_linear)
print(f"R square of linear regression {r2_linear}")

In [None]:
# -------------------- RIDGE REGRESSION --------------------
from sklearn.linear_model import Ridge
# L2-penalized regression with penalty strength alpha=0.1
# NOTE(review): X_train comes straight from train_test_split with no scaling
# step in between; confirm whether the features should be standardized before
# fitting penalized models.
ridge_regression_model = Ridge(alpha=0.1).fit(X_train, y_train)

for col_name, weight in zip(X_train.columns, ridge_regression_model.coef_):
    print(f"The coefficient for {col_name} is {weight}")

# R2 on the held-out split
y_pred_ridge = ridge_regression_model.predict(X_test)
r2_ridge = r2_score(y_test, y_pred_ridge)
print(f"R-squared score for Ridge Regression: {r2_ridge}")

In [None]:
# -------------------- LASSO REGRESSION --------------------
from sklearn.linear_model import Lasso
# L1-penalized regression with penalty strength alpha=0.5
lasso_regression_model = Lasso(alpha=0.5).fit(X_train, y_train)

for col_name, weight in zip(X_train.columns, lasso_regression_model.coef_):
    print(f"The coefficient for {col_name} is {weight}")

# R2 on the held-out split
y_pred_lasso = lasso_regression_model.predict(X_test)
r2_lasso = r2_score(y_test, y_pred_lasso)
print(f"R-squared score for Lasso Regression: {r2_lasso}")

In [None]:
# -------------------- ELASTIC NET REGRESSION --------------------
from sklearn.linear_model import ElasticNet
# Combined L1/L2 penalty: alpha=1 overall strength, l1_ratio=0.5 mixes both equally
elastic_net_model = ElasticNet(alpha=1, l1_ratio=0.5).fit(X_train, y_train)

for col_name, weight in zip(X_train.columns, elastic_net_model.coef_):
    print(f"The coefficient for {col_name} is {weight}")

# R2 on the held-out split
y_pred_elastic_net = elastic_net_model.predict(X_test)
r2_elastic_net = r2_score(y_test, y_pred_elastic_net)
print(f"R-squared score for Elastic Net Regression: {r2_elastic_net}")

In [None]:
# -------------------- LASSO WITH CROSS VALIDATION --------------------
from sklearn.linear_model import LassoCV
# Lasso whose alpha is selected by 5-fold cross-validation on the training split
lassocv = LassoCV(cv=5).fit(X_train, y_train)

# Score the selected model on the held-out split
y_pred_lassocv = lassocv.predict(X_test)
score_lassocv = r2_score(y_test, y_pred_lassocv)

print("Best alpha chosen by LassoCV:", lassocv.alpha_)
print("R2 Score (LassoCV):", score_lassocv)

In [None]:
# -------------------- RIDGE WITH CROSS VALIDATION --------------------
from sklearn.linear_model import RidgeCV
# Ridge whose alpha is selected by 5-fold cross-validation on the training split
ridgecv = RidgeCV(cv=5).fit(X_train, y_train)

# Score the selected model on the held-out split
y_pred_ridgecv = ridgecv.predict(X_test)
score_ridgecv = r2_score(y_test, y_pred_ridgecv)

print("Best alpha chosen by RidgeCV:", ridgecv.alpha_)
print("R2 Score (RidgeCV):", score_ridgecv)
print("RidgeCV Parameters:", ridgecv.get_params())

# LOGISTIC REGRESSION

In [None]:
# ---------------- MASTER LOGISTIC REGRESSION PIPELINE ----------------
# This single, unified script combines all the code and concepts from the three
# Logistic Regression examples in your file into one complete workflow.

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_curve, auc, precision_score, recall_score

warnings.filterwarnings('ignore')  # Ignore warnings for cleaner output

# ====================================================================================
# --- STEP 1: LOAD, PREPARE, AND EXPLORE THE DATA (FROM IRIS EXAMPLE) ---
# ====================================================================================
print("--- Step 1: Loading and Preparing the Iris Dataset for Binary Classification ---")

# ---------------- 1.1 LOAD & EXPLORE IRIS DATA ----------------
data_iris = load_iris()
df_iris = pd.DataFrame(data_iris.data, columns=data_iris.feature_names)
df_iris['target'] = data_iris.target
print("Iris Data Head:")
print(df_iris.head())
print("\nUnique Target Classes:", df_iris['target'].unique())

# ---------------- 1.2 PREPARE FOR BINARY CLASSIFICATION ----------------
# Keep only the rows whose class is not '2', leaving a two-class problem
is_binary_row = df_iris['target'] != 2
df_final = df_iris[is_binary_row]
# Features are every column before the last; the last column is the target
X = df_final.iloc[:, :-1]
y = df_final.iloc[:, -1]
print("\nData prepared for binary classification. Unique classes remaining:", y.unique())

# ---------------- 1.3 SPLIT DATA INTO TRAINING AND TESTING SETS ----------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1
)
print(f"\nData split into {X_train.shape[0]} training samples and {X_test.shape[0]} testing samples.")
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 2: INITIAL MODEL TRAINING AND EVALUATION ---
# ====================================================================================
print("--- Step 2: Training an Initial Logistic Regression Model ---")

# ---------------- 2.1 TRAIN THE MODEL ----------------
classifier = LogisticRegression().fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# Column 1 of predict_proba holds the probability of the positive class
class_probabilities = classifier.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]
print("Model trained and initial predictions are made.")

# ---------------- 2.2 EVALUATION METRICS ----------------
print("\nStep 2.2: Evaluating the model with standard metrics...")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ---------------- 2.3 ROC CURVE & AUC ----------------
print("\nStep 2.3: Plotting the ROC Curve and calculating AUC...")
# False/true positive rates over all probability cut-offs, and the area under that curve
fpr, tpr, thresholds_roc = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', linewidth=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference
ax.plot([0, 1], [0, 1], color='navy', linewidth=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
ax.grid(True)
plt.show()


# ---------------- 2.4 K-FOLD CROSS-VALIDATION ----------------
print("\nStep 2.4: Performing K-Fold Cross-Validation (k=5) for robustness check...")
# Five folds over the training split, scored by accuracy
cv = KFold(n_splits=5)
scores = cross_val_score(classifier, X_train, y_train, cv=cv, scoring="accuracy")
mean_cv_accuracy = np.mean(scores)
print(f"Accuracy scores for each fold: {scores}")
print(f"Mean Accuracy from Cross-Validation: {mean_cv_accuracy:.4f}")
print("\n" + "="*80 + "\n")


# ======================================================================================
# --- STEP 3: THRESHOLD ANALYSIS (FROM SYNTHETIC DATA EXAMPLE) ---
# ======================================================================================
print("--- Step 3: Analyzing the Precision-Recall-Accuracy Tradeoff ---")

# ---------------- 3.1 PRECISION-RECALL TRADEOFF ANALYSIS ----------------
print("\nStep 3.1: Plotting metrics vs. different classification thresholds...")
thresholds_tradeoff = np.linspace(0, 1, 100)

# Hard 0/1 labels for each candidate cut-off (probability >= cut-off means class 1)
labels_per_threshold = [
    (y_pred_proba >= cutoff).astype(int) for cutoff in thresholds_tradeoff
]
# zero_division=0 makes precision 0 when a cut-off predicts no positives at all
precisions = [precision_score(y_test, labels, zero_division=0) for labels in labels_per_threshold]
recalls = [recall_score(y_test, labels) for labels in labels_per_threshold]
accuracies = [accuracy_score(y_test, labels) for labels in labels_per_threshold]

fig, ax = plt.subplots(figsize=(10, 6))
for curve, curve_label in ((precisions, 'Precision'), (recalls, 'Recall'), (accuracies, 'Accuracy')):
    ax.plot(thresholds_tradeoff, curve, label=curve_label)
ax.set_xlabel('Threshold Probability')
ax.set_ylabel('Score')
ax.set_title('Precision, Recall, and Accuracy vs. Threshold Probability')
ax.legend()
ax.grid(True)
plt.show()


# ---------------- 3.2 EVALUATE WITH A CUSTOM THRESHOLD ----------------
print("\nStep 3.2: Re-evaluating model with an optimal custom threshold (e.g., 0.4)...")
# A cut-off read off the plot above; probabilities strictly above it become class 1
custom_threshold = 0.4
new_pred_levels = (y_pred_proba > custom_threshold).astype(int)
print(f"Classification Report (custom threshold = {custom_threshold}):")
print(classification_report(y_test, new_pred_levels))
print("\n" + "="*80 + "\n")


# ======================================================================================
# --- STEP 4: HYPERPARAMETER TUNING (FROM GRIDSEARCHCV & RANDOMIZEDSEARCHCV EXAMPLE) ---
# ======================================================================================
print("--- Step 4: Finding the Best Model Parameters with Hyperparameter Tuning ---")

# ---------------- 4.1 HYPERPARAMETER TUNING WITH GRIDSEARCHCV ----------------
print("\nStep 4.1: Performing an exhaustive search with GridSearchCV...")
# Only the penalties the 'liblinear' solver supports are searched. 'elasticnet'
# is not one of them: every such candidate failed to fit and scored NaN, and
# the failures were hidden by the blanket warnings filter at the top of the cell.
params = {'penalty': ['l1', 'l2'], 'C': [1, 10, 20, 30, 40]}
base_model = LogisticRegression(solver='liblinear') # 'l1' requires 'liblinear' solver
clf_grid = GridSearchCV(base_model, param_grid=params, cv=5)
clf_grid.fit(X_train, y_train)
print(f"Best Parameters from GridSearchCV: {clf_grid.best_params_}")
print(f"Best Score from GridSearchCV: {clf_grid.best_score_:.4f}")

# ---------------- 4.2 HYPERPARAMETER TUNING WITH RANDOMIZEDSEARCHCV ----------------
print("\nStep 4.2: Performing a randomized search with RandomizedSearchCV...")
# The grid now holds 10 combinations, so 5 draws are sampled to keep this a
# genuine random subset; random_state makes the sampled candidates repeatable.
randomized_clf = RandomizedSearchCV(
    base_model, param_distributions=params, cv=5, n_iter=5, random_state=1
)
randomized_clf.fit(X_train, y_train)
print(f"Best Parameters from RandomizedSearchCV: {randomized_clf.best_params_}")
print(f"Best Score from RandomizedSearchCV: {randomized_clf.best_score_:.4f}")

# ---------------- 4.3 EVALUATE THE FINAL, TUNED MODEL ----------------
print("\nStep 4.3: Evaluating the final model with the best found parameters...")
# GridSearchCV has already refit the best parameter combination on the whole
# training split, so that estimator is reused rather than fitted a second time.
final_model = clf_grid.best_estimator_
y_pred_final = final_model.predict(X_test)
print("Final Classification Report (after tuning):")
print(classification_report(y_test, y_pred_final))
print("\n--- MASTER PIPELINE COMPLETE ---")

# ---------------- END OF SCRIPT ----------------



In [None]:
# ---------------- MASTER MULTICLASS LOGISTIC REGRESSION PIPELINE ----------------
# This single, unified script combines all the code and concepts from your
# Multiclass Logistic Regression examples into one complete workflow.

# Import required libraries
import pandas as pd
import warnings
from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

warnings.filterwarnings('ignore')  # Ignore warnings for cleaner output

# ====================================================================================
# --- STEP 1: LOAD, PREPARE, AND EXPLORE THE IRIS DATASET ---
# ====================================================================================
print("--- Step 1: Loading and Preparing the Iris Dataset for Multiclass Classification ---")

# ---------------- 1.1 LOAD & EXPLORE IRIS DATA ----------------
data = load_iris()
print("Available keys in the dataset object:", data.keys())
# print("\nDataset Description:")
# print(data.DESCR) # This can be very long, so it's commented out for cleaner output

# ---------------- 1.2 CREATE AND EXPLORE DATAFRAME ----------------
# Feature columns first, with the class label appended as the last column
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)
print("\nIris Data Head:")
print(df.head())
print("\nUnique Target Classes:", df.target.unique())

# ---------------- 1.3 SEPARATE FEATURES AND TARGET ----------------
# 'target' is the last column, so dropping it leaves exactly the feature columns
y = df['target']
X = df.drop(columns='target')
print("\nFeatures (X) and target (y) have been separated.")

# ---------------- 1.4 SPLIT DATA INTO TRAINING AND TESTING SETS ----------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1
)
print(f"\nData split into {X_train.shape[0]} training samples and {X_test.shape[0]} testing samples.")
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 2: MODELING WITH 'ONE-VS-REST' (OVR) STRATEGY ---
# ====================================================================================
from sklearn.multiclass import OneVsRestClassifier

print("--- Step 2: Training a Model with the 'One-vs-Rest' (OVR) Strategy ---")
# The OVR strategy fits one classifier per class against all other classes.

# ---------------- 2.1 TRAIN THE OVR MODEL ----------------
# Note: with the 'lbfgs' solver LogisticRegression fits a multinomial model for
# 3+ classes by default, so one-vs-rest has to be requested explicitly.
# The `multi_class` argument is deprecated in recent scikit-learn releases;
# OneVsRestClassifier is the supported way to get OvR and works on older
# releases as well.
ovr_model = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=300))
ovr_model.fit(X_train, y_train)
print("OVR model trained successfully.")

# ---------------- 2.2 MAKE PREDICTIONS ----------------
y_pred_ovr = ovr_model.predict(X_test)
print("\nFirst 10 predictions from OVR model:", y_pred_ovr[:10])
# print("\nPrediction probabilities from OVR model:\n", ovr_model.predict_proba(X_test)[:5])

# ---------------- 2.3 EVALUATE THE OVR MODEL ----------------
print("\n--- OVR Model Evaluation ---")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_ovr))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred_ovr))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_ovr))
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 3: MODELING WITH 'MULTINOMIAL' STRATEGY ---
# ====================================================================================
print("--- Step 3: Training a Model with the 'Multinomial' Strategy ---")
# The Multinomial strategy considers all classes at once in a single model.

# ---------------- 3.1 TRAIN THE MULTINOMIAL MODEL ----------------
# With the 'lbfgs' solver and three or more classes, LogisticRegression fits a
# multinomial model by default. The deprecated `multi_class` argument is
# therefore omitted so the cell keeps working on current scikit-learn releases.
multinomial_model = LogisticRegression(solver='lbfgs', max_iter=300)
multinomial_model.fit(X_train, y_train)
print("Multinomial model trained successfully.")

# ---------------- 3.2 MAKE PREDICTIONS ----------------
y_pred_multinomial = multinomial_model.predict(X_test)
print("\nFirst 10 predictions from Multinomial model:", y_pred_multinomial[:10])
# print("\nPrediction probabilities from Multinomial model:\n", multinomial_model.predict_proba(X_test)[:5])

# ---------------- 3.3 EVALUATE THE MULTINOMIAL MODEL ----------------
print("\n--- Multinomial Model Evaluation ---")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_multinomial))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred_multinomial))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_multinomial))
print("\nNOTE: The ROC AUC curve is typically used for binary classification problems.")
print("\n--- MASTER PIPELINE COMPLETE ---")

# ---------------- END OF SCRIPT ----------------


# DECISION TREE CLASSIFIER

In [None]:
# ---------------- MASTER DECISION TREE CLASSIFIER PIPELINE ----------------
# This single, unified script combines all the code and concepts from your
# Decision Tree Classification examples into one complete workflow.

# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report

# ====================================================================================
# --- STEP 1: LOAD, PREPARE, AND EXPLORE THE IRIS DATASET ---
# ====================================================================================
print("--- Step 1: Loading and Preparing the Iris Dataset ---")

# ---------------- 1.1 LOAD & EXPLORE IRIS DATA ----------------
data = load_iris()
# print(data.DESCR) # Uncomment to see the full description

# ---------------- 1.2 CREATE DATAFRAME AND SEPARATE FEATURES/TARGET ----------------
# The DataFrame holds only feature columns, so it doubles as the feature matrix;
# the class labels stay as the array provided by the dataset object.
df = pd.DataFrame(data.data, columns=data.feature_names)
X, y = df, data.target
print("Dataset loaded. Preview of features (X):")
print(X.head())
print("\nTarget (y) classes:", y[:10], "...")

# ---------------- 1.3 SPLIT DATA INTO TRAINING AND TESTING SETS ----------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)
print(f"\nData split into {X_train.shape[0]} training samples and {X_test.shape[0]} testing samples.")
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 2: INITIAL MODEL AND POST-PRUNING ---
# ====================================================================================
print("--- Step 2: Training an Initial (Unpruned) Decision Tree ---")

# ---------------- 2.1 TRAIN AND VISUALIZE THE FULL TREE ----------------
# This model is not constrained and can grow to its full depth, risking overfitting.
# random_state is fixed so the fitted tree (and therefore the figure) is the
# same on every run, matching the seeded train/test split above.
unpruned_classifier = DecisionTreeClassifier(criterion='entropy', random_state=1)
unpruned_classifier.fit(X_train, y_train)

print("Visualizing the full, unpruned Decision Tree...")
plt.figure(figsize=(20, 15))
plot_tree(unpruned_classifier, filled=True, feature_names=data.feature_names, class_names=data.target_names)
plt.title("Full, Unpruned Decision Tree")
plt.show()


# ---------------- 2.2 TRAIN AND EVALUATE A POST-PRUNED TREE ----------------
print("\n--- Step 2.2: Training and Evaluating a Post-Pruned Tree (max_depth=2) ---")
# Limiting max_depth simplifies the tree by stopping its growth early, which
# helps prevent overfitting and improves generalization.
# NOTE(review): capping max_depth before fitting is usually called pre-pruning;
# the "post-pruned" wording in the messages below may need revisiting.
# random_state is fixed so the fitted tree is the same on every run.
pruned_classifier = DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=1)
pruned_classifier.fit(X_train, y_train)

print("Visualizing the post-pruned Decision Tree...")
plt.figure(figsize=(12, 8))
plot_tree(pruned_classifier, filled=True, feature_names=data.feature_names, class_names=data.target_names)
plt.title("Post-Pruned Decision Tree (max_depth=2)")
plt.show()


# --- Make predictions and evaluate the PRUNED model ---
y_pred_pruned = pruned_classifier.predict(X_test)
print("\n--- Evaluation of the Post-Pruned Model ---")
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments swapped the classification report lists
# precision and recall under each other's heading.
score = accuracy_score(y_test, y_pred_pruned)
print(f"Accuracy Score: {score:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_pruned))
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 3: PRE-PRUNING WITH HYPERPARAMETER TUNING (GRIDSEARCHCV) ---
# ====================================================================================
print("--- Step 3: Finding the Best Model with Pre-Pruning (GridSearchCV) ---")
# Pre-pruning finds the best hyperparameters BEFORE training the final model.

# ---------------- 3.1 DEFINE HYPERPARAMETER GRID ----------------
# 'auto' is no longer an accepted value for max_features in current
# scikit-learn releases (those candidates fail to fit and score NaN), so
# None — use all features — takes its place.
parameter = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5, 6],
    'max_features': ['sqrt', 'log2', None]
}
print("Defined parameter grid for hyperparameter search.")

# ---------------- 3.2 PERFORM GRID SEARCH WITH CROSS-VALIDATION ----------------
# GridSearchCV exhaustively tries all parameter combinations with 5-fold cross-validation.
# The grid includes splitter='random' and feature subsampling, so the estimator
# is seeded; otherwise the winning combination can change from run to run.
clf = DecisionTreeClassifier(random_state=1)
grid_search_model = GridSearchCV(clf, param_grid=parameter, cv=5, scoring='accuracy')
print("\nRunning GridSearchCV... (This may take a moment)")
grid_search_model.fit(X_train, y_train)
print("GridSearchCV complete.")

# ---------------- 3.3 DISPLAY BEST PARAMETERS AND SCORE ----------------
print("\n--- GridSearchCV Results ---")
print("Best Hyperparameters Found:")
print(grid_search_model.best_params_)
print(f"\nBest Cross-Validated Accuracy Score on Training Data: {grid_search_model.best_score_:.4f}")

# ---------------- 3.4 EVALUATE THE BEST MODEL ON THE TEST SET ----------------
print("\n--- Final Evaluation of the Best Model from GridSearchCV on the Test Set ---")
# It's crucial to evaluate the final, tuned model on the unseen test data.
best_model = grid_search_model.best_estimator_
y_pred_best = best_model.predict(X_test)
print(f"Accuracy Score on Test Data: {accuracy_score(y_test, y_pred_best):.4f}")
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_pred_best))
print("\n--- MASTER PIPELINE COMPLETE ---")

# ---------------- END OF SCRIPT ----------------


# DECISION TREE REGRESSOR

In [None]:
# ---------------- MASTER DECISION TREE REGRESSOR PIPELINE ----------------
# This single, unified script combines all the code and concepts from your
# Decision Tree Regression examples into one complete workflow.

# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.metrics import r2_score

# ====================================================================================
# --- STEP 1: LOAD, PREPARE, AND SAMPLE THE DATASET ---
# ====================================================================================
print("--- Step 1: Loading and Preparing the California Housing Dataset ---")

# ---------------- 1.1 LOAD DATASET AND CREATE DATAFRAME ----------------
# Build the feature frame and append the regression target as the last column, 'Price'.
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(Price=data.target)
print(f"Original dataset shape: {df.shape}")

# ---------------- 1.2 SAMPLE DATA ----------------
# Continue with a seeded 20% random subset so the later grid search stays quick.
df = df.sample(frac=0.20, random_state=1)
print(f"Shape after sampling (20%): {df.shape}")

# ---------------- 1.3 SEPARATE FEATURES AND TARGET ----------------
X = df.drop(columns='Price')  # every column except the target
y = df['Price']               # the target (last column of df)
print("\nFeatures (X) and target (y) have been separated.", "\n" + "="*80 + "\n", sep="\n")


# ====================================================================================
# --- STEP 2: INITIAL MODEL TRAINING AND EVALUATION ---
# ====================================================================================
print("--- Step 2: Training and Evaluating an Initial (Untuned) Decision Tree ---")

# ---------------- 2.1 SPLIT DATA INTO TRAINING AND TESTING SETS ----------------
# Seeded 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
print(f"Data split into {X_train.shape[0]} training samples and {X_test.shape[0]} testing samples.")

# ---------------- 2.2 CREATE AND FIT THE INITIAL MODEL ----------------
# Baseline: a tree with default hyperparameters (seeded for reproducibility).
# fit() returns the estimator itself, so construction and training can be chained.
initial_model = DecisionTreeRegressor(random_state=1).fit(X_train, y_train)
print("\nInitial Decision Tree Regressor model has been trained.")

# ---------------- 2.3 PREDICT AND EVALUATE ----------------
# r2_score is not symmetric: ground truth goes first, predictions second.
y_pred_initial = initial_model.predict(X_test)
initial_r2 = r2_score(y_true=y_test, y_pred=y_pred_initial)
print(f"\nInitial R-squared Score: {initial_r2:.4f}")
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 3: HYPERPARAMETER TUNING WITH GRIDSEARCHCV ---
# ====================================================================================
print("--- Step 3: Finding the Best Model with Hyperparameter Tuning (GridSearchCV) ---")

# ---------------- 3.1 DEFINE HYPERPARAMETER GRID ----------------
# NOTE: `max_features='auto'` was deprecated in scikit-learn 1.1 and removed in 1.3.
# For regressors it meant "use all features", which is spelled `None` in every
# version, so `None` replaces it here without changing the search space.
# The 'poisson' criterion requires a non-negative target (true for house prices).
parameter = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5, 6],
    'max_features': [None, 'sqrt', 'log2']
}
print("Defined parameter grid for hyperparameter search.")

# ---------------- 3.2 PERFORM GRID SEARCH WITH CROSS-VALIDATION ----------------
# 3-fold cross-validation over every combination; candidates are ranked by negated
# mean squared error (higher is better, i.e. closer to zero).
regressor = DecisionTreeRegressor(random_state=1)
grid_search_model = GridSearchCV(regressor, param_grid=parameter, cv=3, scoring='neg_mean_squared_error')
print("\nRunning GridSearchCV... (This may take a moment)")
grid_search_model.fit(X_train, y_train)
print("GridSearchCV complete.")

# ---------------- 3.3 DISPLAY BEST PARAMETERS ----------------
print("\n--- GridSearchCV Results ---")
print("Best Hyperparameters Found:")
print(grid_search_model.best_params_)
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 4: FINAL MODEL TRAINING, VISUALIZATION, AND EVALUATION ---
# ====================================================================================
print("--- Step 4: Training and Visualizing the Final, Tuned Model ---")

# ---------------- 4.1 CREATE AND TRAIN THE FINAL MODEL ----------------
# Take the winning estimator straight from the search instead of re-typing its
# parameters by hand. With the default refit=True, GridSearchCV has already refitted
# it on the whole training split using the best hyperparameters (and random_state=1,
# inherited from the base regressor).
# The earlier hard-coded copy passed max_features='auto', which scikit-learn >= 1.3
# rejects, and it could differ from whatever best_params_ actually reported, so the
# plotted tree was not necessarily the model evaluated in step 4.3.
final_model = grid_search_model.best_estimator_
print("Final model with best parameters has been trained.")

# ---------------- 4.2 VISUALIZE THE FINAL TREE ----------------
print("\nVisualizing the final, tuned Decision Tree...")
plt.figure(figsize=(20, 15))
# feature_names is passed as a plain list; some scikit-learn releases validate this
# argument strictly and refuse a pandas Index.
tree.plot_tree(final_model, filled=True, feature_names=list(X.columns), rounded=True)
plt.title("Final Tuned Decision Tree Regressor")
plt.show()


# ---------------- 4.3 FINAL PREDICTION AND EVALUATION ----------------
# Predictions come from the fitted GridSearchCV object, whose predict() delegates to
# the best estimator found during the search.
y_pred_final = grid_search_model.predict(X_test)
# Ground truth first, predictions second.
final_r2 = r2_score(y_true=y_test, y_pred=y_pred_final)
print(f"\nFinal R-squared Score from Best Model: {final_r2:.4f}", "\n--- MASTER PIPELINE COMPLETE ---", sep="\n")

# ---------------- END OF SCRIPT ----------------


# SUPPORT VECTOR CLASSIFIER

In [None]:
# ---------------- MASTER SVM CLASSIFIER PIPELINE ----------------
# This single, unified script combines all the code and concepts from your
# Support Vector Machine (SVM) example into one complete workflow.

# Import required libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# ====================================================================================
# --- STEP 1: GENERATE AND VISUALIZE THE DATASET ---
# ====================================================================================
print("--- Step 1: Generating and Visualizing Synthetic Classification Data ---")

# ---------------- 1.1 GENERATE SYNTHETIC DATA ----------------
# Two-feature, two-class problem with two clusters per class and no redundant
# features; the seed makes the generated points repeatable.
synthetic_settings = dict(
    n_samples=1000,
    n_features=2,
    n_classes=2,
    n_clusters_per_class=2,
    n_redundant=0,
    random_state=42,
)
X, y = make_classification(**synthetic_settings)
print("Synthetic dataset with 1000 samples and 2 features has been generated.")

# ---------------- 1.2 VISUALIZE THE DATA ----------------
# Scatter the two features against each other, coloured by class label.
print("\nVisualizing the data...")
feature_frame = pd.DataFrame(X)  # columns are labelled 0 and 1
plt.figure(figsize=(10, 6))
sns.scatterplot(x=feature_frame[0], y=feature_frame[1], hue=y)
plt.title("Scatter Plot of Synthetic Data")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.grid(True)
plt.show()

print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 2: INITIAL MODEL TRAINING AND EVALUATION ---
# ====================================================================================
print("--- Step 2: Training and Evaluating an Initial (Untuned) SVM Classifier ---")

# ---------------- 2.1 SPLIT DATA INTO TRAINING AND TESTING SETS ----------------
# Seeded 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)
print(f"Data split into {X_train.shape[0]} training samples and {X_test.shape[0]} testing samples.")

# ---------------- 2.2 CREATE AND FIT THE INITIAL MODEL ----------------
# Baseline: linear-kernel SVC, i.e. a straight-line decision boundary in this 2-D
# feature space. coef_ is only exposed for the linear kernel.
initial_classifier = SVC(kernel='linear').fit(X_train, y_train)
print("\nInitial SVM model with linear kernel has been trained.")
print("Learned Coefficients (weights):", initial_classifier.coef_)

# ---------------- 2.3 PREDICT AND EVALUATE ----------------
y_pred_initial = initial_classifier.predict(X_test)
print(
    "\n--- Initial Model Evaluation ---",
    "Confusion Matrix:",
    confusion_matrix(y_test, y_pred_initial),
    sep="\n",
)
print("\nAccuracy Score:", accuracy_score(y_test, y_pred_initial))
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred_initial),
    "\n" + "="*80 + "\n",
    sep="\n",
)


# ====================================================================================
# --- STEP 3: HYPERPARAMETER TUNING WITH GRIDSEARCHCV ---
# ====================================================================================
print("--- Step 3: Finding the Best Model with Hyperparameter Tuning (GridSearchCV) ---")

# ---------------- 3.1 DEFINE HYPERPARAMETER GRID ----------------
# Candidate values for the regularisation strength (C), the RBF width (gamma) and
# the kernel itself. gamma has no effect on the 'linear' kernel, so those
# combinations repeat the same model.
params = {
    'C': [0.1, 1, 10, 50, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf'],
}
print("Defined parameter grid for hyperparameter search.")

# ---------------- 3.2 PERFORM GRID SEARCH WITH CROSS-VALIDATION ----------------
# Exhaustive 5-fold search; verbose=3 logs every individual fit and its score.
grid_search = GridSearchCV(estimator=SVC(), param_grid=params, cv=5, verbose=3)
print("\nRunning GridSearchCV... (This may take a moment)")
grid_search.fit(X_train, y_train)
print("GridSearchCV complete.")

# ---------------- 3.3 DISPLAY BEST PARAMETERS AND SCORE ----------------
print(
    "\n--- GridSearchCV Results ---",
    "Best Hyperparameters Found:",
    grid_search.best_params_,
    sep="\n",
)
print(f"\nBest Cross-Validated Accuracy Score: {grid_search.best_score_:.4f}")
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 4: FINAL EVALUATION OF THE BEST MODEL ---
# ====================================================================================
print("--- Step 4: Final Evaluation Using the Best Model from GridSearchCV ---")

# ---------------- 4.1 PREDICT USING THE BEST MODEL ----------------
# predict() on the fitted search object is routed to its refitted best estimator.
y_pred_final = grid_search.predict(X_test)
print("Predictions made using the best model found by GridSearchCV.")

# ---------------- 4.2 FINAL EVALUATION METRICS ----------------
# Same three reports as for the baseline so the two models can be compared directly.
print(
    "\n--- Final Tuned Model Evaluation ---",
    "Confusion Matrix:",
    confusion_matrix(y_test, y_pred_final),
    sep="\n",
)
print("\nAccuracy Score:", accuracy_score(y_test, y_pred_final))
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred_final),
    "\n--- MASTER PIPELINE COMPLETE ---",
    sep="\n",
)

# ---------------- END OF SCRIPT ----------------


# SUPPORT VECTOR REGRESSOR

In [None]:
# ---------------- MASTER SUPPORT VECTOR REGRESSOR (SVR) PIPELINE ----------------
# This single, unified script combines all the code and concepts from your
# Support Vector Regressor (SVR) example into one complete workflow.

# Import required libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import r2_score

# ====================================================================================
# --- STEP 1: GENERATE AND VISUALIZE THE DATASET ---
# ====================================================================================
print("--- Step 1: Generating and Visualizing Synthetic Regression Data ---")

# ---------------- 1.1 GENERATE SYNTHETIC DATA ----------------
# Two features, one target, Gaussian noise with standard deviation 3.0; seeded so
# the generated points are repeatable.
synthetic_settings = dict(
    n_samples=1000,
    n_features=2,
    n_targets=1,
    noise=3.0,
    random_state=42,
)
X, y = make_regression(**synthetic_settings)
print("Synthetic dataset with 1000 samples and 2 features has been generated.")

# ---------------- 1.2 VISUALIZE THE DATA ----------------
# Plot feature 0 against feature 1 and encode the continuous target as colour.
print("\nVisualizing the data...")
feature_frame = pd.DataFrame(X)  # columns are labelled 0 and 1
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x=feature_frame[0],
    y=feature_frame[1],
    hue=y,
    palette='viridis',  # sequential palette suits a continuous target
)
plt.title("Scatter Plot of Synthetic Regression Data")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.grid(True)
plt.show()

print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 2: INITIAL MODEL TRAINING AND EVALUATION ---
# ====================================================================================
print("--- Step 2: Training and Evaluating an Initial (Untuned) SVR ---")

# ---------------- 2.1 SPLIT DATA INTO TRAINING AND TESTING SETS ----------------
# Seeded 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)
print(f"Data split into {X_train.shape[0]} training samples and {X_test.shape[0]} testing samples.")

# ---------------- 2.2 CREATE AND FIT THE INITIAL MODEL ----------------
# Baseline: linear-kernel SVR with default C and epsilon. coef_ is only exposed for
# the linear kernel.
initial_svr = SVR(kernel='linear').fit(X_train, y_train)
print("\nInitial SVR model with linear kernel has been trained.")
print("Learned Coefficients (weights):", initial_svr.coef_)

# ---------------- 2.3 PREDICT AND EVALUATE ----------------
# Ground truth first, predictions second.
y_pred_initial = initial_svr.predict(X_test)
initial_r2 = r2_score(y_true=y_test, y_pred=y_pred_initial)
print(f"\nInitial R-squared Score: {initial_r2:.4f}", "\n" + "="*80 + "\n", sep="\n")


# ====================================================================================
# --- STEP 3: HYPERPARAMETER TUNING WITH GRIDSEARCHCV ---
# ====================================================================================
print("--- Step 3: Finding the Best Model with Hyperparameter Tuning (GridSearchCV) ---")

# ---------------- 3.1 DEFINE HYPERPARAMETER GRID ----------------
# Candidate values for C and the epsilon-tube width. Only the linear kernel is
# searched, so the gamma values do not influence the fitted models; they only
# multiply the number of (identical) candidates.
params = {
    'C': [0.1, 1, 10, 50, 100],
    'gamma': [1, 0.1, 0.001],
    'kernel': ['linear'],
    'epsilon': [0.01, 0.1, 0.2, 0.3],
}
print("Defined parameter grid for hyperparameter search.")

# ---------------- 3.2 PERFORM GRID SEARCH WITH CROSS-VALIDATION ----------------
# Exhaustive 5-fold search; verbose=3 logs every individual fit and its score.
# No `scoring` is given, so candidates are ranked by SVR.score, i.e. R-squared.
grid_search = GridSearchCV(estimator=SVR(), param_grid=params, cv=5, verbose=3)
print("\nRunning GridSearchCV... (This may take a moment)")
grid_search.fit(X_train, y_train)
print("GridSearchCV complete.")

# ---------------- 3.3 DISPLAY BEST PARAMETERS AND SCORE ----------------
print(
    "\n--- GridSearchCV Results ---",
    "Best Hyperparameters Found:",
    grid_search.best_params_,
    sep="\n",
)
print(f"\nBest Cross-Validated R2 Score: {grid_search.best_score_:.4f}")
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 4: FINAL EVALUATION OF THE BEST MODEL ---
# ====================================================================================
print("--- Step 4: Final Evaluation Using the Best Model from GridSearchCV ---")

# ---------------- 4.1 PREDICT USING THE BEST MODEL ----------------
# predict() on the fitted search object is routed to its refitted best estimator.
y_pred_final = grid_search.predict(X_test)
print("Predictions made using the best model found by GridSearchCV.")

# ---------------- 4.2 FINAL EVALUATION METRICS ----------------
# R-squared on the held-out split; ground truth first, predictions second.
final_r2 = r2_score(y_true=y_test, y_pred=y_pred_final)
print(f"\nFinal R-squared Score on Test Data: {final_r2:.4f}", "\n--- MASTER PIPELINE COMPLETE ---", sep="\n")

# ---------------- END OF SCRIPT ----------------


# SVM KERNEL

In [None]:
# ---------------- SVM NON-LINEAR DATA PREPARATION PIPELINE ----------------
# This script contains the exact code you provided for generating non-linear data,
# engineering new features to make it separable, and visualizing the results.

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split

# ====================================================================================
# --- STEP 1: GENERATE AND VISUALIZE NON-LINEAR DATA (CONCENTRIC CIRCLES) ---
# ====================================================================================
print("--- Step 1: Generating and Visualizing Non-Linear Classification Data ---")

# ---------------- 1.1 CREATE DATA FOR TWO CIRCLES ----------------
# Each curve is built from its upper half, y = sqrt(r**2 - x**2), and then mirrored
# through the origin ((x, y) -> (-x, -y)) to obtain the lower half, giving 200 points.

# Outer curve: arcs of the radius-10 circle sampled over x in [-6, 6].
x_outer = np.linspace(-6.0, 6.0, 100)
y_outer = np.sqrt(10**2 - x_outer**2)
y_outer = np.hstack([y_outer, -y_outer])
x_outer = np.hstack([x_outer, -x_outer])

# Inner curve: the full radius-4 circle. x must stay within [-4, 4]: sampling past
# the radius makes 4**2 - x**2 negative, so np.sqrt emits a RuntimeWarning and
# returns NaN, and those points would silently disappear in the later dropna().
x_inner = np.linspace(-4.0, 4.0, 100)
y_inner = np.sqrt(4**2 - x_inner**2)
y_inner = np.hstack([y_inner, -y_inner])
x_inner = np.hstack([x_inner, -x_inner])

# ---------------- 1.2 PLOT THE CIRCLES ----------------
# Draw both point sets on one square figure; each scatter call gets its own legend
# entry (and its own default colour, in call order).
print("\nVisualizing the original 2D data...")
plt.figure(figsize=(8, 8))
for horizontal, vertical, legend_label in (
    (y_outer, x_outer, 'Circle radius 10'),
    (y_inner, x_inner, 'Circle radius 4'),
):
    plt.scatter(horizontal, vertical, label=legend_label)
plt.title("Original Non-Linearly Separable Data")
plt.legend()
plt.grid(True)
plt.show()

print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 2: PREPARE DATAFRAME AND ENGINEER NEW FEATURES ---
# ====================================================================================
print("--- Step 2: Preparing DataFrame and Engineering Non-Linear Features ---")

# ---------------- 2.1 CREATE DATAFRAMES FOR EACH CLASS ----------------
# Note the column convention used throughout this cell: X1 holds the y-coordinates
# and X2 the x-coordinates of each point.
# Outer circle -> class 0
df1 = pd.DataFrame({'X1': y_outer, 'X2': x_outer}).assign(Y=0)

# Inner circle -> class 1
df2 = pd.DataFrame({'X1': y_inner, 'X2': x_inner}).assign(Y=1)

# Stack both classes into one frame with a fresh 0..n-1 index.
df = pd.concat([df1, df2], ignore_index=True)
print("Combined DataFrame created. Head:")
print(df.head(5))

# ---------------- 2.2 CREATE NEW FEATURES (NON-LINEAR TRANSFORMATION) ----------------
# Work on a separate frame (df stays untouched), discard rows with missing
# coordinates, then add the degree-2 terms: both squares and the cross product.
# The name df1 is reused from here on for the engineered frame.
df1 = df.dropna().assign(
    x1square=lambda frame: frame["X1"]**2,
    x2square=lambda frame: frame["X2"]**2,
    x1x2=lambda frame: frame["X1"] * frame["X2"],
)
print("\nDataFrame with new polynomial features:")
print(df1.head())
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 3: SPLIT DATA AND VISUALIZE THE TRANSFORMED FEATURES ---
# ====================================================================================
print("--- Step 3: Splitting Data and Visualizing in 3D ---")

# ---------------- 3.1 DEFINE FEATURES, TARGET, AND SPLIT ----------------
# Only the engineered degree-2 columns are used as model inputs; 'Y' is the label.
engineered_columns = ["x1square", "x2square", "x1x2"]
X = df1[engineered_columns]
y = df1['Y']

# Seeded 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)
print("Data has been split into training and testing sets. Training features head:")
print(X_train.head())

# ---------------- 3.2 VISUALIZE THE TRANSFORMED FEATURES IN 3D ----------------
# Plot the three engineered features against each other, coloured by class, to
# inspect whether the classes separate in the transformed space.
print("\nDisplaying the 3D plot of the transformed features...")
fig = px.scatter_3d(df1, x="x1square", y="x2square", z="x1x2", color="Y")
fig.update_layout(title="Data in Transformed 3D Feature Space")
fig.show()
print(
    "\nThis concludes the data preparation and visualization steps from your code.",
    "\n--- PIPELINE COMPLETE ---",
    sep="\n",
)

# ---------------- END OF SCRIPT ----------------


# Gaussian Naive Bayes

In [None]:
# ---------------- GAUSSIAN NAIVE BAYES PIPELINE ----------------
# This script contains the exact code you provided for implementing a Gaussian
# Naive Bayes classifier on the Iris dataset.

# Import required libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import pandas as pd

# ====================================================================================
# --- STEP 1: LOAD THE IRIS DATASET ---
# ====================================================================================
print("--- Step 1: Loading the Iris Dataset ---")

# return_X_y=True yields the feature matrix and the label vector as a tuple.
X, y = load_iris(return_X_y=True)

print(
    "Dataset loaded successfully.",
    f"Shape of features (X): {X.shape}",
    f"Shape of target (y): {y.shape}",
    "\n" + "="*80 + "\n",
    sep="\n",
)


# ====================================================================================
# --- STEP 2: SPLIT DATA INTO TRAINING AND TESTING SETS ---
# ====================================================================================
print("--- Step 2: Splitting Data into Training and Testing Sets ---")

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

print(f"Data split into {X_train.shape[0]} training samples and {X_test.shape[0]} testing samples.")
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 3: TRAIN THE GAUSSIAN NAIVE BAYES MODEL ---
# ====================================================================================
print("--- Step 3: Training the Gaussian Naive Bayes Classifier ---")

# fit() returns the estimator, so the classifier is built and trained in one step.
clf = GaussianNB().fit(X_train, y_train)

print("Model has been trained successfully.", "\n" + "="*80 + "\n", sep="\n")


# ====================================================================================
# --- STEP 4: MAKE PREDICTIONS ON THE TEST SET ---
# ====================================================================================
print("--- Step 4: Making Predictions on the Test Data ---")

# Predicted class labels for the held-out rows.
y_pred = clf.predict(X_test)

print(
    "Predictions have been generated. Displaying the predicted labels for the test set:",
    y_pred,
    "\nThis concludes the steps from the code you provided.",
    "\n--- PIPELINE COMPLETE ---",
    sep="\n",
)

# ---------------- END OF SCRIPT ----------------


# Ensemble Custom Bagging Classifier and Regressor

In [None]:
# ---------------- ENSEMBLE LEARNING PIPELINE (CLASSIFIER & REGRESSOR) ----------------
# This single, unified script combines all the code you provided for both
# ensemble classification (VotingClassifier) and ensemble regression (VotingRegressor).

# Import required libraries
import pandas as pd
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.metrics import accuracy_score

# ====================================================================================
# --- PART 1: ENSEMBLE CUSTOM BAGGING CLASSIFIER ---
# ====================================================================================
print("--- Part 1: Building and Evaluating an Ensemble Classifier ---")

# ---------------- 1.1 GENERATE AND SPLIT CLASSIFICATION DATA ----------------
# 1000 samples, 20 features, binary target; seeded 80/20 split.
print("\nStep 1.1: Generating synthetic classification data...")
X_clf, y_clf = make_classification(
    n_samples=1000,
    n_features=20,
    n_classes=2,
    random_state=1,
)
X_clf_train, X_clf_test, y_clf_train, y_clf_test = train_test_split(
    X_clf,
    y_clf,
    test_size=0.2,
    random_state=1,
)
print(f"Classification data split into {X_clf_train.shape[0]} training and {X_clf_test.shape[0]} testing samples.")

# ---------------- 1.2 INITIALIZE BASE CLASSIFIERS ----------------
# Four different model families; every estimator that accepts a seed is seeded.
print("\nStep 1.2: Initializing base models for the ensemble...")
svm_clf = SVC(kernel="linear", random_state=1)
dt_clf = DecisionTreeClassifier(random_state=1)
lr_clf = LogisticRegression(random_state=1)
nb_clf = GaussianNB()
print("Base classifiers (Naive Bayes, Logistic Regression, Decision Tree, SVM) created.")

# ---------------- 1.3 CREATE AND TRAIN THE ENSEMBLE CLASSIFIER ----------------
print("\nStep 1.3: Creating and training the VotingClassifier...")
# With voting="hard" the ensemble predicts the class chosen by the majority of its
# members.
voting_members = [
    ('decision_tree', dt_clf),
    ('naive_bayes', nb_clf),
    ('log_reg', lr_clf),
    ('svm', svm_clf),
]
ensemble_clf = VotingClassifier(estimators=voting_members, voting="hard")
# Fitting the ensemble fits each member on the training split.
ensemble_clf.fit(X_clf_train, y_clf_train)
print("Ensemble classifier trained successfully.")

# ---------------- 1.4 EVALUATE THE ENSEMBLE CLASSIFIER ----------------
# Accuracy of the majority vote on the held-out split.
print("\nStep 1.4: Evaluating the ensemble classifier on the test data...")
y_pred_clf = ensemble_clf.predict(X_clf_test)
accuracy = accuracy_score(y_clf_test, y_pred_clf)
print(f"Accuracy of the Ensemble Classifier: {accuracy:.4f}", "\n" + "="*80 + "\n", sep="\n")


# ====================================================================================
# --- PART 2: ENSEMBLE CUSTOM BAGGING REGRESSOR ---
# ====================================================================================
print("--- Part 2: Building an Ensemble Regressor ---")

# ---------------- 2.1 GENERATE AND SPLIT REGRESSION DATA ----------------
# 1000 samples, 10 features, light noise; seeded 80/20 split.
print("\nStep 2.1: Generating synthetic regression data...")
X_reg, y_reg = make_regression(
    n_samples=1000,
    n_features=10,
    noise=0.1,
    random_state=42,
)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=1,
)
print(f"Regression data split into {X_reg_train.shape[0]} training and {X_reg_test.shape[0]} testing samples.")

# ---------------- 2.2 INITIALIZE BASE REGRESSORS ----------------
print("\nStep 2.2: Initializing base models for the ensemble...")
svr_reg = SVR(kernel="linear")
dtr_reg = DecisionTreeRegressor(random_state=1)
lr_reg = LinearRegression()
print("Base regressors (Linear Regression, Decision Tree, SVR) created.")

# ---------------- 2.3 CREATE AND TRAIN THE ENSEMBLE REGRESSOR ----------------
print("\nStep 2.3: Creating and training the VotingRegressor...")
# The VotingRegressor's prediction is the average of its members' predictions.
averaging_members = [
    ('mlr', lr_reg),
    ("dtr", dtr_reg),
    ("svr", svr_reg),
]
ensemble_regressor = VotingRegressor(estimators=averaging_members)
# Fitting the ensemble fits each member on the training split.
ensemble_regressor.fit(X_reg_train, y_reg_train)
print("Ensemble regressor trained successfully.")
# The regressor is not evaluated in this cell; the message below points to the next step.
print("\nNote: Further steps would involve making predictions with 'ensemble_regressor.predict(X_reg_test)' and evaluating with a regression metric like R2 score.")
print("\n--- MASTER PIPELINE COMPLETE ---")

# ---------------- END OF SCRIPT ----------------


# Multiple models with Pipeline and ColumnTransformer

In [None]:
# ---------------- AUTOMATED FEATURE ENGINEERING & MODEL EVALUATION PIPELINE ----------------
# This single, unified script combines all the code you provided for using Pipeline
# and ColumnTransformer to preprocess data and evaluate multiple models.

# Import required libraries
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# ====================================================================================
# --- STEP 1: LOAD AND PREPARE THE DATA ---
# ====================================================================================
print("--- Step 1: Loading and Preparing the 'tips' Dataset ---")

# ---------------- 1.1 LOAD DATA AND ENCODE TARGET VARIABLE ----------------
# Load the 'tips' dataset from seaborn
df = sns.load_dataset("tips")
print("Original Data Head:")
print(df.head())

# The target variable 'time' is categorical ('Lunch', 'Dinner').
# LabelEncoder converts it to integer codes. The codes follow the SORTED class
# labels stored in encoder.classes_, so 'Dinner' becomes 0 and 'Lunch' becomes 1.
encoder = LabelEncoder()
df['time'] = encoder.fit_transform(df['time'])
# Build the "label=code" description from the fitted encoder rather than hard-coding
# it, so the printed mapping can never disagree with the actual encoding.
label_mapping = ", ".join(f"{label}={code}" for code, label in enumerate(encoder.classes_))
print(f"\n'time' column after Label Encoding ({label_mapping}):")
print(df['time'].unique())

# ---------------- 1.2 SEPARATE FEATURES AND TARGET & SPLIT DATA ----------------
# 'time' is the label; every other column is a feature. Seeded 80/20 split.
y = df['time']
X = df.drop(columns='time')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)
print(f"\nData split into {X_train.shape[0]} training and {X_test.shape[0]} testing samples.")
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 2: CREATE THE PREPROCESSING PIPELINE ---
# ====================================================================================
print("--- Step 2: Building an Automated Preprocessing Pipeline ---")

# ---------------- 2.1 DEFINE COLUMN TYPES ----------------
# Identify which columns are categorical and which are numerical
cat_cols = ["sex", "smoker", "day"]
num_cols = ["total_bill", "tip", "size"]
print(f"Categorical columns identified: {cat_cols}")
print(f"Numerical columns identified: {num_cols}")

# ---------------- 2.2 CREATE PIPELINE FOR NUMERICAL FEATURES ----------------
# This pipeline will first fill any missing values with the median,
# then scale the data to have a mean of 0 and a standard deviation of 1.
num_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer(strategy="median")),
    ('scaling', StandardScaler())
])
print("\nNumerical pipeline created (Imputation -> Scaling).")

# ---------------- 2.3 CREATE PIPELINE FOR CATEGORICAL FEATURES ----------------
# This pipeline will first fill missing values with the most frequent category,
# then convert the categories into a numerical format using one-hot encoding.
# handle_unknown="ignore": a category that appears only outside the training split
# is encoded as all zeros instead of raising an error at transform time. Categories
# seen during fit are encoded exactly as before.
cat_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer(strategy="most_frequent")),
    ('encoding', OneHotEncoder(handle_unknown="ignore"))
])
print("Categorical pipeline created (Imputation -> One-Hot Encoding).")

# ---------------- 2.4 COMBINE PIPELINES WITH COLUMNTRANSFORMER ----------------
# ColumnTransformer applies the correct pipeline to the correct set of columns.
# Columns not listed in either group are dropped (ColumnTransformer's default).
preprocessor = ColumnTransformer([
    ("num_pipeline", num_pipeline, num_cols),
    ("cat_pipeline", cat_pipeline, cat_cols)
])
print("\nPreprocessor created to apply pipelines to the correct columns.")
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 3: APPLY THE PREPROCESSOR TO THE DATA ---
# ====================================================================================
print("--- Step 3: Applying the Preprocessing Pipeline to the Data ---")

# Training split: learn the imputation values, scaling statistics and category
# vocabulary, and transform in the same call.
X_train_processed = preprocessor.fit_transform(X_train)

# Test split: transform only, reusing what was learned from the training split so
# no information from the test rows leaks into the preprocessing.
X_test_processed = preprocessor.transform(X_test)

print(
    "Training and testing data have been successfully transformed.",
    f"Shape of processed training data: {X_train_processed.shape}",
    f"Shape of processed testing data: {X_test_processed.shape}",
    "\n" + "="*80 + "\n",
    sep="\n",
)


# ====================================================================================
# --- STEP 4: TRAIN AND EVALUATE MULTIPLE MODELS ---
# ====================================================================================
print("--- Step 4: Training and Evaluating Multiple Classifiers ---")

# ---------------- 4.1 DEFINE MODELS AND EVALUATION FUNCTION ----------------
# Display name -> unfitted estimator. The names become the keys of the score report.
# The decision tree is seeded: an unseeded tree breaks ties between equally good
# splits at random, so its reported accuracy could change from run to run.
models = {
    "support vector classifier": SVC(),
    "DT classifier": DecisionTreeClassifier(random_state=1),
    "logistic regression": LogisticRegression()
}

# Train every candidate model on the same split and collect its test accuracy.
def model_train_eval(X_train, y_train, X_test, y_test, models):
    """Fit each estimator in ``models`` and score it on the held-out data.

    Args:
        X_train, y_train: Features and labels used for fitting.
        X_test, y_test: Features and labels used for scoring.
        models: Mapping of display name -> estimator. Each estimator is fitted
            in place, in the mapping's iteration order.

    Returns:
        dict mapping each display name to the accuracy obtained on the test data.
    """
    def _fit_and_score(estimator):
        # Fit first, then compare the predictions with the true test labels.
        estimator.fit(X_train, y_train)
        return accuracy_score(y_test, estimator.predict(X_test))

    return {label: _fit_and_score(estimator) for label, estimator in models.items()}

# ---------------- 4.2 RUN THE EVALUATION ----------------
# Fit every estimator in `models` on the processed training data and collect
# its score on the processed test data as {model name: score}.
model_scores = model_train_eval(X_train_processed, y_train, X_test_processed, y_test, models)
print("--- Model Evaluation Results (Accuracy) ---")
# Report each score to four decimal places, in the order the models were defined.
for name, score in model_scores.items():
    print(f"{name}: {score:.4f}")

print("\n--- MASTER PIPELINE COMPLETE ---")
# ---------------- END OF SCRIPT ----------------


# OOB SCORE

In [None]:

# ====================================
# Random Forest Classifier with OOB
# ====================================

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Synthetic binary-classification data: 1000 rows x 20 features, seeded so the
# same samples are generated on every run.
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

# Build a 100-tree forest and fit it in one expression (fit returns the estimator).
# oob_score=True makes the forest score each sample using only the trees whose
# bootstrap sample did not contain it, giving a generalization estimate without
# a separate validation split.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    random_state=42,
).fit(X, y)

# The out-of-bag accuracy is exposed on the fitted estimator.
oob_score = rf_classifier.oob_score_
print("Out-of-Bag Score:", oob_score)

# RANDOM FOREST CLASSIFIER

In [None]:
# ---------------- AUTOMATED FEATURE ENGINEERING & MODEL EVALUATION PIPELINE ----------------
# This single, unified script uses Pipeline and ColumnTransformer to preprocess data
# and then trains, evaluates, and tunes a Random Forest Classifier.

# Import required libraries
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# ====================================================================================
# --- STEP 1: LOAD AND PREPARE THE DATA ---
# ====================================================================================
print("--- Step 1: Loading and Preparing the 'tips' Dataset ---")

# ---------------- 1.1 LOAD DATA AND ENCODE TARGET VARIABLE ----------------
df = sns.load_dataset("tips")
print("Original Data Head:")
print(df.head())

# Use LabelEncoder to convert the target 'time' to integer codes.
# LabelEncoder numbers the classes in sorted label order, so the mapping is read
# back from encoder.classes_ rather than hard-coded (a hard-coded
# "Lunch=0, Dinner=1" would be the reverse of what the encoder produces).
encoder = LabelEncoder()
df['time'] = encoder.fit_transform(df['time'])
mapping_text = ", ".join(f"{label}={code}" for code, label in enumerate(encoder.classes_))
print(f"\n'time' column after Label Encoding ({mapping_text}).")

# ---------------- 1.2 SEPARATE FEATURES AND TARGET & SPLIT DATA ----------------
# Every column except the encoded 'time' is a feature; 20% of rows are held out
# for testing with a fixed seed so the split is reproducible.
X = df.drop('time', axis=1)
y = df['time']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
print(f"\nData split into {X_train.shape[0]} training and {X_test.shape[0]} testing samples.")
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 2: CREATE THE PREPROCESSING PIPELINE ---
# ====================================================================================
print("--- Step 2: Building an Automated Preprocessing Pipeline ---")

# ---------------- 2.1 DEFINE COLUMN TYPES ----------------
# Feature columns only: the target 'time' was removed from X in Step 1.
cat_cols = ["sex", "smoker", "day"]
num_cols = ["total_bill", "tip", "size"]
print(f"Categorical columns: {cat_cols}")
print(f"Numerical columns: {num_cols}")

# ---------------- 2.2 CREATE SUB-PIPELINES ----------------
# Numerical columns: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer(strategy="median")),
    ('scaling', StandardScaler())
])
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode.
# NOTE(review): OneHotEncoder() defaults to handle_unknown='error', so a category
# that never appears in the training split would raise at transform time.
cat_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer(strategy="most_frequent")),
    ('encoding', OneHotEncoder())
])

# ---------------- 2.3 COMBINE PIPELINES WITH COLUMNTRANSFORMER ----------------
# Route each column group to its sub-pipeline and concatenate the outputs.
preprocessor = ColumnTransformer([
    ("num_pipeline", num_pipeline, num_cols),
    ("cat_pipeline", cat_pipeline, cat_cols)
])
print("\nPreprocessor created to automate feature engineering.")
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 3: APPLY THE PREPROCESSOR AND TRAIN MULTIPLE MODELS ---
# ====================================================================================
print("--- Step 3: Preprocessing Data and Evaluating Multiple Models ---")

# ---------------- 3.1 APPLY THE PREPROCESSOR ----------------
# Learn imputation/scaling/encoding from the training split only, then reuse
# the fitted transformer on the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print("Training and testing data have been successfully transformed.")

# ---------------- 3.2 DEFINE MODELS AND EVALUATION FUNCTION ----------------
# Display name -> unfitted estimator. The tree and the forest are stochastic,
# so they are seeded (same seed as the split and the tuned forest in Step 4)
# to keep the reported accuracies reproducible across runs.
models = {
    "support vector classifier": SVC(),
    "DT classifier": DecisionTreeClassifier(random_state=1),
    "Logistic regression": LogisticRegression(),
    "Random_forest": RandomForestClassifier(random_state=1)
}

def model_train_eval(X_train, y_train, X_test, y_test, models, metric=None):
    """Fit each classifier on the training data and score its test predictions.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``model.fit``.
    X_test, y_test
        Held-out features passed to ``model.predict`` and the true labels
        the predictions are compared against.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.
    metric : callable, optional
        Called as ``metric(y_test, y_pred)``. Defaults to ``accuracy_score``,
        so existing five-argument calls behave exactly as before.

    Returns
    -------
    dict
        ``{name: score}`` in the iteration order of ``models``.
    """
    if metric is None:
        # Looked up only when needed so a custom metric works on its own.
        metric = accuracy_score
    evaluation = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        evaluation[name] = metric(y_test, y_pred)
    return evaluation

# ---------------- 3.3 RUN THE INITIAL EVALUATION ----------------
# Baseline: fit every estimator in `models` with its constructor settings and
# collect its score on the processed test split.
model_scores = model_train_eval(X_train_processed, y_train, X_test_processed, y_test, models)
print("\n--- Initial Model Evaluation Results (Accuracy) ---")
for name, score in model_scores.items():
    print(f"{name}: {score:.4f}")
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 4: HYPERPARAMETER TUNING FOR RANDOM FOREST ---
# ====================================================================================
print("--- Step 4: Hyperparameter Tuning for Random Forest Classifier ---")

# ---------------- 4.1 DEFINE HYPERPARAMETER GRID ----------------
# A fresh, seeded forest is tuned (not the one fitted inside `models`).
# The grid holds 6 x 4 x 2 = 48 combinations; max_depth=None means unlimited depth.
rf = RandomForestClassifier(random_state=1)
params = {
    'max_depth': [1, 2, 3, 5, 10, None],
    'n_estimators': [50, 100, 200, 300],
    'criterion': ['gini', 'entropy']
}
print("Defined parameter grid for hyperparameter search.")

# ---------------- 4.2 PERFORM RANDOMIZED SEARCH WITH CROSS-VALIDATION ----------------
# n_iter is left at its default, so only a random subset of the combinations
# is tried; each one is scored by 5-fold cross-validated accuracy.
# NOTE(review): the search runs on data already transformed by a preprocessor
# fitted on the whole training split, so each CV fold sees scaling/imputation
# statistics computed from its own validation rows.
clf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=params,
    cv=5,
    verbose=3,
    scoring='accuracy',
    random_state=1 # For reproducibility of the search
)
print("\nRunning RandomizedSearchCV... (This may take a moment)")
clf.fit(X_train_processed, y_train)

# ---------------- 4.3 DISPLAY BEST PARAMETERS AND SCORE ----------------
# best_score_ is the mean cross-validated accuracy on the training split;
# the held-out test split is not used in this step.
print("\n--- RandomizedSearchCV Results ---")
print("Best Hyperparameters Found:")
print(clf.best_params_)
print(f"\nBest Cross-Validated Accuracy Score: {clf.best_score_:.4f}")
print("\n--- MASTER PIPELINE COMPLETE ---")
# ---------------- END OF SCRIPT ----------------



# Multiple models with Pipeline and ColumnTransformer

In [None]:
# ---------------- AUTOMATED REGRESSION & MODEL EVALUATION PIPELINE ----------------
# This single, unified script uses Pipeline and ColumnTransformer to preprocess data
# and then trains, evaluates, and tunes a Random Forest Regressor.

# Import required libraries
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# ====================================================================================
# --- STEP 1: LOAD AND PREPARE THE DATA FOR REGRESSION ---
# ====================================================================================
print("--- Step 1: Loading and Preparing the 'tips' Dataset for a Regression Task ---")

# ---------------- 1.1 LOAD DATA AND DEFINE TARGET ----------------
# Reload the dataset so this cell does not depend on changes made to `df`
# by earlier cells.
df = sns.load_dataset("tips")
print("Original Data Head:")
print(df.head())

# For this regression task, the target variable is 'total_bill'.
X = df.drop('total_bill', axis=1)  # Features are all other columns
y = df['total_bill']               # Target variable

# ---------------- 1.2 SPLIT DATA ----------------
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
print(f"\nData split into {X_train.shape[0]} training and {X_test.shape[0]} testing samples.")
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 2: CREATE THE PREPROCESSING PIPELINE ---
# ====================================================================================
print("--- Step 2: Building an Automated Preprocessing Pipeline ---")

# ---------------- 2.1 DEFINE COLUMN TYPES ----------------
# Identify which columns are categorical and which are numerical.
# 'total_bill' is the target here, so it is absent from num_cols.
cat_cols = ["sex", "smoker", "day", "time"] # 'time' is now a feature
num_cols = ["tip", "size"]
print(f"Categorical columns: {cat_cols}")
print(f"Numerical columns: {num_cols}")

# ---------------- 2.2 CREATE SUB-PIPELINES ----------------
# Numerical columns: median imputation followed by standardization.
num_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer(strategy="median")),
    ('scaling', StandardScaler())
])
# Categorical columns: most-frequent imputation followed by one-hot encoding.
# NOTE(review): OneHotEncoder() defaults to handle_unknown='error', so a category
# that never appears in the training split would raise at transform time.
cat_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer(strategy="most_frequent")),
    ('encoding', OneHotEncoder())
])

# ---------------- 2.3 COMBINE PIPELINES WITH COLUMNTRANSFORMER ----------------
# Route each column group to its sub-pipeline and concatenate the outputs.
preprocessor = ColumnTransformer([
    ("num_pipeline", num_pipeline, num_cols),
    ("cat_pipeline", cat_pipeline, cat_cols)
])
print("\nPreprocessor created to automate feature engineering for regression.")
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 3: APPLY PREPROCESSOR AND EVALUATE MULTIPLE MODELS ---
# ====================================================================================
print("--- Step 3: Preprocessing Data and Evaluating Multiple Regression Models ---")

# ---------------- 3.1 APPLY THE PREPROCESSOR ----------------
# Learn imputation/scaling/encoding from the training split only, then reuse
# the fitted transformer on the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print("Training and testing data have been successfully transformed.")

# ---------------- 3.2 DEFINE MODELS AND EVALUATION FUNCTION ----------------
# Display name -> unfitted regressor. The tree and the forest are stochastic,
# so they are seeded (same seed as the tuned forest in Step 4) to keep the
# reported R-squared scores reproducible across runs.
models = {
    "Support Vector Regressor": SVR(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Multiple Linear Regression": LinearRegression(),
    "Random Forest Regressor": RandomForestRegressor(random_state=42)
}

def model_train_eval(X_train, y_train, X_test, y_test, models, metric=None):
    """Fit each regressor on the training data and score its test predictions.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed to ``model.fit``.
    X_test, y_test
        Held-out features passed to ``model.predict`` and the true targets
        the predictions are compared against.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.
    metric : callable, optional
        Called as ``metric(y_test, y_pred)``. Defaults to ``r2_score``,
        so existing five-argument calls behave exactly as before.

    Returns
    -------
    dict
        ``{name: score}`` in the iteration order of ``models``.
    """
    if metric is None:
        # Looked up only when needed so a custom metric works on its own.
        metric = r2_score
    evaluation = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        evaluation[name] = metric(y_test, y_pred)
    return evaluation

# ---------------- 3.3 RUN THE INITIAL EVALUATION ----------------
# Baseline: fit every regressor in `models` with its constructor settings and
# collect its score on the processed test split.
model_scores = model_train_eval(X_train_processed, y_train, X_test_processed, y_test, models)
print("\n--- Initial Model Evaluation Results (R-squared Score) ---")
for name, score in model_scores.items():
    print(f"{name}: {score:.4f}")
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- STEP 4: HYPERPARAMETER TUNING FOR RANDOM FOREST REGRESSOR ---
# ====================================================================================
print("--- Step 4: Hyperparameter Tuning for Random Forest Regressor ---")

# ---------------- 4.1 DEFINE HYPERPARAMETER GRID ----------------
# A fresh, seeded forest is tuned (not the one fitted inside `models`).
# oob_score=True is set, but oob_score_ is not read anywhere in this cell.
# The grid holds 5 x 3 = 15 combinations.
# NOTE(review): max_depth values of 50 and above are probably all equivalent to
# "unlimited" on a dataset this small — confirm whether a finer low range was intended.
rfr = RandomForestRegressor(oob_score=True, random_state=42)
params = {
    'max_depth': [1, 50, 100, 150, 200],
    'n_estimators': [50, 100, 200]
}
print("Defined parameter grid for hyperparameter search.")

# ---------------- 4.2 PERFORM RANDOMIZED SEARCH WITH CROSS-VALIDATION ----------------
# Sample 10 of the 15 combinations and score each by 5-fold cross-validated R2.
reg = RandomizedSearchCV(
    estimator=rfr,
    param_distributions=params,
    cv=5,
    verbose=3,
    scoring='r2',
    n_iter=10,
    random_state=42
)
print("\nRunning RandomizedSearchCV... (This may take a moment)")
reg.fit(X_train_processed, y_train)

# ---------------- 4.3 DISPLAY BEST PARAMETERS AND SCORE ----------------
# best_score_ is the mean cross-validated R2 on the training split;
# the held-out test split is not used in this step.
print("\n--- RandomizedSearchCV Results ---")
print("Best Hyperparameters Found:")
print(reg.best_params_)
print(f"\nBest Cross-Validated R2 Score: {reg.best_score_:.4f}")
print("\n--- MASTER PIPELINE COMPLETE ---")

# ---------------- END OF SCRIPT ----------------


# AdaBoost classifier and regressor

In [None]:
# ---------------- ADABOOST CLASSIFIER & REGRESSOR PIPELINE ----------------
# This single, unified script combines all the code you provided for both the
# AdaBoost Classifier and the AdaBoost Regressor, including hyperparameter tuning.

# Import required libraries
import warnings
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             r2_score, mean_absolute_error, mean_squared_error)

warnings.filterwarnings('ignore')

# ====================================================================================
# --- PART 1: ADABOOST CLASSIFIER ---
# ====================================================================================
print("--- Part 1: Building and Tuning an AdaBoost Classifier ---")

# ---------------- 1.1 GENERATE AND SPLIT CLASSIFICATION DATA ----------------
# Synthetic binary problem: 1000 rows x 20 features; 33% of rows are held out.
print("\nStep 1.1: Generating synthetic classification data...")
X_clf, y_clf = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=1)
X_clf_train, X_clf_test, y_clf_train, y_clf_test = train_test_split(X_clf, y_clf, test_size=0.33, random_state=42)
print(f"Classification data split into {X_clf_train.shape[0]} training and {X_clf_test.shape[0]} testing samples.")

# ---------------- 1.2 TRAIN AND EVALUATE INITIAL CLASSIFIER ----------------
# Baseline with default hyperparameters, to compare against the tuned model below.
print("\nStep 1.2: Training and evaluating the initial (untuned) AdaBoost Classifier...")
classifier = AdaBoostClassifier(random_state=1)
classifier.fit(X_clf_train, y_clf_train)
y_pred_initial_clf = classifier.predict(X_clf_test)

print("\n--- Initial Classifier Performance ---")
print(f"Accuracy: {accuracy_score(y_clf_test, y_pred_initial_clf):.4f}")
print("Classification Report:\n", classification_report(y_clf_test, y_pred_initial_clf))
print("Confusion Matrix:\n", confusion_matrix(y_clf_test, y_pred_initial_clf))

# ---------------- 1.3 HYPERPARAMETER TUNING WITH GRIDSEARCHCV ----------------
print("\nStep 1.3: Performing hyperparameter tuning for the classifier...")
# 'algorithm' is deliberately not searched: the 'SAMME.R' option was deprecated
# in scikit-learn 1.4 and later releases reject it (and then the parameter
# itself). Because warnings are silenced in this cell, those candidates would
# fail without any visible message. Leaving the parameter at the library
# default keeps the search valid on every scikit-learn version.
param_grid_clf = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0, 1.5]
}
ada_clf = AdaBoostClassifier(random_state=1)
# 3 x 4 = 12 candidates, each scored with 5-fold CV using the classifier's
# default score (accuracy); n_jobs=-1 runs the fits on all available cores.
clf_grid = GridSearchCV(estimator=ada_clf, param_grid=param_grid_clf, cv=5, verbose=3, n_jobs=-1)
print("Running GridSearchCV for Classifier... (This may take a moment)")
clf_grid.fit(X_clf_train, y_clf_train)

print("\nBest Hyperparameters Found for Classifier:", clf_grid.best_params_)
# best_estimator_ is the winning configuration refitted on the full training split.
best_clf_model = clf_grid.best_estimator_

# ---------------- 1.4 EVALUATE THE TUNED CLASSIFIER ----------------
print("\nStep 1.4: Evaluating the tuned AdaBoost Classifier...")
y_pred_tuned_clf = best_clf_model.predict(X_clf_test)

print("\n--- Tuned Classifier Performance ---")
print(f"Accuracy: {accuracy_score(y_clf_test, y_pred_tuned_clf):.4f}")
print("Classification Report:\n", classification_report(y_clf_test, y_pred_tuned_clf))
print("Confusion Matrix:\n", confusion_matrix(y_clf_test, y_pred_tuned_clf))
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- PART 2: ADABOOST REGRESSOR ---
# ====================================================================================
print("--- Part 2: Building and Tuning an AdaBoost Regressor ---")

# ---------------- 2.1 GENERATE AND SPLIT REGRESSION DATA ----------------
# Synthetic regression problem: 1000 rows x 2 features with Gaussian noise
# (std 10) added to the target; 33% of rows are held out for testing.
print("\nStep 2.1: Generating synthetic regression data...")
X_reg, y_reg = make_regression(n_samples=1000, n_features=2, noise=10, random_state=1)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=0.33, random_state=42)
print(f"Regression data split into {X_reg_train.shape[0]} training and {X_reg_test.shape[0]} testing samples.")

# ---------------- 2.2 TRAIN AND EVALUATE INITIAL REGRESSOR ----------------
# Baseline with default hyperparameters, to compare against the tuned model below.
print("\nStep 2.2: Training and evaluating the initial (untuned) AdaBoost Regressor...")
regressor = AdaBoostRegressor(random_state=1)
regressor.fit(X_reg_train, y_reg_train)
y_pred_initial_reg = regressor.predict(X_reg_test)

print("\n--- Initial Regressor Performance ---")
print(f"R2 score: {r2_score(y_reg_test, y_pred_initial_reg):.4f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_reg_test, y_pred_initial_reg):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_reg_test, y_pred_initial_reg):.4f}")

# ---------------- 2.3 HYPERPARAMETER TUNING WITH GRIDSEARCHCV ----------------
print("\nStep 2.3: Performing hyperparameter tuning for the regressor...")
# 3 x 4 x 3 = 36 candidates; 'loss' selects how per-sample errors are weighted
# when the boosting weights are updated.
param_grid_reg = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
    'loss': ['linear', 'square', 'exponential']
}
ada_reg = AdaBoostRegressor(random_state=1)
# No `scoring` is given, so candidates are ranked by the regressor's default
# score (R2) averaged over 5 folds. Unlike the classifier search in Part 1,
# n_jobs is not set here, so the fits run sequentially.
grid_search_reg = GridSearchCV(estimator=ada_reg, param_grid=param_grid_reg, cv=5, verbose=3)
print("Running GridSearchCV for Regressor... (This may take a moment)")
grid_search_reg.fit(X_reg_train, y_reg_train)

print("\nBest Hyperparameters Found for Regressor:", grid_search_reg.best_params_)
# best_estimator_ is the winning configuration refitted on the full training split.
best_reg_model = grid_search_reg.best_estimator_

# ---------------- 2.4 EVALUATE THE TUNED REGRESSOR ----------------
print("\nStep 2.4: Evaluating the tuned AdaBoost Regressor...")
y_pred_tuned_reg = best_reg_model.predict(X_reg_test)

print("\n--- Tuned Regressor Performance ---")
print(f"R2 score: {r2_score(y_reg_test, y_pred_tuned_reg):.4f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_reg_test, y_pred_tuned_reg):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_reg_test, y_pred_tuned_reg):.4f}")
print("\n--- MASTER PIPELINE COMPLETE ---")

# ---------------- END OF SCRIPT ----------------


# GRADIENT BOOSTING CLASSIFIER AND REGRESSOR

In [None]:
# ---------------- GRADIENT BOOSTING CLASSIFIER & REGRESSOR PIPELINE ----------------
# This single, unified script combines all the code you provided for both the
# Gradient Boosting Classifier and the Gradient Boosting Regressor, including hyperparameter tuning.

# Import required libraries
import warnings
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             r2_score, mean_absolute_error, mean_squared_error)

warnings.filterwarnings('ignore')

# ====================================================================================
# --- PART 1: GRADIENT BOOSTING CLASSIFIER ---
# ====================================================================================
print("--- Part 1: Building and Tuning a Gradient Boosting Classifier ---")

# ---------------- 1.1 GENERATE AND SPLIT CLASSIFICATION DATA ----------------
# Synthetic binary problem: 1000 rows x 20 features; 33% of rows are held out.
print("\nStep 1.1: Generating synthetic classification data...")
X_clf, y_clf = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=1)
X_clf_train, X_clf_test, y_clf_train, y_clf_test = train_test_split(X_clf, y_clf, test_size=0.33, random_state=1)
print(f"Classification data split into {X_clf_train.shape[0]} training and {X_clf_test.shape[0]} testing samples.")

# ---------------- 1.2 TRAIN AND EVALUATE INITIAL CLASSIFIER ----------------
# Baseline with default hyperparameters, to compare against the tuned model below.
# Note: this rebinds the notebook-level name `clf`, which an earlier cell also assigns.
print("\nStep 1.2: Training and evaluating the initial (untuned) Gradient Boosting Classifier...")
clf = GradientBoostingClassifier(random_state=1)
clf.fit(X_clf_train, y_clf_train)
y_pred_initial_clf = clf.predict(X_clf_test)

print("\n--- Initial Classifier Performance ---")
print(f"Accuracy: {accuracy_score(y_clf_test, y_pred_initial_clf):.4f}")
print("Classification Report:\n", classification_report(y_clf_test, y_pred_initial_clf))
print("Confusion Matrix:\n", confusion_matrix(y_clf_test, y_pred_initial_clf))

# ---------------- 1.3 HYPERPARAMETER TUNING WITH GRIDSEARCHCV ----------------
print("\nStep 1.3: Performing hyperparameter tuning for the classifier...")
# 3 x 4 = 12 candidates: number of boosting stages vs. shrinkage per stage.
param_grid_clf = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.05, 0.2]
}
gbc = GradientBoostingClassifier(random_state=1)
# No `scoring` is given, so candidates are ranked by the classifier's default
# score (accuracy) averaged over 5 folds.
grid_search_clf = GridSearchCV(estimator=gbc, param_grid=param_grid_clf, cv=5, verbose=3)
print("Running GridSearchCV for Classifier... (This may take a moment)")
grid_search_clf.fit(X_clf_train, y_clf_train)

print("\nBest Hyperparameters Found for Classifier:", grid_search_clf.best_params_)
# best_estimator_ is the winning configuration refitted on the full training split.
best_clf_model = grid_search_clf.best_estimator_

# ---------------- 1.4 EVALUATE THE TUNED CLASSIFIER ----------------
print("\nStep 1.4: Evaluating the tuned Gradient Boosting Classifier...")
y_pred_tuned_clf = best_clf_model.predict(X_clf_test)

print("\n--- Tuned Classifier Performance ---")
print(f"Accuracy: {accuracy_score(y_clf_test, y_pred_tuned_clf):.4f}")
print("Classification Report:\n", classification_report(y_clf_test, y_pred_tuned_clf))
print("Confusion Matrix:\n", confusion_matrix(y_clf_test, y_pred_tuned_clf))
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- PART 2: GRADIENT BOOSTING REGRESSOR ---
# ====================================================================================
print("--- Part 2: Building and Tuning a Gradient Boosting Regressor ---")

# ---------------- 2.1 GENERATE AND SPLIT REGRESSION DATA ----------------
# Synthetic regression problem: 1000 rows x 2 features with Gaussian noise
# (std 10) added to the target; 33% of rows are held out for testing.
print("\nStep 2.1: Generating synthetic regression data...")
X_reg, y_reg = make_regression(n_samples=1000, n_features=2, noise=10, random_state=42)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=0.33, random_state=42)
print(f"Regression data split into {X_reg_train.shape[0]} training and {X_reg_test.shape[0]} testing samples.")

# ---------------- 2.2 TRAIN AND EVALUATE INITIAL REGRESSOR ----------------
# Baseline with default hyperparameters, to compare against the tuned model below.
print("\nStep 2.2: Training and evaluating the initial (untuned) Gradient Boosting Regressor...")
regressor = GradientBoostingRegressor(random_state=1)
regressor.fit(X_reg_train, y_reg_train)
y_pred_initial_reg = regressor.predict(X_reg_test)

print("\n--- Initial Regressor Performance ---")
print(f"R2 score: {r2_score(y_reg_test, y_pred_initial_reg):.4f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_reg_test, y_pred_initial_reg):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_reg_test, y_pred_initial_reg):.4f}")

# ---------------- 2.3 HYPERPARAMETER TUNING WITH GRIDSEARCHCV ----------------
print("\nStep 2.3: Performing hyperparameter tuning for the regressor...")
# 3 x 4 = 12 candidates: number of boosting stages vs. shrinkage per stage.
param_grid_reg = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2]
}
gbr = GradientBoostingRegressor(random_state=1)
# No `scoring` is given, so candidates are ranked by the regressor's default
# score (R2) averaged over 5 folds.
grid_search_reg = GridSearchCV(estimator=gbr, param_grid=param_grid_reg, cv=5, verbose=3)
print("Running GridSearchCV for Regressor... (This may take a moment)")
grid_search_reg.fit(X_reg_train, y_reg_train)

print("\nBest Hyperparameters Found for Regressor:", grid_search_reg.best_params_)
# best_estimator_ is the winning configuration refitted on the full training split.
best_reg_model = grid_search_reg.best_estimator_

# ---------------- 2.4 EVALUATE THE TUNED REGRESSOR ----------------
print("\nStep 2.4: Evaluating the tuned Gradient Boosting Regressor...")
y_pred_tuned_reg = best_reg_model.predict(X_reg_test)

print("\n--- Tuned Regressor Performance ---")
print(f"R2 score: {r2_score(y_reg_test, y_pred_tuned_reg):.4f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_reg_test, y_pred_tuned_reg):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_reg_test, y_pred_tuned_reg):.4f}")
print("\n--- MASTER PIPELINE COMPLETE ---")

# ---------------- END OF SCRIPT ----------------


# XGBOOST CLASSIFIER AND REGRESSOR

In [None]:
# ---------------- XGBOOST CLASSIFIER & REGRESSOR PIPELINE ----------------
# This single, unified script combines all the code you provided for both the
# XGBoost Classifier and the XGBoost Regressor, including hyperparameter tuning.

# Import required libraries
import warnings
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             r2_score, mean_absolute_error, mean_squared_error)

warnings.filterwarnings('ignore')

# ====================================================================================
# --- PART 1: XGBOOST CLASSIFIER ---
# ====================================================================================
print("--- Part 1: Building and Tuning an XGBoost Classifier ---")

# ---------------- 1.1 GENERATE AND SPLIT CLASSIFICATION DATA ----------------
print("\nStep 1.1: Generating synthetic classification data...")
# make_classification generates a synthetic dataset for classification tasks
# n_samples=1000 → 1000 rows in total
# n_features=20 → 20 input features per row
# n_classes=2 → binary classification (output is 0 or 1)
# random_state=1 → the same data is generated on every run, for reproducibility
X_clf, y_clf = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=1)
# Split the dataset into a training set and a test set
# test_size=0.33 → 33% of the data for testing, 67% for training
X_clf_train, X_clf_test, y_clf_train, y_clf_test = train_test_split(X_clf, y_clf, test_size=0.33, random_state=1)
print(f"Classification data split into {X_clf_train.shape[0]} training and {X_clf_test.shape[0]} testing samples.")

# ---------------- 1.2 TRAIN AND EVALUATE INITIAL CLASSIFIER ----------------
print("\nStep 1.2: Training and evaluating the initial (untuned) XGBoost Classifier...")
# Create an XGBClassifier with default hyperparameters apart from the two
# arguments passed here.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases —
# confirm against the installed version whether it can be dropped.
classifier = XGBClassifier(eval_metric='logloss', use_label_encoder=False)
# Fit the model on the training split
classifier.fit(X_clf_train, y_clf_train)
# Get the model's predictions for the test split
y_pred_initial_clf = classifier.predict(X_clf_test)

print("\n--- Initial Classifier Performance ---")
print(f"Accuracy: {accuracy_score(y_clf_test, y_pred_initial_clf):.4f}")
print("Classification Report:\n", classification_report(y_clf_test, y_pred_initial_clf))
print("Confusion Matrix:\n", confusion_matrix(y_clf_test, y_pred_initial_clf))

# ---------------- 1.3 HYPERPARAMETER TUNING WITH GRIDSEARCHCV ----------------
print("\nStep 1.3: Performing hyperparameter tuning for the classifier...")
# param_grid is a dictionary mapping each hyperparameter to the list of values to try
# (3 x 4 = 12 candidates here)
param_grid_clf = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2]
}
# Set up GridSearchCV: the already-fitted `classifier` is only used as a template
# (the search works on clones of it); each candidate is scored with 5-fold CV
# and the fits run in parallel on all cores.
grid_search_clf = GridSearchCV(estimator=classifier, param_grid=param_grid_clf, cv=5, n_jobs=-1, verbose=3)
print("Running GridSearchCV for Classifier... (This may take a moment)")
grid_search_clf.fit(X_clf_train, y_clf_train)

# The parameter combination that performed best in cross-validation
print("\nBest Hyperparameters Found for Classifier:", grid_search_clf.best_params_)
# Extract the best tuned model (refitted on the full training split)
best_clf_model = grid_search_clf.best_estimator_

# ---------------- 1.4 EVALUATE THE TUNED CLASSIFIER ----------------
print("\nStep 1.4: Evaluating the tuned XGBoost Classifier...")
# Predict on the test data with the best tuned model
y_pred_tuned_clf = best_clf_model.predict(X_clf_test)

print("\n--- Tuned Classifier Performance ---")
print(f"Accuracy: {accuracy_score(y_clf_test, y_pred_tuned_clf):.4f}")
print("Classification Report:\n", classification_report(y_clf_test, y_pred_tuned_clf))
print("Confusion Matrix:\n", confusion_matrix(y_clf_test, y_pred_tuned_clf))
print("\n" + "="*80 + "\n")


# ====================================================================================
# --- PART 2: XGBOOST REGRESSOR ---
# ====================================================================================
print("--- Part 2: Building and Tuning an XGBoost Regressor ---")

# ---------------- 2.1 GENERATE AND SPLIT REGRESSION DATA ----------------
# Synthetic regression problem: 1000 rows x 2 features with Gaussian noise
# (std 10) added to the target; 33% of rows are held out for testing.
print("\nStep 2.1: Generating synthetic regression data...")
X_reg, y_reg = make_regression(n_samples=1000, n_features=2, noise=10, random_state=1)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=0.33, random_state=1)
print(f"Regression data split into {X_reg_train.shape[0]} training and {X_reg_test.shape[0]} testing samples.")

# ---------------- 2.2 TRAIN AND EVALUATE INITIAL REGRESSOR ----------------
# Baseline with default hyperparameters, to compare against the tuned model below.
print("\nStep 2.2: Training and evaluating the initial (untuned) XGBoost Regressor...")
regressor = XGBRegressor(random_state=1)
regressor.fit(X_reg_train, y_reg_train)
y_pred_initial_reg = regressor.predict(X_reg_test)

print("\n--- Initial Regressor Performance ---")
print(f"R2 score: {r2_score(y_reg_test, y_pred_initial_reg):.4f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_reg_test, y_pred_initial_reg):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_reg_test, y_pred_initial_reg):.4f}")

# ---------------- 2.3 HYPERPARAMETER TUNING WITH GRIDSEARCHCV ----------------
print("\nStep 2.3: Performing hyperparameter tuning for the regressor...")
# 3 x 4 = 12 candidates: number of boosting rounds vs. shrinkage per round.
param_grid_reg = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2]
}
# The already-fitted `regressor` is only used as a template (the search works
# on clones of it); each candidate is scored with 5-fold CV, in parallel on all cores.
grid_search_reg = GridSearchCV(estimator=regressor, param_grid=param_grid_reg, cv=5, n_jobs=-1, verbose=2)
print("Running GridSearchCV for Regressor... (This may take a moment)")
grid_search_reg.fit(X_reg_train, y_reg_train)

print("\nBest Hyperparameters Found for Regressor:", grid_search_reg.best_params_)
# best_estimator_ is the winning configuration refitted on the full training split.
best_reg_model = grid_search_reg.best_estimator_

# ---------------- 2.4 EVALUATE THE TUNED REGRESSOR ----------------
print("\nStep 2.4: Evaluating the tuned XGBoost Regressor...")
y_pred_tuned_reg = best_reg_model.predict(X_reg_test)

print("\n--- Tuned Regressor Performance ---")
print(f"R2 score: {r2_score(y_reg_test, y_pred_tuned_reg):.4f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_reg_test, y_pred_tuned_reg):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_reg_test, y_pred_tuned_reg):.4f}")
print("\n--- MASTER PIPELINE COMPLETE ---")

# ---------------- END OF SCRIPT ----------------


# K-nearest neighbour classifier AND regressor with hyperparameter tuning

In [None]:
# ---------------- K-NEAREST NEIGHBORS (KNN) CLASSIFIER & REGRESSOR PIPELINE ----------------
# This single, unified script combines all the code you provided for both the
# KNN Classifier and the KNN Regressor, including hyperparameter tuning.

# Import required libraries
import warnings
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, r2_score)

# Suppress every warning category. This is a process-wide filter, so it also
# hides warnings raised by cells executed after this one (until filters are reset).
warnings.filterwarnings('ignore')

# ====================================================================================
# --- PART 1: K-NEAREST NEIGHBORS (KNN) CLASSIFIER ---
# ====================================================================================
print("--- Part 1: Building and Tuning a KNN Classifier ---")

# ---------------- 1.1 GENERATE AND SPLIT CLASSIFICATION DATA ----------------
print("\nStep 1.1: Generating synthetic classification data...")
# Build a synthetic binary-classification dataset:
#   n_samples=1000 -> 1000 rows
#   n_features=3   -> 3 features, none of them redundant
#   n_classes=2    -> labels are 0 and 1
# random_state fixes the generated data so the cell is reproducible.
X_clf, y_clf = make_classification(
    n_samples=1000,
    n_features=3,
    n_redundant=0,
    n_classes=2,
    random_state=1,
)
# Hold out 30% of the rows for testing; the rest is used for training.
X_clf_train, X_clf_test, y_clf_train, y_clf_test = train_test_split(
    X_clf,
    y_clf,
    test_size=0.30,
    random_state=1,
)
print(f"Classification data split into {X_clf_train.shape[0]} training and {X_clf_test.shape[0]} testing samples.")

# ---------------- 1.2 TRAIN AND EVALUATE INITIAL CLASSIFIER ----------------
print("\nStep 1.2: Training and evaluating the initial (untuned) KNN Classifier...")
# Baseline: a KNN classifier with library defaults (n_neighbors=5), fitted on
# the training split. `fit` returns the estimator itself, so it can be chained.
clf = KNeighborsClassifier().fit(X_clf_train, y_clf_train)
# Predict labels for the held-out test split.
y_pred_initial_clf = clf.predict(X_clf_test)

# Compute the baseline test-set metrics, then report them.
initial_clf_confusion = confusion_matrix(y_clf_test, y_pred_initial_clf)
initial_clf_accuracy = accuracy_score(y_clf_test, y_pred_initial_clf)
initial_clf_report = classification_report(y_clf_test, y_pred_initial_clf)

print("\n--- Initial Classifier Performance ---")
print("Confusion Matrix:\n", initial_clf_confusion)
print(f"Accuracy: {initial_clf_accuracy:.4f}")
print("Classification Report:\n", initial_clf_report)

# ---------------- 1.3 HYPERPARAMETER TUNING WITH GRIDSEARCHCV ----------------
print("\nStep 1.3: Performing hyperparameter tuning for the classifier...")
# Every combination of these values is tried: 7 * 2 * 4 * 4 = 224 candidates.
# NOTE(review): scikit-learn documents `algorithm` and `leaf_size` as controlling
# how neighbors are searched (speed/memory), so they are unlikely to change the
# scores. They are kept so `best_params_` retains the same keys as before.
param_grid_clf = {
    'n_neighbors': [3, 5, 6, 7, 9, 11, 13],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [20, 30, 40, 50]
}
# 224 candidates x 5 folds = 1120 fits, so:
#   n_jobs=-1 -> run the fits in parallel on all available cores (matches the
#                earlier grid search in this notebook)
#   verbose=1 -> print only the search summary instead of one line per fit,
#                which would otherwise flood the cell output
grid_clf = GridSearchCV(
    estimator=clf,
    param_grid=param_grid_clf,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
print("Running GridSearchCV for Classifier... (This may take a moment)")
grid_clf.fit(X_clf_train, y_clf_train)

# best_params_ -> the parameter combination with the best mean CV score
print("\nBest Hyperparameters Found for Classifier:", grid_clf.best_params_)
# best_score_ -> that mean cross-validated score (accuracy, the default
# scoring for a classifier)
print(f"Best Cross-Validated Accuracy: {grid_clf.best_score_:.4f}")
# best_estimator_ -> the model refitted on the full training split with the
# best parameters
best_clf_model = grid_clf.best_estimator_

# ---------------- 1.4 EVALUATE THE TUNED CLASSIFIER ----------------
print("\nStep 1.4: Evaluating the tuned KNN Classifier...")
# Use the best model from the grid search to predict the held-out test split.
y_pred_tuned_clf = best_clf_model.predict(X_clf_test)

# Compute the tuned test-set metrics, then report them.
tuned_clf_accuracy = accuracy_score(y_clf_test, y_pred_tuned_clf)
tuned_clf_report = classification_report(y_clf_test, y_pred_tuned_clf)
tuned_clf_confusion = confusion_matrix(y_clf_test, y_pred_tuned_clf)

print("\n--- Tuned Classifier Performance ---")
print(f"Accuracy: {tuned_clf_accuracy:.4f}")
print("Classification Report:\n", tuned_clf_report)
print("Confusion Matrix:\n", tuned_clf_confusion)
print("\n" + "=" * 80 + "\n")


# ====================================================================================
# --- PART 2: K-NEAREST NEIGHBORS (KNN) REGRESSOR ---
# ====================================================================================
print("--- Part 2: Building and Tuning a KNN Regressor ---")

# ---------------- 2.1 GENERATE AND SPLIT REGRESSION DATA ----------------
print("\nStep 2.1: Generating synthetic regression data...")
# Build a synthetic regression dataset:
#   n_samples=1000 -> 1000 data points
#   n_features=2   -> 2 input features
#   noise=3        -> adds some randomness to the target
# random_state fixes the generated data so the cell is reproducible.
X_reg, y_reg = make_regression(
    n_samples=1000,
    n_features=2,
    noise=3,
    random_state=1,
)
# Hold out 30% of the rows for testing; the rest is used for training.
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.30,
    random_state=1,
)
print(f"Regression data split into {X_reg_train.shape[0]} training and {X_reg_test.shape[0]} testing samples.")

# ---------------- 2.2 TRAIN AND EVALUATE INITIAL REGRESSOR ----------------
print("\nStep 2.2: Training and evaluating the initial (untuned) KNN Regressor...")
# Baseline: a KNN regressor with library defaults, fitted on the training
# split. `fit` returns the estimator itself, so it can be chained.
reg = KNeighborsRegressor().fit(X_reg_train, y_reg_train)
# Predict targets for the held-out test split.
y_pred_initial_reg = reg.predict(X_reg_test)

# R2 on the test split is the baseline performance figure.
initial_r2 = r2_score(y_reg_test, y_pred_initial_reg)
print(f"\nInitial Regressor R2 Score: {initial_r2:.4f}")

# ---------------- 2.3 HYPERPARAMETER TUNING WITH GRIDSEARCHCV ----------------
print("\nStep 2.3: Performing hyperparameter tuning for the regressor...")
# Every combination of these values is tried: 7 * 2 * 4 * 4 = 224 candidates.
# NOTE(review): scikit-learn documents `algorithm` and `leaf_size` as controlling
# how neighbors are searched (speed/memory), so they are unlikely to change the
# scores. They are kept so `best_params_` retains the same keys as before.
param_grid_reg = {
    'n_neighbors': [3, 5, 6, 7, 9, 11, 13],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [20, 30, 40, 50]
}
# 224 candidates x 5 folds = 1120 fits, so:
#   n_jobs=-1 -> run the fits in parallel on all available cores (matches the
#                earlier grid search in this notebook)
#   verbose=1 -> print only the search summary instead of one line per fit,
#                which would otherwise flood the cell output
grid_reg = GridSearchCV(
    estimator=reg,
    param_grid=param_grid_reg,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
print("Running GridSearchCV for Regressor... (This may take a moment)")
# Run the search on the training split only; the test split stays untouched.
grid_reg.fit(X_reg_train, y_reg_train)

# The parameter combination with the best mean cross-validated score.
print("\nBest Hyperparameters Found for Regressor:", grid_reg.best_params_)
# That mean cross-validated score (R2, the default scoring for a regressor).
print(f"Best Cross-Validated R2 Score: {grid_reg.best_score_:.4f}")
# Keep the winning parameters for later use.
best_model_params = grid_reg.best_params_
print("\nBest model parameters stored.")

# ---------------- 2.4 EVALUATE THE TUNED REGRESSOR ----------------
# The cross-validated score above is measured on training folds only. Score the
# refitted best estimator on the held-out test split so it can be compared with
# the initial regressor's test R2 from step 2.2. KNN-specific variable names
# avoid overwriting the tuned-model variables created by the earlier pipeline.
print("\nStep 2.4: Evaluating the tuned KNN Regressor...")
best_knn_reg_model = grid_reg.best_estimator_
y_pred_tuned_knn_reg = best_knn_reg_model.predict(X_reg_test)
tuned_knn_r2 = r2_score(y_reg_test, y_pred_tuned_knn_reg)
print(f"Tuned Regressor R2 Score: {tuned_knn_r2:.4f}")
print("\n--- MASTER PIPELINE COMPLETE ---")

# ---------------- END OF SCRIPT ----------------
