# Basic sklearn algorithms
A quick reference showing how to train and evaluate basic scikit-learn classification and regression algorithms with their default settings.

## Import Libraries

In [4]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error

# Classification Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# Regression Algorithms
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

## Classification

In [5]:
# One seed shared by the split and by every estimator that uses randomness,
# so a fresh "Restart & Run All" reproduces the accuracies printed below.
SEED = 42

# Load Iris dataset from Seaborn
iris = sns.load_dataset('iris')

# Preparing data: every column except 'species' is a feature, 'species' is the label
X = iris.drop(columns='species')
y = iris['species']

# Train-test split (30% of the rows are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Standardizing data: the scaler is fitted on the training split only and then
# re-used on the test split, so no test-set statistics reach the models
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# List of classifiers (display name -> unfitted estimator).
# Estimators that accept random_state are seeded explicitly; the remaining
# ones are left at their defaults.
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "Random Forest": RandomForestClassifier(random_state=SEED),
    "Gradient Boosting": GradientBoostingClassifier(random_state=SEED),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "Naive Bayes": GaussianNB(),
    "Neural Network (MLP)": MLPClassifier(max_iter=1000, random_state=SEED)
}

# Fit each classifier on the scaled training data and report test accuracy
print("Classification Results on Iris Dataset:")
for name, clf in classifiers.items():
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}: Accuracy = {accuracy:.4f}")

Classification Results on Iris Dataset:
Logistic Regression: Accuracy = 1.0000
K-Nearest Neighbors: Accuracy = 1.0000
Support Vector Classifier: Accuracy = 1.0000
Random Forest: Accuracy = 1.0000
Gradient Boosting: Accuracy = 1.0000
Decision Tree: Accuracy = 1.0000
Naive Bayes: Accuracy = 0.9778
Neural Network (MLP): Accuracy = 1.0000


## Regression

In [6]:
from sklearn.datasets import fetch_california_housing

# One seed shared by the split and by every estimator that uses randomness,
# so a fresh "Restart & Run All" reproduces the errors printed below.
SEED = 42

# Load California Housing dataset
california_housing = fetch_california_housing()
X = california_housing.data
y = california_housing.target

# Train-test split (30% of the rows are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Standardizing data with a scaler created in this cell, so the cell neither
# depends on nor silently refits a scaler object left over from an earlier cell
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# List of regressors (display name -> unfitted estimator).
# Estimators that accept random_state and use randomness are seeded explicitly.
regressors = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=SEED),
    "Random Forest Regressor": RandomForestRegressor(random_state=SEED),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=SEED),
    "Support Vector Regressor": SVR(),
    "K-Nearest Neighbors Regressor": KNeighborsRegressor(),
    "Neural Network (MLP)": MLPRegressor(max_iter=1000, random_state=SEED)
}

# Fit each regressor on the scaled training data and report test-set MSE
print("\nRegression Results on California Housing Dataset:")
for name, reg in regressors.items():
    reg.fit(X_train_scaled, y_train)
    y_pred = reg.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{name}: MSE = {mse:.4f}")


Regression Results on California Housing Dataset:
Linear Regression: MSE = 0.5306
Ridge Regression: MSE = 0.5305
Lasso Regression: MSE = 1.3125
Decision Tree Regressor: MSE = 0.5282
Random Forest Regressor: MSE = 0.2546
Gradient Boosting Regressor: MSE = 0.2883
Support Vector Regressor: MSE = 0.3496
K-Nearest Neighbors Regressor: MSE = 0.4295
Neural Network (MLP): MSE = 0.2958


***

# Sklearn algorithms with additional methods
* PCA for dimensionality reduction (2 components for classification, 5 for regression)
* Feature selection with SelectKBest (ANOVA F-test `f_classif` for classification, univariate F-test `f_regression` for regression)
* 5-Fold Cross-Validation



## Import Libraries

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, r2_score
from sklearn.pipeline import Pipeline

# Classification Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# Regression Algorithms
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

## Iris Dataset for Classification

In [2]:
# One seed shared by the split and by every component that uses randomness,
# so a fresh "Restart & Run All" reproduces the scores printed below.
SEED = 42
# Number of PCA components / selected features used for the Iris models
N_REDUCED = 2

# Load Iris dataset from Seaborn
iris = sns.load_dataset('iris')

# Preparing data: every column except 'species' is a feature, 'species' is the label
X = iris.drop(columns='species')
y = iris['species']

# Train-test split (30% of the rows are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Standardizing data: fitted on the training split only, re-used on the test split.
# NOTE: the scaler is fitted before cross-validation, so all CV folds share the
# same scaling statistics; move StandardScaler into the pipelines below if
# strict per-fold isolation is required.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Applying PCA for dimensionality reduction.
# These standalone projections are not consumed by the models below (the
# pipelines perform their own reduction); they are kept for inspection.
pca = PCA(n_components=N_REDUCED, random_state=SEED)  # Reduce to 2 dimensions
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Applying feature selection (likewise kept only for inspection)
select_k_best = SelectKBest(score_func=f_classif, k=N_REDUCED)  # Selecting top 2 features
X_train_selected = select_k_best.fit_transform(X_train_scaled, y_train)
X_test_selected = select_k_best.transform(X_test_scaled)

# List of classifiers with pipelines for PCA and feature selection.
# Every pipeline gets its OWN transformer instance: sharing one PCA/SelectKBest
# object between several pipelines (or with the standalone fits above) means
# fitting one model silently refits the transformer another model relies on.
classifiers = {
    "Logistic Regression": Pipeline([
        ('pca', PCA(n_components=N_REDUCED, random_state=SEED)),
        ('clf', LogisticRegression()),
    ]),
    "K-Nearest Neighbors": Pipeline([
        ('select', SelectKBest(score_func=f_classif, k=N_REDUCED)),
        ('clf', KNeighborsClassifier()),
    ]),
    "Support Vector Classifier": Pipeline([
        ('select', SelectKBest(score_func=f_classif, k=N_REDUCED)),
        ('clf', SVC()),
    ]),
    "Random Forest": RandomForestClassifier(random_state=SEED),
    "Gradient Boosting": GradientBoostingClassifier(random_state=SEED),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "Naive Bayes": GaussianNB(),
    "Neural Network (MLP)": MLPClassifier(max_iter=1000, random_state=SEED)
}

# Evaluating models with cross-validation and reporting classification metrics
print("Classification Results on Iris Dataset:")
for name, clf in classifiers.items():
    # 5-fold cross-validation accuracy on the training split
    cv_scores = cross_val_score(clf, X_train_scaled, y_train, cv=5)
    # Refit on the whole training split, then score once on the held-out test split
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}: CV Accuracy = {cv_scores.mean():.4f}, Test Accuracy = {accuracy:.4f}")
    print(classification_report(y_test, y_pred))

Classification Results on Iris Dataset:
Logistic Regression: CV Accuracy = 0.9048, Test Accuracy = 0.9111
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        19
  versicolor       0.91      0.77      0.83        13
   virginica       0.80      0.92      0.86        13

    accuracy                           0.91        45
   macro avg       0.90      0.90      0.90        45
weighted avg       0.92      0.91      0.91        45

K-Nearest Neighbors: CV Accuracy = 0.9524, Test Accuracy = 1.0000
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        19
  versicolor       1.00      1.00      1.00        13
   virginica       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

Support Vector Classifier: CV Accuracy = 0.9429, Test Accuracy = 1

## California Housing Data for Regression

In [3]:
from sklearn.datasets import fetch_california_housing

# One seed shared by the split and by every component that uses randomness,
# so a fresh "Restart & Run All" reproduces the scores printed below.
SEED = 42
# Number of PCA components / selected features used for the housing models
N_REDUCED = 5

# Load California Housing dataset
california_housing = fetch_california_housing()
X = california_housing.data
y = california_housing.target

# Train-test split (30% of the rows are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Standardizing data with a scaler created in this cell, so the cell neither
# depends on nor silently refits a scaler object left over from an earlier cell.
# NOTE: the scaler is fitted before cross-validation, so all CV folds share the
# same scaling statistics; move StandardScaler into the pipelines below if
# strict per-fold isolation is required.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Applying PCA for dimensionality reduction.
# These standalone projections are not consumed by the models below (the
# pipelines perform their own reduction); they are kept for inspection.
# random_state only has an effect if PCA ends up using a randomized solver.
pca = PCA(n_components=N_REDUCED, random_state=SEED)  # Reduce to 5 dimensions for regression
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Applying feature selection (likewise kept only for inspection)
select_k_best = SelectKBest(score_func=f_regression, k=N_REDUCED)  # Selecting top 5 features
X_train_selected = select_k_best.fit_transform(X_train_scaled, y_train)
X_test_selected = select_k_best.transform(X_test_scaled)

# List of regressors with pipelines for PCA and feature selection.
# Each pipeline gets its OWN transformer instance instead of re-using the
# objects fitted above, so fitting a pipeline never mutates shared state.
# NOTE(review): the recorded run reports R^2 of about 0 for Lasso with its
# default settings; consider tuning its regularisation strength.
regressors = {
    "Linear Regression": Pipeline([
        ('pca', PCA(n_components=N_REDUCED, random_state=SEED)),
        ('reg', LinearRegression()),
    ]),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=SEED),
    "Random Forest Regressor": RandomForestRegressor(random_state=SEED),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=SEED),
    "Support Vector Regressor": Pipeline([
        ('select', SelectKBest(score_func=f_regression, k=N_REDUCED)),
        ('reg', SVR()),
    ]),
    "K-Nearest Neighbors Regressor": KNeighborsRegressor(),
    "Neural Network (MLP)": MLPRegressor(max_iter=1000, random_state=SEED)
}

# Evaluating models with cross-validation and reporting regression metrics
print("\nRegression Results on California Housing Dataset:")
for name, reg in regressors.items():
    # 5-fold cross-validation; the scorer returns negated MSE, hence the sign flip when printing
    cv_scores = cross_val_score(reg, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
    # Refit on the whole training split, then score once on the held-out test split
    reg.fit(X_train_scaled, y_train)
    y_pred = reg.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name}: CV MSE = {-cv_scores.mean():.4f}, Test MSE = {mse:.4f}, R^2 = {r2:.4f}")


Regression Results on California Housing Dataset:
Linear Regression: CV MSE = 0.7235, Test MSE = 0.7161, R^2 = 0.4544
Ridge Regression: CV MSE = 0.5268, Test MSE = 0.5305, R^2 = 0.5958
Lasso Regression: CV MSE = 1.3400, Test MSE = 1.3125, R^2 = -0.0000
Decision Tree Regressor: CV MSE = 0.5604, Test MSE = 0.5340, R^2 = 0.5932
Random Forest Regressor: CV MSE = 0.2687, Test MSE = 0.2538, R^2 = 0.8067
Gradient Boosting Regressor: CV MSE = 0.2875, Test MSE = 0.2883, R^2 = 0.7803
Support Vector Regressor: CV MSE = 0.4435, Test MSE = 0.4269, R^2 = 0.6748
K-Nearest Neighbors Regressor: CV MSE = 0.4352, Test MSE = 0.4295, R^2 = 0.6728
Neural Network (MLP): CV MSE = 0.3017, Test MSE = 0.2989, R^2 = 0.7723
