In [4]:
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

X, y = load_wine(return_X_y=True)

classifiers = {
    'Decision Tree': DecisionTreeClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB()
}

#2-fold cross-validation
results = {}
for name, classifier in classifiers.items():
    cv_score = cross_val_score(classifier, X, y, cv=2)
    results[name] = cv_score.mean()

for classifier, score in results.items():
    print(f"{classifier}: Average Accuracy = {score:.2%}")


Decision Tree: Average Accuracy = 91.57%
K-Nearest Neighbors: Average Accuracy = 66.29%
Naive Bayes: Average Accuracy = 97.75%


In [5]:
#20-fold cross-validation
results_20_fold = {}
for name, classifier in classifiers.items():
    cv_score = cross_val_score(classifier, X, y, cv=20)
    results_20_fold[name] = cv_score.mean()

#results
for classifier, score in results_20_fold.items():
    print(f"{classifier}: Average Accuracy (20-fold) = {score:.2%}")


Decision Tree: Average Accuracy (20-fold) = 88.89%
K-Nearest Neighbors: Average Accuracy (20-fold) = 70.42%
Naive Bayes: Average Accuracy (20-fold) = 96.67%


In [7]:
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

X, y = load_wine(return_X_y=True)


classifiers = {
    'Decision Tree': DecisionTreeClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=300),  # Increased iterations for convergence
    'Support Vector Machine': SVC()
}

# 2-fold and 20-fold
results_2_fold = {name: cross_val_score(model, X, y, cv=2).mean() for name, model in classifiers.items()}
results_20_fold = {name: cross_val_score(model, X, y, cv=20).mean() for name, model in classifiers.items()}

# Combine all results
all_accuracies = {f'{name} 2-fold': score for name, score in results_2_fold.items()}
all_accuracies.update({f'{name} 20-fold': score for name, score in results_20_fold.items()})

#most accurate model
best_model_info = max(all_accuracies, key=all_accuracies.get)
best_model_name = ' '.join(best_model_info.split()[:-1])
best_folds = int(best_model_info.split()[-1].split('-')[0])

# Retrieve the best classifier
best_classifier = classifiers[best_model_name]

y_pred_best = cross_val_predict(best_classifier, X, y, cv=best_folds)

# Generate the confusion matrix for the best model
conf_matrix_best = confusion_matrix(y, y_pred_best)
print(conf_matrix_best)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[[57  2  0]
 [ 1 69  1]
 [ 0  2 46]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [14]:
df = pd.read_csv('/Users/syedzaidi/Downloads/Steel_industry_data.csv')
print(df.columns)

Index(['date', 'Usage_kWh', 'Lagging_Current_Reactive.Power_kVarh',
       'Leading_Current_Reactive_Power_kVarh', 'CO2(tCO2)',
       'Lagging_Current_Power_Factor', 'Leading_Current_Power_Factor', 'NSM',
       'WeekStatus', 'Day_of_week', 'Load_Type'],
      dtype='object')


In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor


data = pd.read_csv('/Users/syedzaidi/Downloads/Steel_industry_data.csv')

data['date'] = pd.to_datetime(data['date'], format='%d/%m/%Y %H:%M')
data['hour'] = data['date'].dt.hour
data['minute'] = data['date'].dt.minute
data['weekday'] = data['date'].dt.weekday
data.drop(['date', 'WeekStatus', 'Day_of_week'], axis=1, inplace=True)

# Define features and target
X = data.drop('Usage_kWh', axis=1)
y = data['Usage_kWh']


categorical_features = ['Load_Type']
numerical_features = X.columns.difference(categorical_features)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

#models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Random Forest': RandomForestRegressor(random_state=0)  # Ensure reproducibility
}

#cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=1)
results = {}

for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', model)])
    scores = cross_val_score(pipeline, X, y, cv=kf, scoring='r2')
    results[name] = scores

#results
for model_name, scores in results.items():
    print(f"{model_name}: Mean R-squared = {np.mean(scores):.4f}, Std Dev = {np.std(scores):.4f}")


Linear Regression: Mean R-squared = 0.9803, Std Dev = 0.0043
Ridge Regression: Mean R-squared = 0.9803, Std Dev = 0.0043
Random Forest: Mean R-squared = 0.9991, Std Dev = 0.0003


In [23]:
from sklearn.metrics import make_scorer, mean_squared_error

mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', model)])
    scores = cross_val_score(pipeline, X, y, cv=kf, scoring=mse_scorer)
    results[name] = scores

# Print results
for model_name, scores in results.items():
    print(f"{model_name}: Mean MSE = {np.mean(scores):.4f}, Std Dev = {np.std(scores):.4f}")
    

Linear Regression: Mean MSE = -22.0166, Std Dev = 4.7699
Ridge Regression: Mean MSE = -22.0162, Std Dev = 4.7673
Random Forest: Mean MSE = -1.0171, Std Dev = 0.3574
