<a href="https://colab.research.google.com/github/bijay2051/ML/blob/main/11_Bijay_Supervised_Learning_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Write a program in python using scikit learn to train Logistic Regression that predicts quantitative measure of progression driven by various attributes.

In [1]:
# Importing necessary libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [31]:
# Reading data from the specified URL into a DataFrame

url = "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt"
data = pd.read_csv(url, delimiter='\t')

print(data.head())

   AGE  SEX   BMI     BP   S1     S2    S3   S4      S5  S6    Y
0   59    2  32.1  101.0  157   93.2  38.0  4.0  4.8598  87  151
1   48    1  21.6   87.0  183  103.2  70.0  3.0  3.8918  69   75
2   72    2  30.5   93.0  156   93.6  41.0  4.0  4.6728  85  141
3   24    1  25.3   84.0  198  131.4  40.0  5.0  4.8903  89  206
4   50    1  23.0  101.0  192  125.4  52.0  4.0  4.2905  80  135


In [32]:
# Extracting features (X) and target variable (Y) from the DataFrame
# Dropping the column 'Y' to get the features (X)
X = data.drop(columns=["Y"])
# Extracting the target variable 'Y'
Y = data["Y"]

# Splitting the data into training and testing sets

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) # Random state ensures reproducibility of the split


In [33]:
# Creating a Linear Regression model
model = LinearRegression()

# Training the model with the training data
model.fit(X_train, Y_train)

# Making predictions on the test set
Y_pred = model.predict(X_test)


In [34]:
# Calculating Mean Squared Error (MSE)
mse = mean_squared_error(Y_test, Y_pred)

# Calculating R-squared (R2) score
r2 = r2_score(Y_test, Y_pred)

# Printing the evaluation metrics
print("Mean Squared Error:", mse)
print("R-squared (R2) Score:", r2)


Mean Squared Error: 2900.193628493482
R-squared (R2) Score: 0.4526027629719195


## 2. Using Iris Plant Dataset, write a scratch program in python to train softmax regression that predicts the class of Iris plant.


In [7]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [35]:

iris = datasets.load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating a StandardScaler object to standardize the features
scaler = StandardScaler()
# Fitting the scaler to the training data and transforming the training features
X_train = scaler.fit_transform(X_train)
# Transforming the test features using the same scaler
X_test = scaler.transform(X_test)

In [9]:
# Adding a column of ones to the features for the bias term
X_train = np.column_stack((np.ones(len(X_train)), X_train))
X_test = np.column_stack((np.ones(len(X_test)), X_test))

# Determining the number of features in the training data
num_classes = len(np.unique(y))

num_features = X_train.shape[1]

# Converting the target variable to one-hot encoding
y_one_hot = np.zeros((len(y_train), num_classes))
for i in range(len(y_train)):
    y_one_hot[i, y_train[i]] = 1

In [10]:
np.random.seed(1000)
weights = np.random.rand(num_features, num_classes)

# Setting hyperparameters
learning_rate = 0.01
num_epochs = 1000

# Training loop
for epoch in range(num_epochs):
    scores = np.dot(X_train, weights)
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    # Assuming y_one_hot is a one-hot encoded matrix
    loss = -np.sum(y_one_hot * np.log(probs)) / len(X_train)

    grad = np.dot(X_train.T, (probs - y_one_hot)) / len(X_train)

    weights -= learning_rate * grad

    if (epoch + 1) % 100 == 0:
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss:.4f}')


Epoch 100/1000, Loss: 0.7641
Epoch 200/1000, Loss: 0.5914
Epoch 300/1000, Loss: 0.5068
Epoch 400/1000, Loss: 0.4560
Epoch 500/1000, Loss: 0.4220
Epoch 600/1000, Loss: 0.3975
Epoch 700/1000, Loss: 0.3787
Epoch 800/1000, Loss: 0.3635
Epoch 900/1000, Loss: 0.3508
Epoch 1000/1000, Loss: 0.3398


In [11]:
# Computing scores on the test set using the learned weights
test_scores = np.dot(X_test, weights)

# Calculating softmax probabilities for each class
test_probs = np.exp(test_scores) / np.sum(np.exp(test_scores), axis=1, keepdims=True)

# Predicting the class with the highest probability for each data point
predicted_classes = np.argmax(test_probs, axis=1)

accuracy = accuracy_score(y_test, predicted_classes)
print(f'Accuracy on the test set: {accuracy * 100:.2f}%')


Accuracy on the test set: 96.67%


# 3. Compare the accuracy, precision, recall and accuracy of following three classification algorithms for Wine Quality Prediction.

  Logistic Regression Classifier

  Support Vector Classifier

  Naïve Bayes Classifier

  KNN Algorithm
  
  Decision Tree Classifier


In [12]:

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [13]:
wine_data = load_wine()
X = wine_data.data
y = wine_data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
classifiers = [
    ("Logistic Regression", LogisticRegression()),
    ("Support Vector Classifier", SVC()),
    ("Naïve Bayes", GaussianNB()),
    ("K-Nearest Neighbors", KNeighborsClassifier(n_neighbors=3)),
    ("Decision Tree", DecisionTreeClassifier())
]

results = []

for name, clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    results.append((name, accuracy, precision, recall, f1))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
print("Classifier\tAccuracy\tPrecision\tRecall\tF1 Score")
for name, accuracy, precision, recall, f1 in results:
    print(f"{name}\t{accuracy:.2f}\t\t{precision:.2f}\t\t{recall:.2f}\t{f1:.2f}")


Classifier	Accuracy	Precision	Recall	F1 Score
Logistic Regression	0.97		0.97		0.97	0.97
Support Vector Classifier	0.81		0.80		0.81	0.80
Naïve Bayes	1.00		1.00		1.00	1.00
K-Nearest Neighbors	0.81		0.82		0.81	0.81
Decision Tree	0.94		0.95		0.94	0.94


## 4. Implement Stacking ensemble method in the questions no. 3 and train the meta learner for prediction.

In [16]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [17]:
wine_data = load_wine()
X, y = wine_data.data, wine_data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
base_classifiers = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('ab', AdaBoostClassifier(n_estimators=100, random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('svm', SVC(kernel='linear', C=1, random_state=42)),
    ('nb', GaussianNB()),
    ('lr', LogisticRegression(max_iter=1000, random_state=42))
]

meta_learner = LogisticRegression(max_iter=1000, random_state=42)

In [19]:
stacking_classifier = StackingClassifier(estimators=base_classifiers, final_estimator=meta_learner)
stacking_classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [20]:
y_pred = stacking_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Stacking Classifier:", accuracy)

Accuracy of Stacking Classifier: 1.0


## 5. Implement bagging method to train machine learning on SMS spam detection.

In [23]:
df = pd.read_csv(r"https://raw.githubusercontent.com/bijay2051/ML/main/spam.csv", encoding='ISO-8859-1')

In [None]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [27]:
import pandas as pd

file_path = 'https://raw.githubusercontent.com/bijay2051/ML/main/spam.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [28]:
X = data['v2']
y = data['v1']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
base_classifier = DecisionTreeClassifier()
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=100, random_state=42)

bagging_classifier.fit(X_train, y_train)

y_pred = bagging_classifier.predict(X_test)

In [30]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
f1 = f1_score(y_test, y_pred, pos_label='spam')

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Accuracy: 0.97
Precision: 0.92
Recall: 0.88
F1 Score: 0.90


Here, we get accuray:0.97, its better result.
