In [None]:
import pandas as pd
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.metrics import precision_score, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

# Read the modelling dataset; the path is resolved relative to the working directory.
data = pd.read_csv("T124OPPE2_ModelBuilding_V1.csv")

# Ordered (unshuffled) 70/30 split: the leading 70% of rows become the training
# partition and the remaining rows become the test partition.
train_size = int(len(data) * 0.7)
train_data, test_data = data.iloc[:train_size], data.iloc[train_size:]

# In each partition the last column is the target and every other column is a feature.
X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]
X_test, y_test = test_data.iloc[:, :-1], test_data.iloc[:, -1]

In [None]:
# Perceptron limited to a single pass per fit() call. Because warm_start=True,
# every later fit() starts from the weights left by the previous call, so the
# loop below steps through five consecutive passes over the training data.
perceptron_params = dict(
    eta0=1,
    max_iter=1,
    penalty=None,
    shuffle=False,
    fit_intercept=True,
    validation_fraction=0.1,
    warm_start=True,
    random_state=1729,
)
perceptron = Perceptron(**perceptron_params)

# Report the intercept (bias) after each pass.
for epoch in range(1, 6):
    perceptron.fit(X_train, y_train)
    print(f"Iteration {epoch}, Bias: {perceptron.intercept_}")

Iteration 1, Bias: [-4.]
Iteration 2, Bias: [-3.]
Iteration 3, Bias: [-3.]
Iteration 4, Bias: [-4.]
Iteration 5, Bias: [-3.]




In [None]:
# Precision of the fitted perceptron on the training data, treating label 1 as
# the positive class.
y_train_pred = perceptron.predict(X_train)
precision = precision_score(y_true=y_train, y_pred=y_train_pred, pos_label=1)
print(f"Precision: {precision:.2f}")

Precision: 0.67


In [None]:
import numpy as np
from sklearn.metrics import log_loss

# Logistic-loss classifier trained by SGD with a fixed step size (eta0).
# alpha=0 gives the L2 penalty term zero weight.
sgd = SGDClassifier(
    loss='log_loss',
    penalty='l2',
    alpha=0,
    learning_rate='constant',
    eta0=0.001,
    warm_start=True,
    random_state=1729,
)

# Five incremental partial_fit() updates; after each one, score the model on
# the training data with log loss computed from its predicted probabilities.
for epoch in range(1, 6):
    sgd.partial_fit(X_train, y_train, classes=[0, 1])
    y_train_pred = sgd.predict_proba(X_train)
    loss = log_loss(y_train, y_train_pred)
    print(f"Iteration {epoch}, Loss: {loss:.3f}")


Iteration 1, Loss: 0.253
Iteration 2, Loss: 0.208
Iteration 3, Loss: 0.194
Iteration 4, Loss: 0.187
Iteration 5, Loss: 0.183


In [None]:
# Candidate regularisation strengths (alpha) and constant learning rates (eta0).
param_grid = dict(
    alpha=[0.0001, 0.0005, 0.001, 0.005],
    eta0=[0.01, 0.05, 0.1, 0.5],
)

# Exhaustive search over the 4 x 4 grid, scored by 5-fold cross-validated accuracy.
grid_search = GridSearchCV(
    estimator=SGDClassifier(loss='log_loss', learning_rate='constant', random_state=1729),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best alpha: {best_params['alpha']}, Best eta0: {best_params['eta0']}")


Best alpha: 0.0001, Best eta0: 0.01


In [None]:
# Reuse the hyper-parameters chosen by the grid search above.
best_alpha = grid_search.best_params_['alpha']
best_eta0 = grid_search.best_params_['eta0']

# Same SGD logistic model as in the grid search, but with per-class weights
# that up-weight class 1 (2) relative to class 0 (0.1).
weighted_sgd = SGDClassifier(
    loss='log_loss',
    learning_rate='constant',
    random_state=1729,
    alpha=best_alpha,
    eta0=best_eta0,
    class_weight={0: 0.1, 1: 2}
)

weighted_sgd.fit(X_train, y_train)
y_test_pred = weighted_sgd.predict(X_test)

# Count test rows that are class 1 and were also predicted as class 1.
# Both sides are compared as plain NumPy arrays so the match is explicitly
# positional, and the reduction is vectorised instead of using the builtin
# sum(), which walks a pandas Series one element at a time in Python.
true_positive_mask = (y_test_pred == 1) & (y_test.to_numpy() == 1)
correct_predictions = int(true_positive_mask.sum())
print(f"Correctly predicted class 1 samples: {correct_predictions}")

Correctly predicted class 1 samples: 47


In [None]:
# RBF-kernel support vector classifier with C=1, fitted on the training features as-is.
svm = SVC(C=1, kernel='rbf', decision_function_shape='ovr', random_state=1729)
svm.fit(X_train, y_train)

# Confusion matrix on the held-out rows: rows are true labels, columns are predictions.
y_test_pred = svm.predict(X_test)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
print(f"Confusion Matrix:\n{conf_matrix}")

Confusion Matrix:
[[1142    0]
 [  58    0]]


In [None]:
# Entropy-based decision tree using random split thresholds. A node is only
# split if it holds at least 4 samples and the split reduces impurity by at
# least 0.0001.
tree = DecisionTreeClassifier(
    criterion='entropy',
    splitter='random',
    min_samples_split=4,
    min_impurity_decrease=0.0001,
    random_state=1729
)

tree.fit(X_train, y_train)
print(f"Tree Depth: {tree.get_depth()}")
print(f"Number of Nodes: {tree.tree_.node_count}")

# Entropy at the left child of the root.
# The child's node id is read from children_left[0] (node 0 is the root)
# instead of assuming it is stored at index 1, so the lookup does not depend
# on the order in which the builder happens to number nodes.
left_child_id = tree.tree_.children_left[0]
if left_child_id < 0:
    # A negative child id marks a leaf: the tree never split, so there is no
    # left child to report on.
    raise ValueError("Root node is a leaf; it has no left child.")

# NOTE: despite its name, `root_entropy` holds the impurity of the root's
# LEFT CHILD, not of the root itself. The name is kept unchanged so any other
# cell that reads it keeps working.
root_entropy = tree.tree_.impurity[left_child_id]
print(f"Entropy at left child of root: {root_entropy}")

Tree Depth: 20
Number of Nodes: 515
Entropy at left child of root: 0.024564134553940277


In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Base learners that will each be wrapped in a bagging ensemble.
models = dict(
    DecisionTree=DecisionTreeClassifier(random_state=1729),
    LogisticRegression=LogisticRegression(random_state=1729),
    KNeighbors=KNeighborsClassifier(),
)

# For every base learner: fit a 20-member bagging ensemble, then report its
# train/test scores (the estimator's default score) and the absolute gap
# between the two.
for name, model in models.items():
    bagging = BaggingClassifier(estimator=model, n_estimators=20, random_state=1729)
    bagging.fit(X_train, y_train)

    train_score, test_score = (
        bagging.score(X_train, y_train),
        bagging.score(X_test, y_test),
    )
    print(f"{name} - Train score: {train_score:.4f}, Test score: {test_score:.4f}")
    print(f"{name}: Train-Test Absolute Difference: {abs(train_score - test_score):.4f}")


DecisionTree - Train score: 0.9939, Test score: 0.9442
DecisionTree: Train-Test Absolute Difference: 0.0498
LogisticRegression - Train score: 0.9514, Test score: 0.9517
LogisticRegression: Train-Test Absolute Difference: 0.0002
KNeighbors - Train score: 0.9521, Test score: 0.9508
KNeighbors: Train-Test Absolute Difference: 0.0013
