# Index
- [Setup](#Setup)
- Classification
    - [Naive Bayes](#Naive-Bayes)
    - [Decision Tree](#Decision-Tree)
- [Conclusion](#conclusion)


# Setup

In [7]:
import pandas as pd
import numpy as np

## Naive Bayes

In [1]:
# Pros: simple and fast, efficient for high dim data, low training time, binary or MC,
# high bias low variance - less prone to overfitting (feature independence), interpretable,
# numeric and categorical, 
# ensemble friendly but unlikely, online or batch
 
# cons
# sensitive to noise in data, bad for imbalanced data, 

# Pitfalls and troubleshoot
# laplace smoothing

# assumptions
# assumes indep features, 

# preprocessing needed - missing values, categs, 

# variants
# clf = MultinomialNB(class_prior=class_priors) - MultinomialNB has no class_weight parameter; set class priors here or pass sample_weight to fit()
# clf = MultinomialNB(alpha=1) - laplace smoothing

In [2]:
# Implementation
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Toy dataset: five 2-feature samples with binary labels (0 or 1).
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
y = np.array([0, 0, 1, 1, 1])

# Hold out part of the data for evaluation. The fit/predict calls below (and
# the later model cells) rely on these four names being defined.
# stratify=y keeps both classes present in the train and the test split;
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)

# Gaussian Naive Bayes: fit on the training split, score on the held-out split.
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

NameError: name 'np' is not defined

[Back to the top](#Index)

## Decision Tree

In [None]:
# Pros: 
# interpretable (white-box), less preprocessing (scaling, outlier, categ), non-linear relationships: decision boundary
# offers feature importance, classification & regression
# binary or mc, compute efficient, smooth bias variance tradeoff, somewhat robust to noise,

# cons
# overfitting when not pruned, high variance, bias toward dominant class, 
# less stable (sensitive dependent on training data, different tree), ~ greedy algorithm
# batch, but online possible

# Pitfalls and troubleshoot
# overfitting - prune and limit depth or increase min samples per split/leaf,
# feature importance may be distorted for correlated fields - permutation importance or SHAP values

# assumptions: Feature Independence (but ok), Binary Splits, Recursive Splittings,
# Sequential Decision-Making

# variants: non-binary splits, 

# Notes:
# algo, feature importace, imbalanced, 
# weighted for minority class,
# clf = DecisionTreeClassifier(class_weight={0: 1.0, 1: 10.0})

In [3]:
# choice of parameters

In [None]:
# Implementation
from sklearn.tree import DecisionTreeClassifier

# Seeded like the other tree-based models in this notebook (random_state=42)
# so repeated runs build the same tree.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)


In [None]:
#### RF ####
# Pros: high accuracy, outlier and noise robust, feature importance, handles missing data, 
# non-linear relationships, compute friendly given parallel, less overfitting, 
# non-parametric: no prior distribution assumed,
 
# cons
# high memory, not interpretable, needs large dataset preferably, 
# inflated feature importance - Bias Toward Features with Many Categories
# sensitive towards imbalanced data

# assumptions
# Independence of Trees, randomness, ensemble principle, 

# preprocessing needed
# multicollinearity

# variants
# clf = RandomForestClassifier(class_weight={0: 1.0, 1: 2.0, 2: 1.5})
# ensemble pruning - discard poor trees

# Implementation
from sklearn.ensemble import RandomForestClassifier

# Forest of 100 trees, seeded for repeatable results. fit() returns the
# estimator itself, so construction and training are chained.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100).fit(
    X_train, y_train
)
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)


In [None]:
#### k-Nearest Neighbors ####
# Pros: simple, no assumptions of underlying data - non-param, cls/reg, streaming-no training time, 
# Pros: Interpretable, multiclass; robust to outliers only when k is large enough
# binary or multiclass, 

# cons: sensitive to k, compute efficiency (approx nearest n, KD tree, ball tree, local hash) - multicore, dim redn,  GPU,  
# sensitive to class imbalance, sensitive to outliers, missing imputation needed
# sensitive to noise if small k, batch >> online

# Pitfalls and troubleshoot
# use with ensemble

# assumptions: all features are equally important

# variants: approx nearest neighbors?

# Notes: scaling needed. K value: CV/grid-search, domain knowledge, odd number
# Small "k" (Low Bias, High Variance): overfitting, sensitive to noise, captures strong local patterns
# anomaly detection: large distances
# weights: distance, uniform, rank

# Implementation
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# imbalanced class: KNeighborsClassifier has no class_weight parameter, so
# per-class weights cannot be passed to the constructor (doing so raises a
# TypeError). Options are distance-weighted voting, as in the variant below,
# or resampling the training data.
# knn_classifier = KNeighborsClassifier(n_neighbors=5, weights='distance')

# Classification: majority vote among the 3 nearest training samples.
knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(X_train, y_train)
y_pred = knn_classifier.predict(X_test)

# Regression variant: prediction from the 3 nearest targets.
# Note that y_pred is overwritten with the regressor's output here.
knn_regressor = KNeighborsRegressor(n_neighbors=3)
knn_regressor.fit(X_train, y_train)
y_pred = knn_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)


In [None]:
#### Support Vector Machines ####
# Pros
# high dimensions, binary and MC, classification and regression, 
# resistant to overfitting, especially when regularised (tune C parameter)
# global optimum, non linear relationships well handled with complex decision boundary maximising margin (RBF), 
# immune to outliers and noise,  
# ensemble friendly but unlikely, online or batch

# cons
# black-box
# compute costly, memory intensive, parameter tuning dependent, 
# not so good for MC problems, not interpretable, 
# class imbalance sensitive, 

# assumptions
# linearly separable boundary (kernel trick), max margin (hyperplane and boundary points, )

# preprocessing needed
# scaling must

# variants
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC

# One SVC per kernel so the kernels can be compared on the same split.
classifiers = {
    "Linear": SVC(kernel='linear'),
    "Polynomial": SVC(kernel='poly', degree=3),  # You can adjust the degree parameter
    "RBF": SVC(kernel='rbf')
}
# Record the test accuracy of each kernel; without this the loop's results
# are overwritten on every iteration and nothing can be compared afterwards.
kernel_accuracy = {}
for kernel_name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    kernel_accuracy[kernel_name] = accuracy_score(y_test, y_pred)

# svm_classifier = SVC(kernel='linear', class_weight={0: 1.0, 1: 1.0, 2: 2.0})

# notes:
# cost parameter: small C wide margin underfit, large C overfit

# Implementation
svm_classifier = SVC(kernel='rbf', C=1, gamma='scale')
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

# Evaluation of the RBF model fitted just above.
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [None]:
#### Multinomial Logistic Regression ####
# Pros: interpretable, compute efficient, converges quickly and online friendly, 
# L1 and L2 regularization possible, ensemble friendly,
# good balance of bias and variance, 

# cons
# linear relationships need transformation, not readily suitable for complex non-linear data
# sensitive to outliers, requires feature engineering
# parametric

# assumptions
# features are independent so are errors, 
# linearity of logit function, no multicollinearity, 

# preprocessing needed: missing, outlier, feature selection, categ encoding, 
# scaling optional but recommended for faster convergence?, 

# variants
# model = LogisticRegression(class_weight={0: 1, 1: 5})

# Implementation
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Standardize your features (optional but can be helpful)
# The scaler is fitted on the training split only and then applied to the
# test split. Scaled copies get their own names so X_train / X_test stay
# untouched for the other cells that use them.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# multi_class is intentionally not passed: it is deprecated since
# scikit-learn 1.5, and the lbfgs solver already fits a multinomial model
# when there are three or more classes.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)



In [None]:
#### one vs rest (ovr) ####
# good for few classes. But creates artificial imbalance for larger classes
from sklearn.linear_model import LogisticRegression

# Use the labels actually present in y_train rather than assuming 0..k-1.
classes = np.unique(y_train)
num_classes = len(classes)

# One binary "this class vs everything else" model per class.
classifiers = []
for label in classes:
    y_binary = (y_train == label).astype(int)
    classifier = LogisticRegression()
    classifier.fit(X_train, y_binary)
    classifiers.append(classifier)

# Compare continuous confidence scores, not hard 0/1 predictions: with hard
# labels several (or no) classifiers can claim a sample, and the argmax then
# silently falls back to the first class.
# class_scores has one row per test sample and one column per class.
class_scores = np.column_stack(
    [classifier.decision_function(X_test) for classifier in classifiers]
)
final_predictions = classes[class_scores.argmax(axis=1)].tolist()


In [None]:
#### one vs one: requires sufficient data ####
from itertools import combinations

from sklearn.svm import SVC

# Every unordered pair of labels present in the training data gets its own
# binary classifier.
classes = np.unique(y_train)
class_pairs = list(combinations(classes, 2))

classifiers = []
for class_pair in class_pairs:
    # Filter the training data for the current class pair
    mask = (y_train == class_pair[0]) | (y_train == class_pair[1])
    X_pair = X_train[mask]
    y_pair = y_train[mask]

    # Train a binary classifier (e.g., Support Vector Machine)
    classifier = SVC(kernel='linear')
    classifier.fit(X_pair, y_pair)

    # Append the trained classifier to the list
    classifiers.append(classifier)

# Make predictions using all the binary classifiers
predictions = []
for classifier in classifiers:
    y_pred = classifier.predict(X_test)
    predictions.append(y_pred)

# Aggregate the binary classifier results to make the final prediction.
# Votes are tallied per column position, looked up through class_index, so
# labels do not have to be the integers 0..k-1. The tally is sized from
# y_train, the data the classifiers were trained on.
class_index = {label: position for position, label in enumerate(classes)}
votes = np.zeros((len(X_test), len(classes)), dtype=int)
for pair_predictions in predictions:
    # Each pairwise classifier predicts one of its two labels: that label
    # receives the vote for this sample.
    for row, winner in enumerate(pair_predictions):
        votes[row, class_index[winner]] += 1

# Ties go to the class that comes first in sorted label order.
final_predictions = classes[votes.argmax(axis=1)].tolist()


In [None]:
#### Bagging ####
# sample w replacement, train model on them
# parallel    
    
#### Boosting ####
# models are trained sequentially, and each new model focuses on the previously misclassified samples 

#### Boosting - Gradient Boosting Classification ####
# ensemble of decision trees sequentially, with each tree correcting the errors of the previous one. 
from sklearn.ensemble import GradientBoostingClassifier

# 100 depth-3 trees added with a 0.1 learning rate, seeded for repeatability.
# fit() returns the estimator, so construction and training are chained.
gbt_classifier = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=3,
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
y_pred = gbt_classifier.predict(X_test)

In [None]:
#### Boosting - Gradient Boosting Trees ####
# same as above
# n_estimators - no of iterations to boost
# learning_rate - 0.1 
# max_depth - 3
# random_state - None
# loss - log_loss (named 'deviance' before scikit-learn 1.1) or exponential
# subsample - fraction of samples
# min_samples_split - 
# min_samples_leaf - 
# max_features - sqrt, log2, none
# max_leaf_nodes - 
# n_iter_no_change and tol
    

In [None]:
#### Boosting - light GBM ####
import lightgbm as lgb

train_data = lgb.Dataset(X_train, label=y_train)
# Define your LightGBM parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}
clf = lgb.train(params, train_data, num_boost_round=100)

# With the 'binary' objective, predict() returns the probability of the
# positive class, not a label. Keep the probabilities and threshold them at
# 0.5 so y_pred holds class labels like in the other model cells.
y_pred_proba = clf.predict(X_test, num_iteration=clf.best_iteration)
y_pred = (y_pred_proba >= 0.5).astype(int)

# Commonly tuned parameters (notes only; kept as comments so the cell runs):
# boosting_type: gbdt, dart, goss
# objective: binary, multiclass, regression
# metric: binary_logloss, multi_logloss, rmse
# learning_rate
# n_estimators
# max_depth, num_leaves, min_child_samples
# subsample
# reg_alpha
# reg_lambda

In [None]:
#### LightGBM - strengths ####
# Efficient handling of large datasets
# Faster training speed
# Low memory usage
# GPU and multicore support
# Early stopping capability: faster convergence
#
# Handles categorical features, outliers and missing values
# Improved accuracy: leaf-first approach vs depth-first
# Flexibility: custom objective function
# Support for regularization: L1 and L2

In [None]:
#### Boosting - Gradient Boosting Decision Trees ####

#### Boosting - Gradient Boosting Regression Trees ####

#### Boosting - Gradient Boosting Machine ####

#### Boosting - Multiple Additive Regression Trees ####
    

In [None]:
#### Boosting - XG Boost ####
#Classification and Regression
#efficiency (multicore)
#accuracy
#robustness (dtypes, sizes)
#Missing values handled, k-fold and feature importance built in
#L1 and L2 regularization to control overfitting

# pip install xgboost
import xgboost as xgb

# DMatrix is the data container used by the native xgb.train API.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'reg:squarederror',  # Regression task
    'max_depth': 3,                  # Maximum depth of trees
    'learning_rate': 0.1,            # Learning rate
}
# 'n_estimators' belongs to the scikit-learn wrapper (XGBRegressor etc.) and
# is ignored when placed in the params dict of xgb.train. The number of
# boosting rounds (trees) is set with num_boost_round, which defaults to 10.
model = xgb.train(params, dtrain, num_boost_round=100)
y_pred = model.predict(dtest)

#### Boosting - ADA Boost ####
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Weak learner that AdaBoost re-weights and combines: a depth-1 decision
# tree ("stump"). It must be defined before it is handed to the ensemble.
base_classifier = DecisionTreeClassifier(max_depth=1)

adaboost_classifier = AdaBoostClassifier(base_classifier, n_estimators=50, random_state=42)
adaboost_classifier.fit(X_train, y_train)
y_pred = adaboost_classifier.predict(X_test)
