In [None]:
# Inject a CSS rule that forces the ".container" element to 100% width,
# so notebook cells span the whole browser window.
from IPython.display import display, HTML
display(HTML(data="<style>.container { width:100% !important; }</style>"))

In [None]:
import pandas as pd
import time

### data collection

In [None]:
# Read the phishing dataset CSV into a DataFrame for a first look at the raw data.
df = pd.read_csv("../Chapter03/phishing_dataset.csv")

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
def load_data(data_path):
    """Read a CSV file and split it into features and target.

    Parameters
    ----------
    data_path : str or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a DataFrame holding every column except
        ``"id"`` and ``"CLASS_LABEL"`` (original column order preserved) and
        ``y`` is the ``"CLASS_LABEL"`` column as a Series.

    Raises
    ------
    KeyError
        If the file has no ``"CLASS_LABEL"`` column.
    """
    frame = pd.read_csv(data_path)
    # Columns that must never be used as model inputs.
    excluded = {"id", "CLASS_LABEL"}
    feature_names = [name for name in frame.columns if name not in excluded]
    return frame[feature_names], frame["CLASS_LABEL"]

In [None]:
# Split the dataset into the feature matrix X and the target y.
X, y = load_data("../Chapter03/phishing_dataset.csv")

In [None]:
# Preview the first rows of the features.
X.head()

In [None]:
# Preview the first target values.
y.head()

In [None]:
# Summary statistics for every feature column, regardless of dtype.
X.describe(include="all")

### data cleaning

In [None]:
# Data type of each feature column.
X.dtypes

In [None]:
# Data type of the target.
y.dtypes

In [None]:
# For each feature column: does it contain at least one missing value?
X.isnull().any(axis=0)

In [None]:
# Does the target contain at least one missing value?
y.isnull().any(axis=0)

In [None]:
# Drop rows whose feature values repeat an earlier row (the first occurrence is kept).
# The mask is computed once, before X or y is rebound, so both are filtered with
# exactly the same rows and the result does not depend on the order of the two
# assignments below. Indices are renumbered from 0 afterwards.
duplicate_mask = X.duplicated()
y = y[~duplicate_mask].reset_index(drop=True)
X = X[~duplicate_mask].reset_index(drop=True)

In [None]:
# (rows, columns) of the feature matrix at this point in the notebook.
X.shape

In [None]:
# Min-max scale each column of X: subtract the column minimum and divide by the
# column range. eps keeps the division defined for constant columns (max == min),
# which come out as 0.
eps = 1e-10
X_min = X.min()
X = (X - X_min) / (X.max() - X_min + eps)

### model selection, training, and evaluation

#### logistic regression

In [None]:
# Time the whole cell: loading, splitting, scaling, training and evaluation.
start_time = time.time()

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

# Reload the features and target from disk.
# NOTE(review): this rebinds X and y, replacing the de-duplicated and min-max-scaled
# versions built in the data cleaning cells, so everything below runs on the file
# contents as loaded — confirm that skipping the cleaning here is intended.
X, y = load_data("../Chapter03/phishing_dataset.csv")
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# initialize a MinMaxScaler object
scaler = MinMaxScaler()

# Fit the scaler on the training set only, then scale the training set with it.
X_train_scaled = scaler.fit_transform(X_train)

# Scale the testing set with the statistics learned from the training set.
X_test_scaled = scaler.transform(X_test)

# Create a logistic regression model
model = LogisticRegression (C=1.0, max_iter=1000)

# Train the model on the scaled training split of the phishing dataset
model.fit(X_train_scaled, y_train)

# Predict the outcomes for the testing set
y_pred = model.predict(X_test_scaled)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("Total time:", time.time()-start_time)

#### random search

In [None]:
# Time the hyperparameter search plus the final evaluation.
start_time = time.time()

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Search space: C is drawn uniformly from [0, 10] (uniform(loc=0, scale=10)) and
# max_iter is an integer drawn from [500, 2000).
param_space = {'C': uniform(0, 10), 'max_iter': randint(500, 2000)}

# Base estimator for the search.
# LogisticRegression is not imported in this cell; it comes from the logistic regression cell.
model = LogisticRegression()

# Try 10 randomly sampled parameter combinations, each evaluated with 5-fold cross-validation.
random_search = RandomizedSearchCV(model, param_distributions=param_space, n_iter=10, cv=5, random_state=42)

# Run the search on the scaled training set created in the logistic regression cell.
random_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print('Best hyperparameters:', random_search.best_params_)

# Predict the outcomes for the testing set using the best model
y_pred = random_search.predict(X_test_scaled)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Keep the best estimator found by the search under the name `model`
# (a plain assignment; nothing is returned from a cell).
model = random_search.best_estimator_

print("Total time:", time.time()-start_time)

#### knn

In [None]:
# Time the hyperparameter search plus the final evaluation.
start_time = time.time()

from sklearn.neighbors import KNeighborsClassifier

# Search space: n_neighbors is an integer drawn from [1, 30); weights is one of the two listed options.
# randint is not imported in this cell; it comes from the random search cell.
param_dist = {'n_neighbors': randint(1, 30),
              'weights': ['uniform', 'distance']}

# Create a KNN model
knn = KNeighborsClassifier()

# Try 10 randomly sampled parameter combinations, each evaluated with 5-fold cross-validation.
rand_search = RandomizedSearchCV(knn, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)

# Fit the randomized search to the scaled training data
rand_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print('Best hyperparameters:', rand_search.best_params_)

# Predict the outcomes for the scaled testing set using the best model
y_pred = rand_search.predict(X_test_scaled)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Keep the best estimator found by the search under the name `model`
# (a plain assignment; nothing is returned from a cell).
model = rand_search.best_estimator_

print("Total time:", time.time()-start_time)

#### decision tree

In [None]:
# Time the hyperparameter search plus the final evaluation.
start_time = time.time()

from sklearn.tree import DecisionTreeClassifier

# Search space: max_depth is an integer drawn from [1, 30).
param_dist = {'max_depth': randint(1, 30)}

# Create a Decision Tree model (fixed random_state for repeatable trees)
dt = DecisionTreeClassifier(random_state=42)

# Try 10 randomly sampled depths, each evaluated with 5-fold cross-validation.
rand_search = RandomizedSearchCV(dt, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)

# Fit the randomized search to the training data.
# Note: this cell uses the unscaled split (X_train / X_test), not the *_scaled arrays.
rand_search.fit(X_train, y_train)

# Print the best hyperparameters
print('Best hyperparameters:', rand_search.best_params_)

# Predict the outcomes for the testing set using the best model
y_pred = rand_search.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Keep the best estimator found by the search under the name `model`
# (a plain assignment; nothing is returned from a cell).
model = rand_search.best_estimator_

print("Total time:", time.time()-start_time)

#### random forest

In [None]:
# Time the hyperparameter search plus the final evaluation.
start_time = time.time()

from sklearn.ensemble import RandomForestClassifier

# Search space: max_depth is an integer from [1, 30), n_estimators an integer from [10, 100).
param_dist = {'max_depth': randint(1, 30), 'n_estimators': randint(10, 100)}

# Create a Random Forest model (fixed random_state for repeatable forests)
rf = RandomForestClassifier(random_state=42)

# Try 10 randomly sampled parameter combinations, each evaluated with 5-fold cross-validation.
rand_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)

# Fit the randomized search to the training data.
# Note: this cell uses the unscaled split (X_train / X_test), not the *_scaled arrays.
rand_search.fit(X_train, y_train)

# Print the best hyperparameters
print('Best hyperparameters:', rand_search.best_params_)

# Predict the outcomes for the testing set using the best model
y_pred = rand_search.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Keep the best estimator found by the search under the name `model`
# (a plain assignment; nothing is returned from a cell).
model = rand_search.best_estimator_

print("Total time:", time.time()-start_time)

#### xgboost

In [None]:
# Time the hyperparameter search plus the final evaluation.
start_time = time.time()

from xgboost import XGBClassifier

# Search space: max_depth is an integer from [1, 30), learning_rate is picked from the
# listed values, and n_estimators is an integer from [10, 100).
param_dist = {'max_depth': randint(1, 30),
              'learning_rate': [0.01, 0.05, 0.1, 0.25, 0.5],
              'n_estimators': randint(10, 100)}

# Create an XGBoost model
xgb = XGBClassifier(random_state=42)

# Try 10 randomly sampled parameter combinations, each evaluated with 5-fold cross-validation.
rand_search = RandomizedSearchCV(xgb, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)

# Fit the randomized search to the training data.
# Note: this cell uses the unscaled split (X_train / X_test), not the *_scaled arrays.
rand_search.fit(X_train, y_train)

# Print the best hyperparameters
print('Best hyperparameters:', rand_search.best_params_)

# Predict the outcomes for the testing set using the best model
y_pred = rand_search.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Keep the best estimator found by the search under the name `model`
# (a plain assignment; nothing is returned from a cell).
model = rand_search.best_estimator_

print("Total time:", time.time()-start_time)

#### kmeans

In [None]:
# Time scaling, the hyperparameter search and the final evaluation.
start_time = time.time()

import numpy as np
from sklearn.cluster import KMeans

# initialize a MinMaxScaler object
# (this rebinds `scaler`, replacing the one fitted on X_train in the logistic regression cell)
scaler = MinMaxScaler()

# Fit the scaler on all rows of X and scale them; this cell does not use the train/test split.
X_scaled = scaler.fit_transform(X)

# Candidate max_iter values: 50, 100, ..., 2000.
param_space = {'max_iter': np.arange(50, 2001, 50)}

# Create a KMeans model with two clusters
kmeans = KMeans(n_clusters=2, random_state=42)

# Try 10 randomly sampled max_iter values, each evaluated with 5-fold cross-validation.
random_search = RandomizedSearchCV(kmeans, param_distributions=param_space, n_iter=10, cv=5, random_state=42)

# Run the search on the scaled data. No y and no `scoring` argument are given, so
# candidates are ranked with the estimator's own score() method.
random_search.fit(X_scaled)

# Print the best hyperparameters
print('Best hyperparameters:', random_search.best_params_)

# Assign every row to a cluster using the best model
y_pred = random_search.predict(X_scaled)

# Cluster ids are arbitrary, so compare y against both the cluster ids and the flipped
# ids (1-y_pred) and report the better match.
# NOTE(review): this assumes CLASS_LABEL is coded as 0/1 — confirm.
accuracy = max(accuracy_score(y, y_pred), accuracy_score(y, (1-y_pred)))
print("Accuracy:", accuracy)

# Keep the best estimator found by the search under the name `model`
# (a plain assignment; nothing is returned from a cell).
model = random_search.best_estimator_

print("Total time:", time.time()-start_time)

#### pca

In [None]:
# Time the hyperparameter search plus the final evaluation.
start_time = time.time()

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Two-step pipeline: project the features onto 5 principal components, then classify
# the projected data with XGBoost.
pipe = Pipeline([
    ('pca', PCA(n_components=5)),
    ('xgb', XGBClassifier())
])

# Search space for the classifier step only; the "xgb__" prefix routes each parameter
# to the pipeline step named 'xgb'.
param_dist = {'xgb__max_depth': randint(1, 30),
              'xgb__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
              'xgb__n_estimators': randint(10, 100)}

# Try 10 randomly sampled parameter combinations, each evaluated with 5-fold cross-validation.
rand_search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)

# Fit the randomized search to the scaled training data
rand_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print('Best hyperparameters:', rand_search.best_params_)

# Predict the outcomes for the scaled testing set using the best pipeline
y_pred = rand_search.predict(X_test_scaled)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Keep the two fitted steps of the best pipeline. They are looked up by step name
# rather than by unpacking the pipeline positionally, so the assignment stays correct
# if steps are added to or reordered in `pipe`.
best_pca = rand_search.best_estimator_.named_steps['pca']
best_xgb = rand_search.best_estimator_.named_steps['xgb']

print("Total time:", time.time()-start_time)

#### model deployment

In [None]:
def load_data_simple(data_path, feature_columns=("NumDash",), label_column="CLASS_LABEL"):
    """Read a CSV file and return a small feature set plus the target.

    With the default arguments this returns the single ``"NumDash"`` feature and
    the ``"CLASS_LABEL"`` target, exactly as before; the extra parameters allow
    other reduced feature sets to be loaded without writing another loader.

    Parameters
    ----------
    data_path : str or file-like
        Anything accepted by ``pd.read_csv``.
    feature_columns : str or iterable of str, optional
        Column name(s) to use as features. A single string is treated as one
        column name.
    label_column : str, optional
        Name of the target column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a DataFrame with the requested feature columns
        (in the given order) and ``y`` is the target Series.

    Raises
    ------
    KeyError
        If a requested column is not present in the file.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(feature_columns, str):
        feature_columns = [feature_columns]
    df = pd.read_csv(data_path)
    # Index with a list so X is always a DataFrame, even for one feature.
    X = df[list(feature_columns)]
    y = df[label_column]
    return X, y

##### train simpler model

In [None]:
# Time loading, splitting, training and evaluation of the single-feature model.
start_time = time.time()

# Reload the data with only the NumDash feature; this rebinds X, y and the
# train/test split used by the earlier model cells.
X, y = load_data_simple("../Chapter03/phishing_dataset.csv")
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fixed hyperparameters; no search is run in this cell.
# NOTE(review): presumably the best values reported by the earlier XGBoost search — confirm.
params = {'max_depth': 7,
          'learning_rate': 0.5,
          'n_estimators': 92}

# Create an XGBoost model (rebinds `xgb`, replacing the estimator from the xgboost cell)
xgb = XGBClassifier(**params)

# Train the model on the phishing dataset
xgb.fit(X_train, y_train)

# Predict the outcomes for the testing set
y_pred = xgb.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("Total time:", time.time()-start_time)

##### save and load model using joblib and make prediction

In [None]:
import joblib
# numpy is imported here so the cell does not depend on the kmeans cell having run.
import numpy as np
import pandas as pd

# Save the trained model
joblib.dump(xgb, "xgb_model.joblib")

# Load the saved model back (rebinds `xgb` to the deserialized copy)
xgb = joblib.load("xgb_model.joblib")

# A single sample with one feature value (NumDash = 1), shaped (1, 1).
numdash_data = [[1]]
X_numdash = np.array(numdash_data)

# Time only the prediction call. perf_counter() is the high-resolution clock meant for
# measuring short durations; time.time() can be too coarse for a single prediction.
start_time = time.perf_counter()
# Make predictions on the NumDash data
predictions = xgb.predict(X_numdash)
print("Total time:", time.perf_counter()-start_time)

# Print the predictions
print(predictions)

##### save and load model using pickle and make prediction

In [None]:
import pickle
# numpy is imported here so the cell does not depend on the kmeans cell having run.
import numpy as np

# Save the trained model as a pickle file
with open('xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb, f)

# Load the saved model.
# Unpickling executes code stored in the file; that is acceptable here because the file
# was written by this cell a moment ago. Never unpickle files from an untrusted source.
with open('xgb_model.pkl', 'rb') as f:
    model = pickle.load(f)

# A single sample with one feature value (NumDash = 10), shaped (1, 1).
numdash_data = [[10]]
X_numdash = np.array(numdash_data)

# Time only the prediction call. perf_counter() is the high-resolution clock meant for
# measuring short durations; time.time() can be too coarse for a single prediction.
start_time = time.perf_counter()
# Make predictions using the loaded model
y_pred = model.predict(X_numdash)
print("Total time:", time.perf_counter()-start_time)

# Print the predictions
print(y_pred)

##### make prediction using rules

In [None]:
# Time a rule-based "prediction" for comparison with the model-based cells.
# perf_counter() is the high-resolution clock meant for measuring short durations;
# a single comparison finishes far below the resolution time.time() can guarantee.
start_time = time.perf_counter()
# No model is involved here: the prediction is a hard-coded comparison.
# NOTE(review): 1>2 is a constant expression that always evaluates to False; presumably
# it stands in for a threshold rule on the NumDash value — confirm.
y_pred = 1>2
print("Total time:", time.perf_counter()-start_time)