In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
from flask import Flask, request, jsonify

In [3]:
# Step 1: Load and preprocess the dataset
def load_and_preprocess_data(file_path):
    df = pd.read_csv(file_path)
    # Map labels to binary: 0 for benign, 1 for malicious
    df['type'] = df['type'].apply(lambda x: 0 if x == 'benign' else 1)
    return df

In [5]:
# Step 2: Define a custom tokenizer for URLs
def url_tokenizer(url):
    # Split on '/', '-', '.'
    tokens = re.split('[/-]', url)
    return [token for token in tokens if token]

In [7]:
# Step 3: Extract features using TF-IDF
def extract_features(urls):
    vectorizer = TfidfVectorizer(tokenizer=url_tokenizer)
    features = vectorizer.fit_transform(urls)
    return features, vectorizer

In [9]:
# Step 4: Train the model
def train_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LogisticRegression(random_state=0)
    model.fit(X_train, y_train)
    
    # Evaluate the model
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    
    return model, X_test, y_test

In [11]:
# Step 5: Save the model and vectorizer
def save_model_and_vectorizer(model, vectorizer, model_path='url_classifier.pkl', vectorizer_path='vectorizer.pkl'):
    joblib.dump(model, model_path)
    joblib.dump(vectorizer, vectorizer_path)

In [13]:
# Main execution
# Replace with the path to your dataset
dataset_path = 'malicious_phish.csv'

# Load and preprocess data
df = load_and_preprocess_data(dataset_path)

# Extract features
X, vectorizer = extract_features(df['url'])
y = df['type']

# Train model
model, X_test, y_test = train_model(X, y)

# Save model and vectorizer
save_model_and_vectorizer(model, vectorizer)



Accuracy: 0.9223888389806433
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.99      0.94     85778
           1       0.98      0.79      0.87     44461

    accuracy                           0.92    130239
   macro avg       0.94      0.89      0.91    130239
weighted avg       0.93      0.92      0.92    130239

