# URL Maliciousness Model (LogReg primary, RF fallback)

In [ ]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from joblib import dump
from pathlib import Path
import json
from backend.ml.feature_extractor import extract_features_from_url

# Train a URL maliciousness classifier and persist the model, the feature
# schema, and a small sample of test-set predictions.

# Load both datasets and label them: 1 = malicious, 0 = valid.
mal = pd.read_csv('model/data/kaggle_malicious_samples.csv')
val = pd.read_csv('model/data/valid_urls_seed.csv')
mal['label'] = 1
val['label'] = 0
df = pd.concat([mal, val], ignore_index=True)

# Each URL is mapped to a feature dict; the key order of the first dict fixes
# the column order of the matrix and is what gets saved as the schema below.
feature_dicts = df['url'].apply(extract_features_from_url)
feat_names = list(feature_dicts.iloc[0].keys())
X = np.stack(feature_dicts.apply(lambda d: [d[k] for k in feat_names]).to_numpy())
y = df['label'].to_numpy()
urls = df['url'].to_numpy()

# The URLs are split together with X and y so that urls_test[i] is the URL
# behind X_test[i]. The split shuffles rows, so positions in `df` do NOT
# correspond to positions in the test set. Passing an extra array does not
# change which rows land in train/test for a given random_state.
X_train, X_test, y_train, y_test, urls_train, urls_test = train_test_split(
    X, y, urls, test_size=0.2, random_state=42, stratify=y
)

logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# NOTE: the random forest is fitted here but is neither evaluated nor saved
# in this cell; only the logistic regression model is persisted.
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

pred = logreg.predict(X_test)
acc = accuracy_score(y_test, pred)
print('Accuracy:', acc)
print('Confusion:', confusion_matrix(y_test, pred).tolist())

# Persist the model and the feature order it was trained with.
Path('backend/ml').mkdir(parents=True, exist_ok=True)
dump(logreg, 'backend/ml/url_model.pkl')
with open('backend/ml/feature_schema.json', 'w', encoding='utf-8') as f:
    json.dump({'features': feat_names}, f)

# Write up to 20 (url, prediction) pairs taken from the test set. to_csv
# quotes fields as needed, so URLs containing commas or quotes stay in a
# single column instead of corrupting the file.
n_samples = min(20, len(X_test))
sample_predictions = pd.DataFrame({
    'url': urls_test[:n_samples],
    'pred': pred[:n_samples].astype(int),
})
sample_predictions.to_csv('model/sample_predictions.csv', index=False)
print('Saved model and artifacts')
