In [1]:
!pip install pyzbar
!apt-get install libzbar0


Collecting pyzbar
  Downloading pyzbar-0.1.9-py2.py3-none-any.whl.metadata (10 kB)
Downloading pyzbar-0.1.9-py2.py3-none-any.whl (32 kB)
Installing collected packages: pyzbar
Successfully installed pyzbar-0.1.9
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-droid-fallback fonts-noto-mono fonts-urw-base35 ghostscript gsfonts
  imagemagick-6-common libdjvulibre-text libdjvulibre21 libfftw3-double3
  libgs9 libgs9-common libidn12 libijs-0.35 libilmbase25 libjbig2dec0
  libjxr-tools libjxr0 liblqr-1-0 libmagickcore-6.q16-6
  libmagickcore-6.q16-6-extra libmagickwand-6.q16-6 libopenexr25 libv4l-0
  libv4lconvert0 libwmflite-0.2-7 poppler-data
Suggested packages:
  fonts-noto fonts-freefont-otf | fonts-freefont-ttf fonts-texgyre
  ghostscript-x libfftw3-bin libfftw3-dev inkscape poppler-utils
  fonts-japanese-mincho | fonts-ipafont-mincho fonts-japanese-gothic
  | fonts-ipafont-goth

## Decoder

In [2]:
from pyzbar.pyzbar import decode
import cv2

def decode_qr(image_path):
    # Read image
    img = cv2.imread(image_path)

    # Decode QR
    decoded = decode(img)

    if not decoded:
        return None  # No QR found

    # Extract decoded text
    data = decoded[0].data.decode("utf-8")
    return data


In [3]:
result = decode_qr("/content/drive/MyDrive/mendeley_qr/benign/qr_485_benign_images.png")
print(result)


https://www.edx.org/


In [4]:
def decode_qr_all(image_path):
    img = cv2.imread(image_path)
    decoded = decode(img)

    if not decoded:
        return {
            "success": False,
            "message": "No QR code detected.",
            "data": None
        }

    results = [obj.data.decode("utf-8") for obj in decoded]

    return {
        "success": True,
        "count": len(results),
        "data": results
    }


## Feature vector

In [5]:
!pip install tldextract


Collecting tldextract
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-3.0.1-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.3.0-py3-none-any.whl (107 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/107.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-3.0.1-py2.py3-none-any.whl (4.5 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-3.0.1 tldextract-5.3.0


In [6]:
import re
import tldextract
import urllib.parse
import numpy as np
from math import log2

In [7]:
def string_entropy(s):  # malicious urls will have high entropy
    prob = [float(s.count(c)) / len(s) for c in dict.fromkeys(list(s))]
    return -sum([p * log2(p) for p in prob])


## Feature extractor function

the function returns a dictionary of features useful for classifying a URL as safe or harmful

In [8]:
def extract_url_features(url):
    features = {}

    parsed = urllib.parse.urlparse(url)
    domain_info = tldextract.extract(url)

    hostname = parsed.netloc
    path = parsed.path

    # ----- Basic length features -----
    features["url_length"] = len(url)
    features["domain_length"] = len(hostname)
    features["path_length"] = len(path)

    # ----- Count-based features -----
    features["num_dots"] = url.count('.')
    features["num_slashes"] = url.count('/')
    features["num_dashes"] = url.count('-')
    features["num_plus"] = url.count('+')
    features["num_digits"] = sum(c.isdigit() for c in url)

    # ----- Suspicious keywords -----
    suspicious_keywords = [
        "login","verify","secure","update","bank","account",
        "reset","alert","confirm","billing","webscr","signin",
        "auth","wp-admin","approve"
    ]
    features["has_suspicious_keyword"] = int(
        any(k in url.lower() for k in suspicious_keywords)
    )

    # ----- Protocol -----
    features["is_https"] = int(url.startswith("https"))

    # ----- IP address detection -----
    ip_pattern = r"^\d{1,3}(\.\d{1,3}){3}$"
    features["has_ip_address"] = int(bool(re.match(ip_pattern, hostname)))

    # ----- Special characters -----
    features["num_special_chars"] = sum(c in "%@!#$&*" for c in url)

    # ----- TLD category -----
    features["is_suspicious_tld"] = int(domain_info.suffix in ["xyz","top","tk","ml","cf","gq"])

    # ----- URL entropy -----
    features["url_entropy"] = string_entropy(url)

    return features


converting the feature vector into np array to pass to ml model

In [9]:
def vectorize_features(feature_dict):
    return np.array(list(feature_dict.values())).reshape(1, -1)


testing the feature extractor

In [10]:
url = "https://www.google.com"
f = extract_url_features(url)
vector = vectorize_features(f)

print(f)
print(vector)


{'url_length': 22, 'domain_length': 14, 'path_length': 0, 'num_dots': 2, 'num_slashes': 2, 'num_dashes': 0, 'num_plus': 0, 'num_digits': 0, 'has_suspicious_keyword': 0, 'is_https': 1, 'has_ip_address': 0, 'num_special_chars': 0, 'is_suspicious_tld': 0, 'url_entropy': 3.6635327548042547}
[[22.         14.          0.          2.          2.          0.
   0.          0.          0.          1.          0.          0.
   0.          3.66353275]]


In [11]:
url = "http://login-verify-paypal.com.secure-check.xyz/update"
f = extract_url_features(url)
vector = vectorize_features(f)

print(f)
print(vector)


{'url_length': 54, 'domain_length': 40, 'path_length': 7, 'num_dots': 3, 'num_slashes': 3, 'num_dashes': 3, 'num_plus': 0, 'num_digits': 0, 'has_suspicious_keyword': 1, 'is_https': 0, 'has_ip_address': 0, 'num_special_chars': 0, 'is_suspicious_tld': 1, 'url_entropy': 4.4930548079520305}
[[54.         40.          7.          3.          3.          3.
   0.          0.          1.          0.          0.          0.
   1.          4.49305481]]


# Training the Model

In [12]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/balanced_urls.csv")

df

Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0
...,...,...,...
632503,xbox360.ign.com/objects/850/850402.html,malicious,1
632504,games.teamxbox.com/xbox-360/1860/Dead-Space/,malicious,1
632505,www.gamespot.com/xbox360/action/deadspace/,malicious,1
632506,en.wikipedia.org/wiki/Dead_Space_(video_game),malicious,1


In [13]:
# extracting features for each URL

feature_rows = []
for url, label in zip(df["url"], df["result"]):
    feats = extract_url_features(url)
    feats["label"] = label
    feature_rows.append(feats)

feature_df = pd.DataFrame(feature_rows)

X = feature_df.drop("label", axis=1)
y = feature_df["label"]

## Checking features and labels

In [14]:
X

Unnamed: 0,url_length,domain_length,path_length,num_dots,num_slashes,num_dashes,num_plus,num_digits,has_suspicious_keyword,is_https,has_ip_address,num_special_chars,is_suspicious_tld,url_entropy
0,22,14,0,2,2,0,0,0,0,1,0,0,0,3.663533
1,23,15,0,2,2,0,0,0,0,1,0,0,0,3.762267
2,24,16,0,2,2,0,0,0,0,1,0,0,0,3.855389
3,21,13,0,2,2,0,0,0,0,1,0,0,0,3.880180
4,25,17,0,2,2,0,0,0,0,1,0,0,0,3.813661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
632503,39,0,39,3,3,0,0,12,0,0,0,0,0,4.355539
632504,44,0,44,2,4,2,0,7,0,0,0,0,0,4.243300
632505,42,0,42,2,4,0,0,3,0,0,0,0,0,4.147921
632506,45,0,45,2,2,0,0,0,0,0,0,0,0,4.102313


In [15]:
y

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0
...,...
632503,1
632504,1
632505,1
632506,1


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
    stratify=y
)

In [17]:
from xgboost import XGBClassifier


xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=7,
    learning_rate=0.08,         # Balanced learning speed
    subsample=0.85,             # Good regularization
    colsample_bytree=0.85,      # Balanced feature sampling
    min_child_weight=3,         # Prevent overfitting
    gamma=0.1,                  # Additional regularization
    random_state=42,
    n_jobs=-1,                  # Use all CPU cores
    eval_metric='logloss',
    tree_method='hist',         # Faster for large datasets (600k+ rows)
    enable_categorical=False
)

In [19]:
xgb_model.fit(X_train, y_train)


## Evaluation

In [21]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = xgb_model.predict(X_test)

In [22]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     63251
           1       1.00      0.98      0.99     63251

    accuracy                           0.99    126502
   macro avg       0.99      0.99      0.99    126502
weighted avg       0.99      0.99      0.99    126502



In [23]:
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9906799892491819


In [26]:
import joblib

joblib.dump(xgb_model, "qrshield_url_model.pkl")
joblib.dump(list(X.columns), "qrshield_feature_order.pkl")

['qrshield_feature_order.pkl']

## Test the model

In [45]:
import joblib
import numpy as np

# Load saved model and feature order
model = joblib.load("qrshield_url_model.pkl")
feature_order = joblib.load("qrshield_feature_order.pkl")

# URL to test
test_url = "https://freegiftcards.com"

# Extract features
features = extract_url_features(test_url)

# Convert to vector based on saved feature order
vector = np.array([features[f] for f in feature_order]).reshape(1, -1)

# Predict
prediction = model.predict(vector)[0]
prob = model.predict_proba(vector)[0][1]

print("URL:", test_url)
print("Prediction:", "MALICIOUS" if prediction == 1 else "BENIGN")
print("Malicious Probability:", prob)


URL: https://freegiftcards.com
Prediction: MALICIOUS
Malicious Probability: 0.98351073
