<a href="https://colab.research.google.com/github/bristiHalder/Phishing-Link-Scanner/blob/main/Phishing_Link_Scanner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install tldextract

Collecting tldextract
  Downloading tldextract-5.1.2-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.6/97.6 kB[0m [31m959.3 kB/s[0m eta [36m0:00:00[0m
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.1.2


In [4]:
!pip install Levenshtein

Collecting Levenshtein
  Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein)
  Downloading rapidfuzz-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.25.1 rapidfuzz-3.9.1


In [22]:
import tldextract #used to extract the subdomain, domain, and suffix (TLD) from a given URL.
import Levenshtein as lv  #Levenshtein distance functions (used for measuring string similarity)

In [23]:
legitimate_domains = ['google.com', 'instagram.com', 'spotify.com']


In [24]:
test_urls = [
    'https://www.google.security-update.com',
    'http://faceb00k.com/login',
    'https://google.com',
    'http://spotify.co'
]

In [26]:
#This function takes a URL and uses tldextract to extract the subdomain, domain, and suffix. It then returns these components.
def extract_domain_parts(url):
  extracted = tldextract.extract(url)
  return extracted.subdomain, extracted.domain, extracted.suffix

In [27]:
#This function checks if the given 'domain' is similar to any of the 'legitimate_domains' using the Levenshtein ratio.
def is_misspelled_domain(domain, legitimate_domains, threshold=0.9):
  for legit_domain in legitimate_domains:
    similarity = lv.ratio(domain, legit_domain)  #Calculates the similarity ratio between the 'domain' and each 'legit_domain'.
    if similarity >= threshold:
      return False #it's a legitimate domain
  return True #No close match found, possibly misspelled

In [28]:
#This function determines if a URL is a potential phishing site.
def is_phishing_url(url, legitimate_domains):
  subdomain, domain, suffix = extract_domain_parts(url)

  #Check if its a known legitimate domain
  #Checks if the domain and suffix combination is in the list of legitimate_domains.
  if f"{domain},{suffix}" in legitimate_domains:
    return False

  #Check for misspelled domain names
  if is_misspelled_domain(domain, legitimate_domains):
    print(f"Potential phishing detected:{url}")
    return True

  return False

In [29]:
#main script
if __name__ == '__main__':
  for url in test_urls:
    is_phishing_url(url, legitimate_domains)

Potential phishing detected:https://www.google.security-update.com
Potential phishing detected:http://faceb00k.com/login
Potential phishing detected:https://google.com
Potential phishing detected:http://spotify.co
