# Notebook for retrieving final destination phishing domains from urlscan.io 

## Requirements
* Python Libraries
  * Pandas
  * Requests
* urlscan.io API Key

In [None]:
# Libs
from requests import get
from getpass import getpass
from urllib.parse import quote
from json import loads
import pandas as pd
# Lift pandas' display limits so DataFrames render in full in notebook output
# (no cap on columns, rows, or per-cell text width).
for _display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

## Get urlscan.io data

### REGEXs
We use two regex queries that together find the relevant final destination (FD) phishing domains with high accuracy. The two queries are kept separate rather than combined into one because of the limit on how many results the urlscan API returns for a single query.

#### REGEX 1: All FD URLs abusing reCAPTCHA with `-`
**Base REGEX**

```
^(http)(s)?:\/\/([a-z0-9]+\.)?[a-z0-9]{1,}\-[a-z0-9]{1,}\.[a-z]+\/(main|jump)\/$
```

**urlscan.io Query**

```
filename:"/recaptcha/api.js" AND page.url.keyword:/(http)(s)?:\/\/([a-z0-9]+\.)?[a-z0-9]{1,}\-[a-z0-9]{1,}\.[a-z]+\/(main|jump)\//
```


#### REGEX 2: All FD URLs abusing reCAPTCHA without `-`
**Base REGEX**

```
^(http)(s)?:\/\/([a-z0-9]+\.)?([a-z]|[0-9])+\.[a-z]+\/(main|jump)\/$
```

**urlscan.io Query**
```
filename:"/recaptcha/api.js" AND page.url.keyword:/(http)(s)?:\/\/([a-z0-9]+\.)?([a-z]|[0-9])+\.[a-z]+\/(main|jump)\//
```


### Hash queries

#### ExRobotos Phishing URLs Group 1

**urlscan.io Query**
```
hash:35283bce87f120b3df83722176e4c6684f2e64088aa24f357ac7530b54754beb
```

#### ExRobotos Phishing URLs Group 2

**urlscan.io Query**
```
hash:ce1441121feb1441dcd78d618caa8228432271f6671e896c8a753af3dd679623 AND hash:105c03d3360cdb953585482374b2cc953d090741037502b0609629f5bb0135b7 AND hash:f32a760f15530284447282af5c7d0825babf8bc4739e073928f6128830819f7a
```

### Getting Data
Fetching results with Python's requests library can sometimes be slow. As an alternative, you can download the results with curl, save them to a file, and load them from that file.
The first set of code blocks uses Python's requests library, while the second set loads the results from files.

#### Python Requests

**Enter API Key**

In [None]:
# Prompt for the urlscan.io API key interactively; getpass() does not echo the
# typed value, so the key is not left visible in the notebook output.
api_key = getpass()

In [None]:
# Set max results here!
# NOTE(review): the largest `size` urlscan.io accepts may depend on the
# account's plan -- confirm 10000 is allowed for the key in use.
max_results = 10000
base_url = f'https://urlscan.io/api/v1/search/?size={max_results}&q='

# The regex queries are raw strings: sequences such as `\/`, `\-` and `\.` are
# not valid Python escapes, and in a normal string literal they raise a
# SyntaxWarning (slated to become an error). The resulting values are unchanged.
# quote() percent-encodes each query so it can be appended to `base_url`.
regex_one_query = quote(
    r'filename:"/recaptcha/api.js" AND page.url.keyword:/(http)(s)?:\/\/([a-z0-9]+\.)?[a-z0-9]{1,}\-[a-z0-9]{1,}\.[a-z]+\/(main|jump)\//'
)
regex_two_query = quote(
    r'filename:"/recaptcha/api.js" AND page.url.keyword:/(http)(s)?:\/\/([a-z0-9]+\.)?([a-z]|[0-9])+\.[a-z]+\/(main|jump)\//'
)
exrobotos_group_one = quote(
    'hash:35283bce87f120b3df83722176e4c6684f2e64088aa24f357ac7530b54754beb'
)
exrobotos_group_two = quote(
    'hash:ce1441121feb1441dcd78d618caa8228432271f6671e896c8a753af3dd679623 AND hash:105c03d3360cdb953585482374b2cc953d090741037502b0609629f5bb0135b7 AND hash:f32a760f15530284447282af5c7d0825babf8bc4739e073928f6128830819f7a'
)
# Request headers passed to every search call below; carries the API key
# entered in the previous cell.
headers = {'api-key': api_key}

In [None]:
# Fetch the REGEX 1 search results.
# requests has no default timeout, so set one to avoid hanging forever; the
# value is generous because large result sets can be slow to return.
response = get(f'{base_url}{regex_one_query}', headers=headers, timeout=120)
# Fail here on an HTTP error (bad key, rate limit, ...) instead of later with a
# confusing KeyError when the error payload lacks "total"/"results".
response.raise_for_status()
regex_one_results = response.json()

In [None]:
# Fetch the REGEX 2 search results.
# requests has no default timeout, so set one to avoid hanging forever; the
# value is generous because large result sets can be slow to return.
response = get(f'{base_url}{regex_two_query}', headers=headers, timeout=120)
# Fail here on an HTTP error (bad key, rate limit, ...) instead of later with a
# confusing KeyError when the error payload lacks "total"/"results".
response.raise_for_status()
regex_two_results = response.json()

In [None]:
# Fetch the ExRobotos group 1 search results.
# requests has no default timeout, so set one to avoid hanging forever; the
# value is generous because large result sets can be slow to return.
response = get(f'{base_url}{exrobotos_group_one}', headers=headers, timeout=120)
# Fail here on an HTTP error (bad key, rate limit, ...) instead of later with a
# confusing KeyError when the error payload lacks "total"/"results".
response.raise_for_status()
group_one_results = response.json()

In [None]:
# Fetch the ExRobotos group 2 search results.
# requests has no default timeout, so set one to avoid hanging forever; the
# value is generous because large result sets can be slow to return.
response = get(f'{base_url}{exrobotos_group_two}', headers=headers, timeout=120)
# Fail here on an HTTP error (bad key, rate limit, ...) instead of later with a
# confusing KeyError when the error payload lacks "total"/"results".
response.raise_for_status()
group_two_results = response.json()

#### Parse results from json file

In [None]:
# Paths of the JSON files holding saved urlscan.io search responses, one per
# query. They are empty placeholders: fill each one in before running the
# loader cells that follow.
regex_one_file, regex_two_file = '', ''
exrobotos_group_one_file, exrobotos_group_two_file = '', ''

In [None]:
# Load the saved REGEX 1 response. JSON is read explicitly as UTF-8 so the
# result does not depend on the platform's default locale encoding.
with open(regex_one_file, 'r', encoding='utf-8') as file:
    regex_one_results = loads(file.read())

In [None]:
# Load the saved REGEX 2 response. JSON is read explicitly as UTF-8 so the
# result does not depend on the platform's default locale encoding.
with open(regex_two_file, 'r', encoding='utf-8') as file:
    regex_two_results = loads(file.read())

In [None]:
# Load the saved ExRobotos group 1 response. JSON is read explicitly as UTF-8
# so the result does not depend on the platform's default locale encoding.
with open(exrobotos_group_one_file, 'r', encoding='utf-8') as file:
    group_one_results = loads(file.read())

In [None]:
# Load the saved ExRobotos group 2 response. JSON is read explicitly as UTF-8
# so the result does not depend on the platform's default locale encoding.
with open(exrobotos_group_two_file, 'r', encoding='utf-8') as file:
    group_two_results = loads(file.read())

In [None]:
# Report the "total" field of each query's response.
for query_label, query_response in (
    ('REGEX 1', regex_one_results),
    ('REGEX 2', regex_two_results),
    ('ExRobotos Group 1', group_one_results),
    ('ExRobotos Group 2', group_two_results),
):
    print(f'{query_label} results: {query_response["total"]}')

# Combine data: concatenate the result lists of the two reCAPTCHA queries and
# of the two ExRobotos queries.
recaptcha_results = [*regex_one_results['results'], *regex_two_results['results']]
exrobotos_results = [*group_one_results["results"], *group_two_results["results"]]

# Number of result entries actually held in each combined list.
total_recaptcha, total_exrobotos = len(recaptcha_results), len(exrobotos_results)
print(f'Total reCAPTCHA results: {total_recaptcha}')
print(f'Total ExRobotos results: {total_exrobotos}')

## Output Final Destination Phishing Domains

**Enter output files**

In [None]:
# Destination text files for the deduplicated domain lists (one domain per line).
(
    output_recaptcha_file,
    output_exrobotos_file_one,
    output_exrobotos_file_two,
) = (
    'phish_domains_abusing_recaptcha.txt',
    'phish_domains_exrobotos_group_one.txt',
    'phish_domains_exrobotos_group_two.txt',
)

In [None]:
# reCAPTCHA
# Dedup domains: collect the page domain of every result into a set.
recaptcha_domains = {entry['page']['domain'] for entry in recaptcha_results}

print(f'Total phishing domains abusing reCAPTCHA: {len(recaptcha_domains)}')

# Write to file, one domain per line. Domains are sorted because set iteration
# order is arbitrary; sorting keeps the file stable and diffable between runs.
# Plain 'w' suffices (the file is never read back) and UTF-8 is set explicitly.
with open(output_recaptcha_file, 'w', encoding='utf-8') as file:
    file.writelines(f"{domain}\n" for domain in sorted(recaptcha_domains))

In [None]:
# ExRobotos
# Both groups go through the same dedup-and-write steps, each into its own
# output file, so process them in one loop instead of two copies of the code.
for group_results, group_output_file in (
    (group_one_results, output_exrobotos_file_one),
    (group_two_results, output_exrobotos_file_two),
):
    # Dedup domains: collect the page domain of every result into a set.
    exrobotos_domains = {entry['page']['domain'] for entry in group_results["results"]}
    print(f'Total phishing domains using ExRobotos Phishing Kit: {len(exrobotos_domains)}')

    # Write to file, one domain per line. Domains are sorted because set
    # iteration order is arbitrary; sorting keeps the file stable between runs.
    with open(group_output_file, 'w', encoding='utf-8') as file:
        file.writelines(f"{domain}\n" for domain in sorted(exrobotos_domains))

**Optional: output phish domains with datetime**

In [None]:
# Destination CSV files for the (task time, domain) listings. These reuse the
# same variable names as the plain-text output paths, replacing their values.
(
    output_recaptcha_file,
    output_exrobotos_file_one,
    output_exrobotos_file_two,
) = (
    'phish_domains_abusing_recaptcha_with_datetime.csv',
    'phish_domains_exrobotos_group_one_with_datetime.csv',
    'phish_domains_exrobotos_group_two_with_datetime.csv',
)

In [None]:
# Build one [task time, domain] row per reCAPTCHA result (not deduplicated).
tasks_and_page_urls = [
    [entry['task']['time'], entry['page']['domain']]
    for entry in recaptcha_results
]
df = pd.DataFrame(tasks_and_page_urls, columns=['Task Time', 'Domain'])

print(f'Total results: {len(tasks_and_page_urls)}')

# Write to file. Let pandas write the CSV itself: pushing the to_csv() string
# through a text-mode handle can double the carriage returns on Windows.
df.to_csv(output_recaptcha_file, index=False)

In [None]:
# Both ExRobotos groups get the same treatment, each into its own CSV file, so
# process them in one loop instead of two copies of the code.
for group_results, group_output_file in (
    (group_one_results, output_exrobotos_file_one),
    (group_two_results, output_exrobotos_file_two),
):
    # One [task time, domain] row per result (not deduplicated).
    tasks_and_page_urls = [
        [entry['task']['time'], entry['page']['domain']]
        for entry in group_results["results"]
    ]
    df = pd.DataFrame(tasks_and_page_urls, columns=['Task Time', 'Domain'])

    print(f'Total results: {len(tasks_and_page_urls)}')

    # Write to file. Let pandas write the CSV itself: pushing the to_csv()
    # string through a text-mode handle can double the carriage returns on
    # Windows.
    df.to_csv(group_output_file, index=False)