# Notebook for retrieving final destination phishing domains from urlscan.io 

## Requirements
* Python Libraries
  * Pandas
  * Requests
* urlscan.io API Key

In [None]:
# Libs
from requests import get
from getpass import getpass
from urllib.parse import quote
from json import loads
import pandas as pd
# Lift pandas' display limits so DataFrames are shown in full:
# every column, every row, and untruncated cell contents.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

## Get urlscan.io data

### REGEXs
We have 4 REGEX queries that find the relevant final destination (FD) phishing domains with high accuracy. A few false positives (FPs) do come up with the first 2 queries; however, we filter those results out in later cells.

#### REGEX 1: All FD domains with `-`
**Base REGEX**

```
^(http)(s)?:\/\/([a-z0-9]+\.)?[a-z0-9]{1,}\-[a-z0-9]{1,}\.(shop|online|xyz|club|com|top|si|org|net|live)\/main\/$
```

**urlscan.io Query**

```
page.url.keyword:/(http)(s)?:\/\/([a-z0-9]+\.)?[a-z0-9]{1,}\-[a-z0-9]{1,}\.(shop|online|xyz|club|com|top|si|org|net|live)\/main\//
```


#### REGEX 2: All FD domains without `-`
**Base REGEX**

```
^(http)(s)?:\/\/([a-z0-9]+\.)?([a-z]|[0-9])+\.(online|xyz|com|top|club|com.au|org.ge|co.ug|co.mz|one|org|net|live)\/main\/$
```

**urlscan.io Query**
```
page.url.keyword:/(http)(s)?:\/\/([a-z0-9]+\.)?([a-z]|[0-9])+\.(online|xyz|com|top|club|com.au|org.ge|co.ug|co.mz|one|org|net|live)\/main\//
```


#### REGEX 3: All FD URLs with `/jump/` path
**Base REGEX**

```
^(http)(s)?:\/\/([a-z0-9]+\.)?[a-z0-9\-]+\.((shop|online|club|com|top|si|org|net|live)|(xyz))\/(?(6)(jump|main)|main)\/$
```

**urlscan.io Query**
```
page.url.keyword:/(http)(s)?:\/\/([a-z0-9]+\.)?[a-z0-9\-]+\.xyz\/jump\//
```


#### REGEX 4: All FD URLs with `.well-known` path
**Base REGEX**

```
^(http)(s)?:\/\/([a-z0-9]+\.)?([a-z0-9\-])+\.([a-z])+\/(.*)?\.well-known\/((login\.php\?ss=2(&)?.*)|(.*\/authorize_client_id\:.*))+$
```

**urlscan.io Query**
```
page.url.keyword:/(http)(s)?:\/\/([a-z0-9]+\.)?([a-z0-9\-])+\.([a-z])+\/(.*)?\.well-known\/((login\.php\?ss=2(&)?.*)|(.*\/authorize_client_id\:.*))+/
```


### Getting Data
Fetching the data with Python's requests library can sometimes be slow, so if you have already downloaded the results another way (e.g. with curl), you can load them from files instead.
The first set of code cells uses Python's requests library, while the next set loads the results from files.

#### Python Requests

**Enter API Key**

In [None]:
# Read the urlscan.io API key via getpass rather than hard-coding it in the
# notebook. The explicit prompt replaces getpass's generic "Password: " so it
# is clear which secret is being asked for.
api_key = getpass('urlscan.io API key: ')

In [None]:
# Set max results here!
# The value is sent as the `size` parameter of every search request.
max_results = 10000
base_url = f'https://urlscan.io/api/v1/search/?size={max_results}&q='

# The queries are raw strings so the regex backslashes (`\/`, `\.`, `\-`) are
# taken literally; in a normal string literal they are invalid escape sequences
# that Python warns about. `quote` then percent-encodes each query so it can be
# appended to `base_url`.
regex_one_query = quote(
    r'page.url.keyword:/(http)(s)?:\/\/([a-z0-9]+\.)?[a-z0-9]{1,}\-[a-z0-9]{1,}\.(shop|online|xyz|club|com|top|si|org|net|live)\/main\//'
)
regex_two_query = quote(
    r'page.url.keyword:/(http)(s)?:\/\/([a-z0-9]+\.)?([a-z]|[0-9])+\.(online|xyz|com|top|club|com.au|org.ge|co.ug|co.mz|one|org|net|live)\/main\//'
)
regex_three_query = quote(
    r'page.url.keyword:/(http)(s)?:\/\/([a-z0-9]+\.)?[a-z0-9\-]+\.xyz\/jump\//'
)
regex_four_query = quote(
    r'page.url.keyword:/(http)(s)?:\/\/([a-z0-9]+\.)?([a-z0-9\-])+\.([a-z])+\/(.*)?\.well-known\/((login\.php\?ss=2(&)?.*)|(.*\/authorize_client_id\:.*))+/'
)
# HTTP headers for the urlscan.io requests; the API key travels in `api-key`.
headers = {}
headers['api-key'] = api_key

In [None]:
# Fetch the REGEX 1 matches. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() reports an HTTP error here
# instead of letting an error payload be treated as search results later on.
regex_one_response = get(f'{base_url}{regex_one_query}', headers=headers, timeout=120)
regex_one_response.raise_for_status()
regex_one_results = regex_one_response.json()

In [None]:
# Fetch the REGEX 2 matches. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() reports an HTTP error here
# instead of letting an error payload be treated as search results later on.
regex_two_response = get(f'{base_url}{regex_two_query}', headers=headers, timeout=120)
regex_two_response.raise_for_status()
regex_two_results = regex_two_response.json()

In [None]:
# Fetch the REGEX 3 matches. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() reports an HTTP error here
# instead of letting an error payload be treated as search results later on.
regex_three_response = get(f'{base_url}{regex_three_query}', headers=headers, timeout=120)
regex_three_response.raise_for_status()
regex_three_results = regex_three_response.json()

In [None]:
# Fetch the REGEX 4 matches. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() reports an HTTP error here
# instead of letting an error payload be treated as search results later on.
regex_four_response = get(f'{base_url}{regex_four_query}', headers=headers, timeout=120)
regex_four_response.raise_for_status()
regex_four_results = regex_four_response.json()

#### Parse results from json file

In [None]:
# Paths of previously saved urlscan.io search responses (JSON), one per REGEX
# query. Fill these in before running the file-loading cells.
(
    regex_one_file,
    regex_two_file,
    regex_three_file,
    regex_four_file,
) = ('', '', '', '')

In [None]:
# Load the saved REGEX 1 search response. The file is read as UTF-8 explicitly
# so parsing does not depend on the platform's default text encoding.
with open(regex_one_file, 'r', encoding='utf-8') as file:
    regex_one_results = loads(file.read())

In [None]:
# Load the saved REGEX 2 search response. The file is read as UTF-8 explicitly
# so parsing does not depend on the platform's default text encoding.
with open(regex_two_file, 'r', encoding='utf-8') as file:
    regex_two_results = loads(file.read())

In [None]:
# Load the saved REGEX 3 search response. The file is read as UTF-8 explicitly
# so parsing does not depend on the platform's default text encoding.
with open(regex_three_file, 'r', encoding='utf-8') as file:
    regex_three_results = loads(file.read())

In [None]:
# Load the saved REGEX 4 search response. The file is read as UTF-8 explicitly
# so parsing does not depend on the platform's default text encoding.
with open(regex_four_file, 'r', encoding='utf-8') as file:
    regex_four_results = loads(file.read())

In [None]:
# Show the `total` field reported in each query's response.
print(f'REGEX one results: {regex_one_results["total"]}')
print(f'REGEX two results: {regex_two_results["total"]}')
print(f'REGEX three results: {regex_three_results["total"]}')
print(f'REGEX four results: {regex_four_results["total"]}')

# Combine data: one flat list holding every query's results, in query order.
regex_results = [
    *regex_one_results['results'],
    *regex_two_results['results'],
    *regex_three_results['results'],
    *regex_four_results['results'],
]
# Remember the pre-filter size so the number of removed results can be reported.
total_results_prefilter = len(regex_results)
print(f'Total results: {total_results_prefilter}')

## Filter False Positives
* Where path is `/Main/` and not `/main/`
* Where task url ends with `/main/`. While this could remove some TPs, the FDs will likely pop up in other results.
* Where task url's path is root
* Remove google redirects
* Where task url does not contain at least one `?`, `@`, `=`, `#` or `&`
* Where domains are on the alexa 1 million

In [None]:
def _is_false_positive(result):
    """Return True when a urlscan.io result matches a false-positive pattern.

    A result is treated as a false positive when any of these holds:
      * the page URL ends with '/Main/' (capitalised path),
      * the submitted (task) URL already ends with '/main/',
      * the task URL goes through goo.gl or google.com,
      * the task URL carries none of the characters ? @ = # &.
    """
    page_url = result['page']['url']
    task_url = result['task']['url']
    if page_url.endswith('/Main/') or task_url.endswith('/main/'):
        return True
    if '/goo.gl/' in task_url or 'google.com/' in task_url:
        return True
    # '&' is only looked for after the first 8 characters of the task URL;
    # the other characters are looked for anywhere in it.
    if '&' in task_url[8:]:
        return False
    return not any(char in task_url for char in '?@=#')


# Single pass over the results: keep the ones that are not false positives and
# record the page domains of the ones that are dropped. This replaces a nested
# scan-and-delete loop that was quadratic in the number of results.
filtered_regex_results = []
filtered_domains = set()
for result in regex_results:
    if _is_false_positive(result):
        filtered_domains.add(result['page']['domain'])
    else:
        filtered_regex_results.append(result)

In [None]:
# Alexa 1 Million (domain line by line)
alexa_file = ''
with open(alexa_file, 'r') as alexa:
    alexa = alexa.read().splitlines()

In [None]:
# Move every remaining result whose registered domain (last two labels of the
# page domain) appears in the Alexa list out of `filtered_regex_results` and
# into `filtered_alexa_domains`.
# A set is built once so each membership test is O(1) instead of a scan of the
# whole Alexa list, and the results are split in a single pass.
alexa_domains = set(alexa)
filtered_alexa_domains = []
non_alexa_results = []
for result in filtered_regex_results:
    registered_domain = '.'.join(result['page']['domain'].split('.')[-2:])
    if registered_domain in alexa_domains:
        filtered_alexa_domains.append(result)
    else:
        non_alexa_results.append(result)
filtered_regex_results = non_alexa_results

In [None]:
# Domains filtered - These might be true positives but likely compromised sites.
# If you would like to add these IOCs run this block.

# Set threshold for alexa to be considered compromised
threshold = 50000

# Map each Alexa entry to the index of its first occurrence, built once, so a
# domain's rank is a dictionary lookup rather than a scan of the whole list.
# Using the first occurrence also means a duplicated entry in the Alexa file
# cannot add the same result back more than once.
alexa_rank = {}
for rank, alexa_domain in enumerate(alexa):
    alexa_rank.setdefault(alexa_domain, rank)

for result in filtered_alexa_domains:
    page_domain = result['page']['domain']
    rank = alexa_rank.get('.'.join(page_domain.split('.')[-2:]))
    if rank is None:
        # Not in the Alexa list: nothing to report and nothing to add back.
        continue
    if rank >= threshold:
        # Index at or beyond the threshold: put the result back among the IOCs.
        print(f"Threshold met for {page_domain}: {rank}")
        filtered_regex_results.append(result)
    else:
        print(f"Threshold not met for {page_domain}: {rank}")

In [None]:
# Report how many results were dropped relative to the pre-filter total, and
# how many are currently left in `filtered_regex_results`.
print(f'Results removed: {total_results_prefilter - len(filtered_regex_results)}')
print(f'New total: {len(filtered_regex_results)}')

## Output Final Destination Phishing Domains

**Enter output file**

In [None]:
# Path of the text file that receives the deduplicated phishing domains.
output_file = 'phish_domains.txt'

In [None]:
# Dedup domains
domains = {result['page']['domain'] for result in filtered_regex_results}

print(f'Total phishing domains: {len(domains)}')

# Write to file, one domain per line. The domains are sorted because set order
# is arbitrary, which would otherwise reshuffle the file on every run. The file
# is opened write-only (nothing is read back) with a fixed encoding.
with open(output_file, 'w', encoding='utf-8') as file:
    file.writelines(f"{domain}\n" for domain in sorted(domains))

**Optional: output phish domains with datetime**

In [None]:
# Path of the CSV file that receives one row per result (task time and domain).
# Note: this rebinds `output_file`, replacing any path assigned to it earlier.
output_file = 'phish_domains_with_datetime.csv'

In [None]:
# One [task time, page domain] row per remaining result (not deduplicated).
tasks_and_page_urls = [
    [result['task']['time'], result['page']['domain']]
    for result in filtered_regex_results
]
df = pd.DataFrame(tasks_and_page_urls, columns=['Task Time', 'Domain'])

print(f'Total results: {len(tasks_and_page_urls)}')

# Write to file. pandas writes the CSV itself rather than returning a string
# that is then pushed through a text-mode handle; that extra newline
# translation can double the line endings on Windows.
df.to_csv(output_file, index=False)