# In [None]:
import os
import time
from datetime import datetime, timedelta
from googleapiclient.discovery import build
import virustotal_python
from base64 import urlsafe_b64encode
import requests
from duckduckgo_search import DDGS


# Shared accumulator: every scanner in this file appends its findings here
# (API response payloads), and the final report is written from it.
RESULTS = []

# Exact-phrase search terms; each one is run against every search engine.
queries = [
    '"BC Registries and Online Services"',
    '"BC Corporate Online"',
]

def duckduckgo_search(query):
    """Search DuckDuckGo for *query* and scan every hit for threats.

    Each result URL is handed to ``google_safe_test`` and
    ``virus_total_scan``, which record their own findings in ``RESULTS``.

    The search is best-effort: any failure (e.g. the search library being
    rate limited) is printed and swallowed so the remaining scans can run.

    Args:
        query: Search phrase to look up.
    """
    with DDGS() as ddgs:
        try:
            # region 'wt-wt', safe search off, time limit 'y', at most 10 hits.
            hits = ddgs.text(query, region='wt-wt', safesearch='off', timelimit='y', max_results=10)
            # Iterate inside the try block: results may be produced lazily, so
            # search errors can surface while looping.
            for hit in hits:
                google_safe_test(hit['href'])
                virus_total_scan(hit['href'])
        except Exception as err:  # best-effort boundary; report instead of hiding the cause
            print(f'duckduckgo search failed (possibly rate limited): {err}')


# CRITERIA:
# - can exclude some bcgov sites
# - freshness: results from the last week
# - safe search can be turned active/off

# - can exclude north-american sites
# https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/filter-answers
# https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/search-responses
def bing_search(query):
    """Run a Bing Custom Search for *query* and scan every hit for threats.

    The API key and custom configuration id are read from the ``BING_API_KEY``
    and ``BING_ID`` environment variables. Known sites are excluded by
    appending ``-site:`` operators to the query, and results are limited to
    the last week.

    Every result URL is passed to ``google_safe_test``. A result carrying a
    ``malware`` key is recorded in ``RESULTS`` directly; all others are sent
    to ``virus_total_scan``.

    A failed request (network error, timeout, HTTP error status, non-JSON
    body) is printed and the function returns without scanning anything.

    Args:
        query: Search phrase to look up.
    """
    bing_api_key = os.getenv('BING_API_KEY', '')
    bing_id = os.getenv('BING_ID', '')
    links_to_exclude = ['-site:www2.gov.bc.ca',
                        '-site:www.names.bcregistry.gov.bc.ca',
                        '-site:www.bcregistry.gov.bc.ca',
                        '-site:www.account.bcregistry.gov.bc.ca',
                        '-site:www.business.bcregistry.gov.bc.ca',
                        '-site:www.bcregistryallservices.gov.bc.ca',
                        '-site:www.bcregistry.daxiom.ca',
                        '-site:digital.gov.bc.ca',
                        '-site:justice.gov.bc.ca',
                        '-site:dir.gov.bc.ca',
                        '-site:www.search.business.bcregistry.gov.bc.ca',
                        '-site:www.corporateonline.gov.bc.ca',
                        '-site:www.bconline.gov.bc.ca',
                        '-site:www.bcregistry.ca',
                        '-site:www.analytics.bcregistry.gov.bc.ca',
                        '-site:www.directorsearch.bcregistry.gov.bc.ca',
                        '-site:orgbook.gov.bc.ca',
                        '-site:bcchamber.org',
                        '-site:news.gov.bc.ca',
                        '-site:test.names.bcregistry.gov.bc.ca',
                        '-site:test.bcregistry.gov.bc.ca',
                        '-site:test.account.bcregistry.gov.bc.ca',
                        '-site:test.bcregistry.ca',
                        '-site:test.directorsearch.bcregistry.gov.bc.ca',
                        '-site:dev.names.bcregistry.gov.bc.ca',
                        '-site:dev.bcregistry.gov.bc.ca',
                        '-site:dev.account.bcregistry.gov.bc.ca',
                        '-site:dev.bcregistry.ca',
                        '-site:dev.bcros.ca',
                        '-site:dev.directorsearch.bcregistry.gov.bc.ca',
                        ]
    params = {
        'textDecorations': False,
        'textFormat': 'raw',
        'responseFilter': 'Webpages',
        'count': 30,
        'freshness': 'week',
        'customconfig': bing_id,
        'q': query + ' ' + ' '.join(links_to_exclude),
    }
    # 'Retry-After' is a response header and has no effect on a request, so
    # it is no longer sent.
    headers = {
        'Ocp-Apim-Subscription-Key': bing_api_key,
        'Accept': 'application/json',
    }

    url = 'https://api.bing.microsoft.com/v7.0/custom/search'
    try:
        # Without a timeout a stalled connection would block the scan forever.
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        json_data = response.json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a body that is not valid JSON.
        print(f'bing search failed for {query}: {err}')
        return

    if 'webPages' not in json_data:
        return
    for entry in json_data['webPages']['value']:
        google_safe_test(entry['url'])
        if 'malware' in entry:
            print('malware found: ')
            print(entry)
            RESULTS.append(entry)
        else:
            virus_total_scan(entry['url'])


# CRITERIA:
# - in the last 3 weeks (NOTE: the query below uses dateRestrict='w1', i.e. one week)
# - safe search can be turned active/off
# - can exclude some bcgov sites
# - can exclude north-american sites
# https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#response
# https://developers.google.com/custom-search/v1/reference/rest/v1/Search
def google_search(query):
    """Run a Google Custom Search for *query* and scan every hit for threats.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable.
    Results must contain the exact phrase "British Columbia", are restricted
    to the last week (``dateRestrict='w1'``), and exclude Canada and the
    United States via the ``cr`` country restriction.

    Every returned link is passed to ``google_safe_test`` and
    ``virus_total_scan``, which record their findings in ``RESULTS``.

    Args:
        query: Search phrase to look up.
    """
    google_api_key = os.getenv('GOOGLE_API_KEY', '')
    service = build('customsearch', 'v1', developerKey=google_api_key)

    request = service.cse().list(
        q=query,
        exactTerms='British Columbia',
        cx='27230fb86149043c0',
        cr='(-countryCA).(-countryUS)',
        # NOTE(review): the API reference describes siteSearch as a single
        # site; confirm that a space-separated list is honoured.
        siteSearch='www2.gov.bc.ca  www.names.bcregistry.gov.bc.ca  www.bcregistry.gov.bc.ca www.account.bcregistry.gov.bc.ca',
        siteSearchFilter='e',
        dateRestrict='w1',
        # safe='active'
    )
    res = request.execute()

    # Use the actual result list rather than the 'totalResults' count, so a
    # response without an 'items' key cannot raise KeyError.
    items = res.get('items', [])
    if not items:
        return

    print('Found Potential Phishing sites')
    for item in items:
        google_safe_test(item['link'])
        virus_total_scan(item['link'])


# https://services.google.com/fh/files/misc/web_risk_data_sheet.pdf
# https://developers.google.com/safe-browsing/v4/lookup-api
def google_safe_test(test_url):
    """Look *test_url* up in the Google Safe Browsing v4 Lookup API.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable. The
    URL is checked for the SOCIAL_ENGINEERING and MALWARE threat types on any
    platform. When the API reports a match, the response is appended to
    ``RESULTS``.

    A failed request (network error, timeout, non-JSON body) or an API error
    response is printed and is not recorded as a finding.

    Args:
        test_url: URL to check.
    """
    google_api_key = os.getenv('GOOGLE_API_KEY', '')
    url = 'https://safebrowsing.googleapis.com/v4/threatMatches:find'
    payload = {'client': {'clientId': 'mycompany', 'clientVersion': '0.1'},
               'threatInfo': {'threatTypes': ['SOCIAL_ENGINEERING', 'MALWARE'],
                              'platformTypes': ['ANY_PLATFORM'],
                              'threatEntryTypes': ['URL'],
                              'threatEntries': [{'url': test_url}]}}
    params = {'key': google_api_key}
    try:
        # Without a timeout a stalled connection would block the scan forever.
        r = requests.post(url, params=params, json=payload, timeout=30)
        json_data = r.json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a body that is not valid JSON.
        print(f'Google Safe Browsing lookup failed for {test_url}: {err}')
        return

    if not isinstance(json_data, dict):
        return
    # Only a non-empty "matches" list is a finding. A clean URL yields an
    # empty object, and an "error" object (e.g. bad API key) must not be
    # recorded as a threat.
    if json_data.get('matches'):
        print(f'URL flagged by Google Safe Browsing: {test_url}')
        RESULTS.append(json_data)
    elif 'error' in json_data:
        print(f"Google Safe Browsing returned an error for {test_url}: {json_data['error']}")


def virus_total_scan(test_url):
    """Submit *test_url* to VirusTotal and record it if any engine flags it.

    The API key is read from the ``VIRUS_TOTAL_API_KEY`` environment variable.
    The URL is posted for analysis, then its report is fetched. If the
    report's ``last_analysis_stats`` counts at least one ``malicious`` or
    ``suspicious`` verdict, the report is appended to ``RESULTS``.

    VirusTotal API errors are printed and swallowed. The function always
    sleeps 20 seconds before returning, whether or not the scan succeeded.

    Args:
        test_url: URL to scan.
    """
    virus_total_api_key = os.getenv('VIRUS_TOTAL_API_KEY', '')
    with virustotal_python.Virustotal(virus_total_api_key) as vtotal:
        try:
            # Submit the URL for analysis; the response itself is not needed.
            vtotal.request("urls", data={"url": test_url}, method="POST")
            # The report id is the unpadded URL-safe base64 of the URL.
            url_id = urlsafe_b64encode(test_url.encode()).decode().strip("=")
            report = vtotal.request(f"urls/{url_id}")
            stats = report.data['attributes']['last_analysis_stats']
            mal_count = int(stats['malicious'])
            susp_count = int(stats['suspicious'])
            # Either kind of verdict is enough; requiring both would skip
            # URLs that engines flag only as malicious.
            if mal_count > 0 or susp_count > 0:
                print('Potentially malicious link')
                RESULTS.append(report)
        except virustotal_python.VirustotalError as err:
            print(f"Failed to send URL: {test_url} for analysis and get the report: {err}")
        finally:
            # Presumably throttles calls to stay within the API rate limit.
            time.sleep(20)


def scan_search_results():
    """Run every query through all search engines and write the report.

    Each entry of ``queries`` is searched with Google, Bing and DuckDuckGo;
    the scanners accumulate findings in ``RESULTS``. The findings are then
    written, one per line, to ``data/phishing-scan-<date>.csv`` under the
    current working directory, where ``<date>`` is yesterday's date. When
    nothing was found, a single "No malicious URLs returned" line is written.
    """
    datestr = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    out_dir = os.path.join(os.getcwd(), 'data')
    # Create the output directory up front so a missing folder cannot fail
    # the write after all the scanning has finished.
    os.makedirs(out_dir, exist_ok=True)
    filename = os.path.join(out_dir, 'phishing-scan-' + datestr + '.csv')

    for q in queries:
        google_search(q)
        bing_search(q)
        duckduckgo_search(q)

    with open(filename, 'w', encoding='utf-8') as f:
        if not RESULTS:
            f.write('No malicious URLs returned ' + datestr)
        for entry in RESULTS:
            # Entries are dicts or API response objects, not strings, so
            # they must be converted before concatenation.
            f.write(str(entry) + '\n')
        f.write('\n\n')


# Run the full scan only when this file is executed as a script, not on import.
if __name__ == '__main__':
    scan_search_results()