In [23]:
#!/usr/bin/env python
''' Scrapes the password policies from Alexa's top websites '''

import re
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

__author__ = "Aditya Godambe"
__email__ = "agodambe@cs.stonybrook.edu"

In [24]:
# List of URLs from Alexa top websites for USA
alexa_urls = []
with open("sites.csv", 'r') as file:
    for line in file.readlines():
        alexa_urls.append(line.rstrip())


# List of urls not to be scraped
blacklist = []
with open("blacklist.csv", 'r') as file:
    for line in file.readlines():
        blacklist.append(line.rstrip())


# Bag of words for different policies
policies = []
with open("policies.csv",'r') as file:
    for line in file.readlines():
        policies.append(line.rstrip())



# Bag of words for Sign Up action
sign_up_list = []
with open("signup.csv",'r') as file:
    for line in file.readlines():
        sign_up_list.append(line.rstrip())


# Bag of words for Sign in action
sign_in_list = []
with open("signin.csv",'r') as file:
    for line in file.readlines():
        sign_in_list.append(line.rstrip())
        
data = {}

In [25]:
def buildUrl(protocol, target, old_target):
    '''
        This function builds a complete url based on available link
        Args:
            protocol: The protocol used by the website, set default to https://wwww.
            target: The portion of the new url that needs to be completed
            old_target: The url of the current page, typically, the sign in page
        Returns:
            complete url to be requested (str)
    '''
    data = requests.request("GET", old_target)
    url = data.url
    netlocal = urlparse(url).netloc
    
    #print(netlocal)
    #print(protocol)
    
    return protocol + netlocal + target

In [26]:
def extractPolicy(url, soup, policy):
    '''
        This function extracts the policy present on the page
        Args:
            soup: BeautifulSoup object for the current page
            policy: The policy keywords found on the page
        Returns:
            Currently, prints the original URL and the password policy
    '''
    
    final_policy = ""
    
    policy_str = soup.find(string = re.compile(policy))
    index = policy_str.find(policy)
    
    while index < len(policy_str):
        if policy_str[index] == "." or policy_str[index] == ")":
            break
        final_policy = final_policy + policy_str[index]
        index = index+1
    
    #print(url)
    #print(final_policy)
    data[protocol+url] = final_policy
    

In [27]:
def userAccountPresent(soup):
    '''
        This function returns true if the site has an option to create a user account
        Args: 
            soup: BeautifulSoup object for the current page
        Returns:
            True, if there is an option to create account
            False, otherwise
    '''
    #sign_in_present = False
    #sign_up_present = False
    
    # Check if Sign in option is present
    for sign_in in sign_in_list:
        if len(soup.find_all("a")) != 0:
            for link in soup.find_all("a"):
                if len(link) != 0:
                    if sign_in in link: #or sign_in in link.get("href"):
                        return True
    
    for sign_up in sign_up_list:
        if len(soup.find_all("a")) != 0:
            for link in soup.find_all("a"):
                if len(link) != 0:
                    if sign_up in link:# or sign_up in link.get("href"):
                        return True
    
    return False
        

In [28]:
def checkPolicies(soup):
    '''
        This funtion checks if the page contains the policies
        Args:
            soup: BeautifulSoup object for the current page
        Returns:
            True, if policy found
            False, otherwise
    '''
    policy_found = False
    for policy in policies:
        if len(soup.find_all(string = re.compile(policy))) != 0:
            # Policy exists on this page
            policy_found = True
            extractPolicy(url, soup, policy)
    
    return policy_found
        
    

In [None]:
# TODO: Check sign up in homepage also eg: twitch.tv
# TODO: Fix stackoverflow.com issue: policies not being scraped
# TODO: connection error issue with some websites: too many requests


# Traverse through the list of urls, skip blacklisted ones

protocol = "https://www."
site_number = 0

for url in alexa_urls:
    url = url.lower()
    site_number = site_number+1
    #print("\n" + str(site_number) + ". " + url)
    
    blacklisted = False
    policy_found = False
    sign_up_found = False
    sign_in_found = False
    
    
    for blacklist_keyword in blacklist:
        if blacklist_keyword in url:
            blacklisted = True
            break
    if blacklisted:
        #print("Scraping not allowed")
        data[protocol+url] = "Scraping not allowed"
        continue
        
        
    # For urls that are not blacklisted, perform scraping
    r = requests.get(protocol + url)
    c = r.content
    
    soup = BeautifulSoup(c, "html.parser")
    
    # Check if this site requires a user account
    if not userAccountPresent(soup):
        continue
    
    # Check if policy exists on the home page
    policy_found = checkPolicies(soup)
    if policy_found:
        continue
        
    
    # Check for Sign In keywords
    target = ""
    for sign_in in sign_in_list:
        for link in soup.find_all("a"):
            if sign_in in link:# or sign_in in link.get("href"):
                target = link.get("href")
                sign_in_found = True
                break
        if sign_in_found:
            break
    
    
    if target[0] == "/":
        # Get complete url from the relative url
        target = buildUrl(protocol, target, protocol+url)
    
    # Go to the target url and check for policy
    try:
        r = requests.get(target, timeout=5)
    except requests.exceptions.ConnectionError:
        r.status_code = "Connection refused"
        
    c = r.content
    
    soup = BeautifulSoup(c, "html.parser")
    
    # Check if policy exists on the Sign in page
    policy_found = checkPolicies(soup)  
    if policy_found:
        continue
        
    # Policy not found in the Sign in page; Check for sign up keywords in the sign in page
    old_target = target
    for sign_up in sign_up_list:
        for link in soup.find_all("a"):
            if sign_up in link or sign_up in link.get("href"):
                target = link.get("href")
                sign_up_found = True
                break
        if sign_up_found:
            break
    
    new_url = ""
    if target[0] == "/":
        # Build a url for the complete sign up page
        new_url = buildUrl(protocol, target, old_target)
    else:
        new_url = target
    
    # Go to the new url and check for policy
    try:
        r = requests.get(new_url)
    except requests.exceptions.ConnectionError:
        r.status_code = "Connection refused"
    
    
    c = r.content
    
    soup = BeautifulSoup(c, "html.parser")
    
    # Check if policy exists on the Sign up page
    policy_found = checkPolicies(soup)    
    if not policy_found:
        #print("Policy not found")
        data[protocol+url] = "Policy not found"
        continue

#print(data)