# Project 3 - Create a framework for government R&D survey

# Import necessary modules/libraries

In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import re
import json
import pandas as pd
# to avoid warnings to make the document output more readable
# suppress all warnings by using the warnings module and using the function ‘filterwarnings()’.
import warnings
warnings.filterwarnings('ignore')

# Write a function for getting the text data from a website url

In [None]:
def getdata(url, timeout=30):
    """Download a web page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the crawl forever.

    Returns:
        The response body as decoded by ``requests`` (``Response.text``).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    r = requests.get(url, timeout=timeout)
    return r.text

# Write a function for getting all links from one page and store them in a list

First, in this function we will get all “a href” marked links. As mentioned, this could potentially lead to the scraping of other websites you do not want information from. We have to place some restraints on the function.

Second, we also want to include hrefs that are not full URLs but relative links starting with a “/”.

Third, we convert the list of links to a dictionary with the function dict.fromkeys() to avoid saving the same link more than once and to speed up the link lookup. The result looks like this:

In [None]:
# Relative links that were already collected, keyed by their absolute URL
dict_href_links = {}


# The second parameter keeps the root of the site fixed while crawling
def get_links(website_link, start_link):
    """Collect the same-site links found on one page.

    Args:
        website_link: URL of the page to download and scan.
        start_link: Root URL of the site being crawled. Absolute hrefs are
            kept only when they start with this prefix.

    Returns:
        A dict mapping every collected link to the string "Not-checked".

    Side effects:
        Records each resolved relative link in the module-level
        ``dict_href_links`` so it is reported only once, and prints the raw
        and the resolved form of every newly found relative link.
    """
    from urllib.parse import urljoin, urlparse

    html_data = getdata(website_link)
    soup = BeautifulSoup(html_data, "html.parser")
    site_host = urlparse(str(start_link)).netloc
    list_links = []
    for link in soup.find_all("a", href=True):
        href = str(link["href"])

        # Append to list if new link contains original link
        if href.startswith(str(start_link)):
            list_links.append(href)

        # Include all href that do not start with website link but with "/"
        if href.startswith("/"):
            # Resolve against the page URL instead of concatenating strings:
            # "/x" is relative to the host root, not to the current page, and
            # "//host/x" points to another host altogether.
            absolute_link = urljoin(website_link, href)
            if urlparse(absolute_link).netloc != site_host:
                # Never follow links that leave the site being crawled
                continue
            # Key by the absolute URL so identical paths on different sites
            # do not hide each other.
            if absolute_link not in dict_href_links:
                print(href)
                dict_href_links[absolute_link] = None
                print("adjusted link =", absolute_link)
                list_links.append(absolute_link)

    # Convert list of links to dictionary and define keys as the links and the values as "Not-checked"
    dict_links = dict.fromkeys(list_links, "Not-checked")
    return dict_links

# Write a function that loops over all the subpages

We use a for loop to go through the subpages and use tqdm to obtain insight into the number of steps that have been completed and keep track of the remaining time to complete the process.

In [None]:
def get_subpage_links(l, start_link=None):
    """Crawl every "Not-checked" link once and merge in the links it yields.

    Args:
        l: Dict mapping a URL to its status ("Not-checked", "Checked" or
            "Failed"). The dict passed in is not modified.
        start_link: Root URL of the site, forwarded to ``get_links``. When
            omitted, the module-level variable ``website`` is used, which is
            how the crawl loop in this notebook calls the function.

    Returns:
        A new dict containing all previous links plus the newly discovered
        ones (added as "Not-checked"). Links that were crawled are marked
        "Checked"; links whose download failed are marked "Failed" so they
        are not retried forever.
    """
    if start_link is None:
        # Backward compatibility: the root link used to be read from the
        # global ``website`` set by the surrounding crawl loop.
        start_link = website
    merged = dict(l)
    # Iterate over a snapshot of the keys: ``merged`` grows inside the loop
    for link in tqdm(list(l)):
        # If not crawled through this page start crawling and get links
        if merged[link] == "Not-checked":
            try:
                dict_links_subpages = get_links(link, start_link)
            except requests.RequestException as error:
                # One unreachable page must not abort the whole pass
                print("Could not crawl", link, "->", error)
                dict_links_subpages = {}
                merged[link] = "Failed"
            else:
                # Change the dictionary value of the link to "Checked"
                merged[link] = "Checked"
        else:
            # Create an empty dictionary in case every link is checked
            dict_links_subpages = {}
        # Add new dictionary to old dictionary; existing statuses win
        merged = {**dict_links_subpages, **merged}
    return merged

# Import the URL list and Create the loop

Before we start the loop we have to initialize some variables. We save the website we want to scrape in a variable and convert this variable into a single key dictionary that has the value “Not-checked”. We create a counter “counter” to count the number of “Not-checked” links and we create a second counter “counter2” to count the number of iterations. To communicate back to ourselves we create some print statements.

In the code below I use a try/except statement to handle exceptions. Exceptions are errors that happen while the program is running (unlike syntax errors, i.e. grammar faults, which Python reports before the program starts). If an exception is not handled, the program stops abruptly, which is bad for both the end user and the developer. Instead of such an emergency halt, you can use a try/except statement to deal with the problem properly. https://pythonbasics.org/try-except/

In [None]:
# Read the sites to crawl, one URL per line.
with open("url_list.txt") as file_in:
    # Strip line breaks and surrounding spaces, and drop blank lines: an
    # empty string is not a valid URL and would fail on every request.
    website_list = [line.strip() for line in file_in if line.strip()]
website_list

In [None]:
# add website WITH slash on end
# website_list=["https://www.kastamonu.bel.tr/v2/","https://www.ilkadim.bel.tr/"]

# Stop crawling a site once this many links are known. Otherwise it takes a
# lot of time and usually 200 links are sufficient.
MAX_LINKS_PER_SITE = 200

links = {}
# NOTE: the loop variable must stay named ``website``; get_subpage_links reads
# it as the root link of the site that is currently being crawled.
for website in website_list:
    # create dictionary of website
    dict_links = {website: "Not-checked"}
    counter, counter2 = None, 0
    while counter != 0:
        counter2 += 1
        try:
            dict_links2 = get_subpage_links(dict_links)
        except Exception as error:
            # Retrying with the same input would fail the same way again, so
            # keep what was collected and move on instead of looping forever.
            print("Crawling", website, "stopped because of an error:", error)
            break
        # Count number of non-values and set counter to 0 if there are no values within the dictionary equal to the string "Not-checked"
        # https://stackoverflow.com/questions/48371856/count-the-number-of-occurrences-of-a-certain-value-in-a-dictionary-in-python
        counter = sum(value == "Not-checked" for value in dict_links2.values())
        # Print some statements
        print("")
        print("THIS IS LOOP ITERATION NUMBER", counter2)
        print("LENGTH OF DICTIONARY WITH LINKS =", len(dict_links2))
        print("NUMBER OF 'Not-checked' LINKS = ", counter)
        print("")
        dict_links = dict_links2
        if len(dict_links2) >= MAX_LINKS_PER_SITE:
            break
    links.update(dict_links)
    # Save list in json file after every site so finished work is kept
    with open("data.json", "w", encoding='utf-8') as a_file:
        json.dump(links, a_file, ensure_ascii=False, indent=4)

As you can see the number of not-checked links decreases until it reaches zero and then the script is finished, which is exactly what we want.

source: https://python.plainenglish.io/scraping-the-subpages-on-a-website-ea2d4e3db113

# Import previously produced json data

In [None]:
import json

# Load the crawled links; the context manager closes the file handle, which
# a bare open() inside json.load() would leave open.
with open('data.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)
data

# get links from imported json data

In [None]:
# Iterating a dict yields its keys, which here are the crawled URLs
links = list(data)
links[:5]

# Import the keyword list and Scrape the list of words from url's

In [None]:
# Read the keywords to search for, one per line.
with open("keyword_list.txt") as file_in2:
    # Remove line breaks, then drop blank lines: an empty keyword would match
    # every piece of text and inflate the counts.
    stripped_lines = (line.rstrip() for line in file_in2)
    word_list = [word for word in stripped_lines if word]
word_list

In [None]:
# Should the word "arge" be added as well? Risky: it can occur inside ordinary text.
# Example keyword list:
# word_list=['Ar-Ge','AR-GE','Araştırma ve Geliştirme','araştırma ve geliştirme','ARAŞTIRMA VE GELİŞTİRME','Arge']

# Running total of matches over all URLs scraped so far. For every URL the
# total reached after that URL is stored, so the number of matches of a single
# page is the difference to the previous entry.
count = 0
d = {}
# Compile each keyword once. re.escape makes the keyword match literally even
# if it contains characters that are special in regular expressions.
keyword_patterns = [(word, re.compile(re.escape(word))) for word in word_list]
for url in links:
    print("\nWebsite currently being scraped:", url)
    try:
        # Download the page once per URL instead of once per keyword
        r = requests.get(url, allow_redirects=False, timeout=30)
    except requests.RequestException as error:
        print("Could not download the page:", error)
    else:
        # The raw bytes are parsed, so BeautifulSoup detects the encoding
        soup = BeautifulSoup(r.content, 'html.parser')
        if soup.body is None:
            print("The page has no <body>, nothing to search")
        else:
            for word, pattern in keyword_patterns:
                results = soup.body.find_all(string=pattern, recursive=True)
                print('Found the word "{0}" {1} times\n'.format(word, len(results)))
                count += len(results)
    d[url] = count
    print('Found the list {0} times\n'.format(count))
    # Save after every URL so an interrupted run keeps its results
    with open("data1.json", "w", encoding='utf-8') as b_file:
        json.dump(d, b_file, ensure_ascii=False, indent=4)

# Print the url's scraped and cumulative number of found words

In [None]:
# Load the cumulative keyword counts; the context manager closes the file
# handle, which a bare open() inside json.load() would leave open.
with open('data1.json', 'r', encoding='utf-8') as json_file:
    data1 = json.load(json_file)
data1

# Show only the links containing keywords with their frequency

In [None]:
# data1 maps each URL to the running total of matches up to that URL, so a
# URL contains keywords exactly when the total grows; the growth is its own
# frequency.
previous_total = 0
rows = []
for x, y in data1.items():
    if y > previous_total:
        rows.append({'Link': x, 'Frequency': y - previous_total})
        previous_total = y
# Build the frame in one step: DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every call.
df = pd.DataFrame(rows, columns=['Link', 'Frequency'])
df.head()

In [None]:
# Export the links that contain keywords, with their frequencies, to Excel
df.to_excel(excel_writer="sil.xlsx")