# Homework 3 - Master's Degrees from all over!

### 1. Data collection

In [46]:
# import necessary libraries

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm
import time
import concurrent.futures
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import regex as re
import string
from nltk.stem import PorterStemmer
from collections import defaultdict
import json
from heapq import nlargest
from collections import Counter

#### 1.1. Get the list of master's degree courses


In [2]:
# website link
base_url = 'https://www.findamasters.com/masters-degrees/msc-degrees/'

In [10]:
# extract URLs 

def extract_masters(this_url, retries=3, backoff_factor=1.5):

    wait_time = 1 

    for attempt in range(retries):
        try:
            response = requests.get(this_url)

            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                links = soup.find_all('a', {'class': 'courseLink'})
                return [(urljoin(this_url, link['href']), link.text.strip()) for link in links]
            
            elif response.status_code == 429:
                print(f"rate limit hit")
                time.sleep(wait_time)
                wait_time *= backoff_factor

            else:
                print(f"failed to fetch {this_url}, status code {response.status_code}")
                return []
            
        except requests.RequestException as e:
            print(f"error while fetching {this_url}: {e}")
            return []
        wait_time = 30  # reset wait time to 30 seconds for each retry

    print(f"failed to fetch {this_url}")
    return []

all_links = []

# collecting URLs from 400 pages and save them to URL.txt
for page in tqdm(range(1, 401), desc="collecting urls"):
    page_url = f"{base_url}?PG={page}"
    links = extract_masters(page_url)
    all_links.extend(links)

with open('URL.txt', 'w') as file:
    for url, _ in all_links:
        file.write(url + '\n')

print("urls have been collected and saved to URL.txt")

#### 1.2. Crawl master's degree pages

The code below is not optimal, however, it did download all the necessary HTML files into according folders.

In [6]:
base_website_url = 'https://www.findamasters.com'
base_url = 'https://www.findamasters.com/masters-degrees/msc-degrees/'


def save_html(url, content, folder, file_name):
    try:
        os.makedirs(folder, exist_ok=True)
        file_path = os.path.join(folder, file_name + ".html")
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)
            
    except Exception as e:
        # in case of error
        print(f"error saving file {file_path}: {e}")

def download_html(index, url, base_url, main_folder, urls_per_page, max_retries=5):

    page_number = index // urls_per_page + 1
    page_folder = os.path.join(main_folder, f"Page_{page_number}")

    if not url.startswith(('http://', 'https://')):
        url = urljoin(base_url, url)

    retries = 0
    wait_time = 30  # starting with a 30 seconds

    while retries < max_retries:
        try:
            response = requests.get(url)

            if response.status_code == 200:

                file_name = f"url{index + 1}"  
                save_html(url, response.text, page_folder, file_name)
                return f"downloaded {url}"
            
            # if website is blocking
            elif response.status_code == 429:
                wait_time += 10
                # print(f"rate limit hit")
                time.sleep(wait_time)
                retries += 1

            else:
                return f"failed to download {url}, status code {response.status_code}"
            
        except requests.RequestException as e:
            return f"error while downloading {url}: {e}"

    return f"failed to download {url} after {max_retries} retries"

# read URLs from URL.txt
with open('URL.txt', 'r') as file:
    urls = file.read().splitlines()

main_folder = 'HTML_pages'
urls_per_page = 15  # number of URLs per page

# parallel downloading of HTML pages
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(download_html, index, url, base_url, main_folder, urls_per_page) for index, url in enumerate(urls)]
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(urls)):
        print(future.result())

print("HTML pages have been downloaded and saved")

#### 1.3 Parse downloaded pages


In [11]:
def extract_info(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # extract course name
    cname_tag = soup.find("h1", class_="text-white course-header__course-title")
    courseName = cname_tag["data-permutive-title"] if cname_tag else " "

    # extract university name
    uname = soup.find('a', class_='course-header__institution')
    universityName = uname.get_text(strip=True) if uname else " "

    # extract faculty name
    fname = soup.find('a', class_='course-header__department')
    facultyName = fname.get_text(strip=True) if fname else " "

    # extract part/fulltime
    ftime = soup.find('span', class_='key-info__duration')
    isItFullTime = " "
    if ftime:
        ftime_text = ftime.get_text(strip=True)
        if "full-time" in ftime_text.lower() or "full time" in ftime_text.lower() or "fulltime" in ftime_text.lower():
            isItFullTime = "full-time"
        elif "part-time" in ftime_text.lower() or "part time" in ftime_text.lower() or "parttime" in ftime_text.lower():
            isItFullTime = "part-time"

    # extract description
    desc = soup.find('div', id='Snippet')
    description = ' '.join(p.get_text(strip=True) for p in desc.find_all('p')) if desc else " "

    # extract startdate
    sdate = soup.find('span', class_='key-info__start-date')
    startDate = sdate.get_text(strip=True) if sdate else " "

    # extract fees
    fees_div = soup.find('div', class_='course-sections course-sections__fees tight col-xs-24')
    fees = fees_div.get_text(strip=True) if fees_div else " "

    # extract modality
    modality_info = soup.find('span', class_='key-info__qualification')
    modality = modality_info.find('a').get_text(strip=True) if modality_info and modality_info.find('a') else ' '

    # extract duration
    duration_tag = soup.find('span', class_='key-info__duration')
    duration = ' '.join(duration_tag.get_text(strip=True).split()[:2]) if duration_tag else " "

    # extract city
    city_info = soup.find('a', class_='course-data__city')
    city = city_info.get_text(strip=True) if city_info else ' '

    # extract country
    country_info = soup.find('a', class_='course-data__country')
    country = country_info.get_text(strip=True) if country_info else ' '

    # extract administration
    administration_info = soup.find('a', class_='course-data__on-campus')
    administration = administration_info.get_text(strip=True) if administration_info else ' '

    # extract URL
    link = soup.find('link', rel='canonical')
    url = link['href'] if link and 'href' in link.attrs else ' '

    return {
        'courseName': courseName,
        'universityName': universityName,
        'facultyName': facultyName,
        'isItFullTime': isItFullTime,
        'description': description,
        'startDate': startDate,
        'fees': fees,
        'modality': modality,
        'duration': duration,
        'city': city,
        'country': country,
        'administration': administration,
        'url': url
    }

In [12]:
main_folder = 'HTML_pages'  #html files are stored
tsv_folder = 'tsv' #tsv files directory

#making sure files exist
os.makedirs(tsv_folder, exist_ok=True)

# initializing counter for the file naming
i = 1

for root, dirs, files in os.walk(main_folder):
    for file in files:
        if file.endswith('.html'):
            file_path = os.path.join(root, file)
            
            with open(file_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
                course_info = extract_info(html_content)

                # converting the dictionary to a dataframe
                df = pd.DataFrame([course_info])

                # defining the tsv file path in the tsv folder using the counter
                tsv_filename = f'course_{i}.tsv'
                tsv_path = os.path.join(tsv_folder, tsv_filename)

                # saving the dataframe as a tsv file
                df.to_csv(tsv_path, sep='\t', index=False)

                # incrementing the counter
                i += 1

In [13]:
# list to store each dataframe
dataframes = []

# loop through all tsv files in the folder
for file in os.listdir(tsv_folder):

    if file.endswith('.tsv'):

        file_path = os.path.join(tsv_folder, file)

        # read the tsv file into a dataframe
        df = pd.read_csv(file_path, sep='\t')

        # append the dataframe to the list
        dataframes.append(df)

# concatenate all dataframes into a single dataframe
df = pd.concat(dataframes, ignore_index=True)

In [14]:
df1 = df.copy() #to not lose df just in case

#### 2.0 Preprocessing


In [44]:
# preprocessing "fees" column

#extracting numeric value and currency into "numeric fees"
def extract_fees(fee_text):
    pattern = r'([\$€£]\s*\d+(?:,\d{3})*(?:\.\d+)?)'
    matches = re.findall(pattern, fee_text)

    if matches:
        # assuming first found currency applies to all amounts
        currency_symbol = matches[0][0]
        fee_numbers = [float(match.replace('$', '').replace('€', '').replace('£', '').replace(',', '')) for match in matches]
        return max(fee_numbers), currency_symbol
    return None, None

df1['numeric_fees'] = df1['fees'].apply(extract_fees)

In [22]:
def exchange_rates(currencies, api_key='89077096ab3dc6e3a2cfc1b5'):
    rates = {}

    for currency in currencies:
        url = f'https://api.exchangerate-api.com/v4/latest/{currency}?apiKey={api_key}'
        response = requests.get(url)  
        data = response.json()  

        if 'rates' in data:
            rates[currency] = data['rates']
    return rates  

def convert_currency(amount, from_currency, to_currency, exchange_rates):

    if from_currency in exchange_rates and to_currency in exchange_rates[from_currency]:
        return amount * exchange_rates[from_currency][to_currency]
    
    else:
        return None  

def convert_to_euro(row, exchange_rates):

    amount, currency_symbol = row['numeric_fees']

    currency_map = {'$': 'USD', '€': 'EUR', '£': 'GBP'}
    from_currency = currency_map.get(currency_symbol, 'EUR')

    # converting the amount if it's not NaN
    if pd.notna(amount):
        return convert_currency(amount, from_currency, 'EUR', exchange_rates)
    return None

needed_currencies = ['USD', 'GBP', 'EUR']
exchange_rates = exchange_rates(needed_currencies)

df1['fees_EUR'] = df1.apply(lambda row: convert_to_euro(row, exchange_rates), axis=1)

In [29]:
# preparing stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # checking if the text is string
    if not isinstance(text, str):
        # if non-string data
        return str(text) 

    # removing stopwords
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]

    # stemming and removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # converting text to lowercase
    text = text.lower()

    return text

df1['processed_description'] = df1['description'].apply(preprocess_text)


In [45]:
df1.head()

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,country,administration,url,numeric_fees,fees_EUR,processed_description
0,msc business analytics,durham university,durham university business school,full-time,our msc in business analytics will enable you ...,september,feesplease see the university website for furt...,msc,1 year,durham,united kingdom,on campus,https://www.findamasters.com/masters-degrees/c...,"(None, None)",,our msc in business analytics will enable you ...
1,human resource management msc,university of southampton,faculty of social sciences,,embark on a successful career in human resourc...,september,feesplease see the university website for furt...,msc,1 year,southampton,united kingdom,on campus,https://www.findamasters.com/masters-degrees/c...,"(None, None)",,embark on a successful career in human resourc...
2,data science - master of science (ms),university of colorado boulder,college of arts and sciences,,the on-campus master of science in data scienc...,see course,feesplease see the university website for furt...,msc,see programme,boulder,usa,on campus,https://www.findamasters.com/masters-degrees/c...,"(None, None)",,the oncampus master of science in data science...
3,drug discovery (msc),university of bath,department of life sciences,full-time,develop expertise and specialist practical ski...,september,feesplease see the university website for furt...,msc,1 year,bath,united kingdom,on campus,https://www.findamasters.com/masters-degrees/c...,"(None, None)",,develop expertise and specialist practical ski...
4,energy and environment - msc,university of leeds,school of chemical and process engineering,full-time,the sustainable use of energy is fundamental t...,september,"feesuk: £13,750 (total)international: £31,000 ...",msc,1 year,leeds,united kingdom,on campus,https://www.findamasters.com/masters-degrees/c...,"(31000.0, £)",35960.0,the sustainable use of energy is fundamental t...


#### 2.1. Conjunctive query


#### 2.1.1) Create your index!

In [33]:
def create_vocabulary(df):
    unique_words = set()

    for description in df['processed_description']:

        words = description.split()
        unique_words.update(words)

    # it creates dictionary to map each word to an id
    return {word: idx for idx, word in enumerate(sorted(unique_words))}

def create_inverted_index(df, vocabulary):
    inverted_index = defaultdict(list)

    for index, row in df.iterrows():

        words = row['processed_description'].split()

        for word in set(words):  

            term_id = vocabulary.get(word)  

            if term_id is not None:
                inverted_index[term_id].append(index)  
    
    # returns inverted index as a regular dictionary
    return dict(inverted_index)

vocabulary = create_vocabulary(df1)
inverted_index = create_inverted_index(df1, vocabulary)

# writing vocabulary.json, inverted_index.json
with open('vocabulary.json', 'w') as vocab_file:
    json.dump(vocabulary, vocab_file)

with open('inverted_index.json', 'w') as index_file:
    json.dump(inverted_index, index_file)


#### 2.1.2) Execute the query


In [42]:
def search_query(query, inverted_index, df, vocabulary):
    query_words = query.lower().split()

    document_lists = [inverted_index[vocabulary[word]] for word in query_words if word in vocabulary]

    if document_lists:
        common_documents = set(document_lists[0]).intersection(*document_lists[1:])
    else:
        common_documents = set()

    if common_documents:
        columns_to_show = ["courseName", "universityName", "description", "url"]
        results_df = df.loc[list(common_documents), columns_to_show]
    else:
        results_df = pd.DataFrame(columns=["courseName", "universityName", "description", "url"])

    return results_df

# example
user_query = "advanced knowledge"
search_results_df = search_query(user_query, inverted_index, df, vocabulary)

search_results_df.head()

Unnamed: 0,courseName,universityName,description,url
2048,Advanced Clinical Practice - MSc/PGDip/PGCert,University of Birmingham,Our aim at the University of Birmingham is to ...,https://www.findamasters.com/masters-degrees/c...
14,Energy Systems and Data Analytics (ESDA) MSc,University College London,Energy Systems and Data Analytics MSc provides...,https://www.findamasters.com/masters-degrees/c...
23,Master of Chemistry (Leuven),KU Leuven,Breakthroughs in chemistry can change the text...,https://www.findamasters.com/masters-degrees/c...
28,Master of Bioscience Engineering: Human Health...,KU Leuven,This master's degree will turn you into an int...,https://www.findamasters.com/masters-degrees/c...
4126,Master of Management in Artificial Intelligence,York University (Canada),Artificial Intelligence (AI) is undergoing a l...,https://www.findamasters.com/masters-degrees/c...
