# Homework 3 - Master's Degrees from all over!

In [78]:
# import necessary libraries

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm
import time
import random
import concurrent.futures
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import regex as re
import string
from nltk.stem import PorterStemmer
import pickle 
from collections import defaultdict


1. Data collection


1.1. Get the list of master's degree courses


In [30]:
# extract URLs 

def extract_masters(this_url, retries=3, backoff_factor=1.5):
    wait_time = 1
    for attempt in range(retries):
        try:
            response = requests.get(this_url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                links = soup.find_all('a', {'class': 'courseLink'})
                return [(urljoin(this_url, link['href']), link.text.strip()) for link in links]
            elif response.status_code == 429:
                print(f"Rate limit hit")
                time.sleep(wait_time)
                wait_time *= backoff_factor
            else:
                print(f"Failed to fetch {this_url}: Status code {response.status_code}")
                return []
        except requests.RequestException as e:
            print(f"Error occurred while fetching {this_url}: {e}")
            return []
        wait_time = 30  # reset wait time to 30 seconds for each retry

    print(f"Failed to fetch {this_url} after {retries} retries.")
    return []

base_url = 'https://www.findamasters.com/masters-degrees/msc-degrees/'
all_links = []

# collecting URLs from 400 pages and save them to URL.txt
for page in tqdm(range(1, 401), desc="Collecting URLs"):
    page_url = f"{base_url}?PG={page}"
    links = extract_masters(page_url)
    all_links.extend(links)

with open('URL.txt', 'w') as file:
    for url, _ in all_links:
        file.write(url + '\n')

#print that everything is okay
print("URLs have been collected and saved to URL.txt.")

Collecting URLs:   0%|          | 0/400 [00:00<?, ?it/s]

Collecting URLs: 100%|██████████| 400/400 [1:02:08<00:00,  9.32s/it]

URLs have been collected and saved to URL.txt.





1.2. Crawl master's degree pages

The code below has some errors and thus does not work properly/optimal. However, it did download all the necessary HTML files into according folders.

In [74]:
# base_website_url = 'https://www.findamasters.com'
# base_url = 'https://www.findamasters.com/masters-degrees/msc-degrees/'


# def save_html(url, content, folder, file_name):
#     try:
#         os.makedirs(folder, exist_ok=True)
#         file_path = os.path.join(folder, file_name + ".html")
#         with open(file_path, 'w', encoding='utf-8') as file:
#             file.write(content)
#         # print(f"successfully saved")
#     except Exception as e:
#         # in case of error
#         print(f"error saving file {file_path}: {e}")

# def download_html(index, url, base_url, main_folder, urls_per_page, max_retries=5):
#     page_number = index // urls_per_page + 1
#     page_folder = os.path.join(main_folder, f"Page_{page_number}")

#     if not url.startswith(('http://', 'https://')):
#         url = urljoin(base_url, url)

#     retries = 0
#     wait_time = 30  # starting with a 30-second wait time

#     while retries < max_retries:
#         try:
#             response = requests.get(url)
#             if response.status_code == 200:
#                 file_name = f"url{index + 1}"  # naming file as "url{iteration_number}.html"
#                 save_html(url, response.text, page_folder, file_name)
#                 return f"Downloaded: {url}"
#             # if website is blocking
#             elif response.status_code == 429:
#                 wait_time += 10
#                 # print(f"rate limit hit")
#                 time.sleep(wait_time)
#                 retries += 1
#             else:
#                 return f"Failed to download {url}: Status code {response.status_code}"
#         except requests.RequestException as e:
#             return f"Error occurred while downloading {url}: {e}"

#     return f"Failed to download {url} after {max_retries} retries."

# # read URLs from URL.txt
# with open('URL.txt', 'r') as file:
#     urls = file.read().splitlines()

# main_folder = 'HTML_pages'
# urls_per_page = 15  # number of URLs per page

# # parallel downloading of HTML pages
# with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
#     futures = [executor.submit(download_html, index, url, base_url, main_folder, urls_per_page) for index, url in enumerate(urls)]
#     for future in tqdm(concurrent.futures.as_completed(futures), total=len(urls)):
#         print(future.result())

# print("HTML pages have been downloaded and saved.")


1.3 Parse downloaded pages


In [13]:
#parsing

def extract_info(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # extract course name
    cname_tag = soup.find("h1", class_="text-white course-header__course-title")
    courseName = cname_tag["data-permutive-title"] if cname_tag else " "

    # extract university name
    uname = soup.find('a', class_='course-header__institution')
    universityName = uname.get_text(strip=True) if uname else " "

    # extract faculty name
    fname = soup.find('a', class_='course-header__department')
    facultyName = fname.get_text(strip=True) if fname else " "

    # extract part/fulltime
    ftime = soup.find('span', class_='key-info__duration')
    isItFullTime = " "
    if ftime:
        ftime_text = ftime.get_text(strip=True)
        if "full-time" in ftime_text.lower() or "full time" in ftime_text.lower() or "fulltime" in ftime_text.lower():
            isItFullTime = "full-time"
        elif "part-time" in ftime_text.lower() or "part time" in ftime_text.lower() or "parttime" in ftime_text.lower():
            isItFullTime = "part-time"

    # extract description
    desc = soup.find('div', id='Snippet')
    description = ' '.join(p.get_text(strip=True) for p in desc.find_all('p')) if desc else " "

    # extract startdate
    sdate = soup.find('span', class_='key-info__start-date')
    startDate = sdate.get_text(strip=True) if sdate else " "

    # extract fees
    fees_div = soup.find('div', class_='course-sections course-sections__fees tight col-xs-24')
    fees = fees_div.get_text(strip=True) if fees_div else " "

    # extract modality
    modality_info = soup.find('span', class_='key-info__qualification')
    modality = modality_info.find('a').get_text(strip=True) if modality_info and modality_info.find('a') else ' '

    # extract duration
    duration_tag = soup.find('span', class_='key-info__duration')
    duration = ' '.join(duration_tag.get_text(strip=True).split()[:2]) if duration_tag else " "

    # extract city
    city_info = soup.find('a', class_='course-data__city')
    city = city_info.get_text(strip=True) if city_info else ' '

    # extract country
    country_info = soup.find('a', class_='course-data__country')
    country = country_info.get_text(strip=True) if country_info else ' '

    # extract administration
    administration_info = soup.find('a', class_='course-data__on-campus')
    administration = administration_info.get_text(strip=True) if administration_info else ' '

    # extract URL
    link = soup.find('link', rel='canonical')
    url = link['href'] if link and 'href' in link.attrs else ' '

    return {
        'courseName': courseName,
        'universityName': universityName,
        'facultyName': facultyName,
        'isItFullTime': isItFullTime,
        'description': description,
        'startDate': startDate,
        'fees': fees,
        'modality': modality,
        'duration': duration,
        'city': city,
        'country': country,
        'administration': administration,
        'url': url
    }

In [16]:
main_folder = 'HTML_pages'  #html files are stored
tsv_folder = 'tsv' #tsv files directory

#making sure files exist
os.makedirs(tsv_folder, exist_ok=True)

# initializing counter for the file naming
i = 1

for root, dirs, files in os.walk(main_folder):
    for file in files:
        if file.endswith('.html'):
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
                course_info = extract_info(html_content)

                # converting the dictionary to a dataframe
                df = pd.DataFrame([course_info])

                # defining the tsv file path in the tsv folder using the counter
                tsv_filename = f'course_{i}.tsv'
                tsv_path = os.path.join(tsv_folder, tsv_filename)

                # saving the dataframe as a tsv file
                df.to_csv(tsv_path, sep='\t', index=False)

                # incrementing the counter
                i += 1

In [18]:
# list to store each dataframe
dataframes = []

# loop through all tsv files in the folder
for file in os.listdir(tsv_folder):
    if file.endswith('.tsv'):
        file_path = os.path.join(tsv_folder, file)
        # read the tsv file into a dataframe
        df = pd.read_csv(file_path, sep='\t')
        # append the dataframe to the list
        dataframes.append(df)

# concatenate all dataframes into a single dataframe
df = pd.concat(dataframes, ignore_index=True)

In [21]:
df1 = df.copy() #to not lose df just in case

2.0 Preprocessing


In [39]:
# preprocessing "fees" column

#extracting numeric value and currency into "numeric fees"
def extract_fees(fee_text):
    pattern = r'([\$€£]\s*\d+(?:,\d{3})*(?:\.\d+)?)'
    matches = re.findall(pattern, fee_text)

    if matches:
        # assuming first found currency applies to all amounts
        currency_symbol = matches[0][0]
        fee_numbers = [float(match.replace('$', '').replace('€', '').replace('£', '').replace(',', '')) for match in matches]
        return max(fee_numbers), currency_symbol
    return None, None

df1['numeric_fees'] = df1['fees'].apply(extract_fees)
df1.head()

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,country,administration,url,numeric_fees
0,MSc Business Analytics,Durham University,Durham University Business School,full-time,Our MSc in Business Analytics will enable you ...,September,FeesPlease see the university website for furt...,MSc,1 year,Durham,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...,"(None, None)"
1,Human Resource Management MSc,University of Southampton,Faculty of Social Sciences,,Embark on a successful career in human resourc...,September,FeesPlease see the university website for furt...,MSc,1 Year,Southampton,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...,"(None, None)"
2,Data Science - Master of Science (MS),University of Colorado Boulder,College of Arts and Sciences,,The on-campus Master of Science in Data Scienc...,See Course,FeesPlease see the university website for furt...,MSc,See Programme,Boulder,USA,On Campus,https://www.findamasters.com/masters-degrees/c...,"(None, None)"
3,Drug Discovery (MSc),University of Bath,Department of Life Sciences,full-time,Develop expertise and specialist practical ski...,September,FeesPlease see the university website for furt...,MSc,1 year,Bath,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...,"(None, None)"
4,Energy and Environment - MSc,University of Leeds,School of Chemical and Process Engineering,full-time,The sustainable use of energy is fundamental t...,September,"FeesUK: £13,750 (Total)International: £31,000 ...",MSc,1 year,Leeds,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...,"(31000.0, £)"


In [49]:
#function to convert currency to euro
def convert_currency(amount, from_currency, to_currency='EUR'):
    api_key = '89077096ab3dc6e3a2cfc1b5'
    url = f'https://api.exchangerate-api.com/v4/latest/{from_currency}?apiKey={api_key}'
    response = requests.get(url)
    data = response.json()

    if 'rates' in data and to_currency in data['rates']:
        return amount * data['rates'][to_currency]
    else:
        return None

# preprocessing function
def convert_to_euro(row):
    amount, currency_symbol = row['numeric_fees']

    # handling different currency symbols
    currency_map = {'$': 'USD', '€': 'EUR', '£': 'GBP'}
    from_currency = currency_map.get(currency_symbol, 'EUR')

    # convert only if amount is not NaN
    if pd.notna(amount):
        return convert_currency(amount, from_currency, 'EUR')
    return None

df1['fees_EUR'] = df1.apply(convert_to_euro, axis=1)

In [66]:
# preparing stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # checking if the text is string
    if not isinstance(text, str):
        # if non-string data
        return str(text) 

    # removing stopwords
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]

    # stemming and removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    return text

df1['processed_description'] = df1['description'].apply(preprocess_text)

In [70]:
#lowercasing
for column in df.columns:
    if df1[column].dtype == object:
        df1[column] = df1[column].str.lower()

In [77]:
df1.head()

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,country,administration,url,numeric_fees,fees_EUR,processed_description
0,msc business analytics,durham university,durham university business school,full-time,our msc in business analytics will enable you ...,september,feesplease see the university website for furt...,msc,1 year,durham,united kingdom,on campus,https://www.findamasters.com/masters-degrees/c...,"(None, None)",,our msc in business analytics will enable you ...
1,human resource management msc,university of southampton,faculty of social sciences,,embark on a successful career in human resourc...,september,feesplease see the university website for furt...,msc,1 year,southampton,united kingdom,on campus,https://www.findamasters.com/masters-degrees/c...,"(None, None)",,embark on a successful career in human resourc...
2,data science - master of science (ms),university of colorado boulder,college of arts and sciences,,the on-campus master of science in data scienc...,see course,feesplease see the university website for furt...,msc,see programme,boulder,usa,on campus,https://www.findamasters.com/masters-degrees/c...,"(None, None)",,the oncampus master of science in data science...
3,drug discovery (msc),university of bath,department of life sciences,full-time,develop expertise and specialist practical ski...,september,feesplease see the university website for furt...,msc,1 year,bath,united kingdom,on campus,https://www.findamasters.com/masters-degrees/c...,"(None, None)",,develop expertise and specialist practical ski...
4,energy and environment - msc,university of leeds,school of chemical and process engineering,full-time,the sustainable use of energy is fundamental t...,september,"feesuk: £13,750 (total)international: £31,000 ...",msc,1 year,leeds,united kingdom,on campus,https://www.findamasters.com/masters-degrees/c...,"(31000.0, £)",35960.0,the sustainable use of energy is fundamental t...


2.1. Conjunctive query


2.1.1) Create your index!

In [79]:
words = set()
df1['processed_description'].apply(lambda row: words.update(row.split()))

# creating vocabulary,mapping each word to integer

vocabulary = {word: idx for idx, word in enumerate(sorted(words), start=1)}

# saving the vocabulary as file
with open('vocabulary.pkl', 'wb') as vocab_file:
    pickle.dump(vocabulary, vocab_file)

In [81]:
# empty defaultdict for the inverted index

inverted_index = defaultdict(list)

# iterating over df1 to populate inverted index

for document_id, row in df1.iterrows():
    description = row['processed_description']
    for word in description.split():
        term_id = vocabulary[word]
        if document_id not in inverted_index[term_id]:
            inverted_index[term_id].append(document_id)

# saving inverted index to file
with open('inverted_index.pkl', 'wb') as index_file:
    pickle.dump(dict(inverted_index), index_file)

2.1.2) Execute the query


In [84]:
# loading the inverted index and vocabulary
with open('inverted_index.pkl', 'rb') as index_file, open('vocabulary.pkl', 'rb') as vocab_file:
    inverted_index = pickle.load(index_file)
    vocabulary = pickle.load(vocab_file)

# processing the query
def process_query(query):
    return query.lower().split()

# retrieving documents for the query
def search_documents(query):
    query_words = process_query(query)
    query_ids = [vocabulary[word] for word in query_words if word in vocabulary]
    
    # finding the intersection of document lists
    doc_lists = [set(inverted_index[term_id]) for term_id in query_ids]
    relevant_docs = set.intersection(*doc_lists) if doc_lists else set()
    
    # retrieving details for each relevant document
    results = df.loc[df.index.isin(relevant_docs), ['courseName', 'universityName', 'description', 'url']]
    return results

# example query
query = "advanced knowledge"
search_results = search_documents(query)
search_results

Unnamed: 0,courseName,universityName,description,url
14,energy systems and data analytics (esda) msc,university college london,energy systems and data analytics msc provides...,https://www.findamasters.com/masters-degrees/c...
23,master of chemistry (leuven),ku leuven,breakthroughs in chemistry can change the text...,https://www.findamasters.com/masters-degrees/c...
28,master of bioscience engineering: human health...,ku leuven,this master's degree will turn you into an int...,https://www.findamasters.com/masters-degrees/c...
32,advanced clinical practice (postgraduate certi...,university of gloucestershire,this programme enables experienced healthcare ...,https://www.findamasters.com/masters-degrees/c...
44,investment management and financial analysis -...,lancaster university,the investment management and financial analys...,https://www.findamasters.com/masters-degrees/c...
...,...,...,...,...
5608,mdp in mechanical engineering: smart systems,university of turku,"in the smart systems track, you will have acce...",https://www.findamasters.com/masters-degrees/c...
5613,information technology - msc,university of glasgow,the masters in information technology is an in...,https://www.findamasters.com/masters-degrees/c...
5627,mathematical engineering - mathematical modell...,university of padua,themaster’s degree in mathematical engineering...,https://www.findamasters.com/masters-degrees/c...
5645,biomedical engineering - msc,university of west london,if you want to contribute towards transforming...,https://www.findamasters.com/masters-degrees/c...
