<font style='font-size:1.5em'>**✏️ W06 Summative - Data Collection** </font>

<font style='font-size:0.75em'>DS105W – Data for Data Science</font>

**Author:**  <28109>

**Objectives:**
- To collect data from allocated web pages
- To put the data into a structured tabular format
- To save the data into a CSV file


# ⚙️ SETUP

In [45]:
#Importing libraries
import numpy as np
import pandas as pd
import os
import requests
import json
import csv
import shutil
from scrapy import Selector
from urllib.parse import urlparse


In [28]:
CANDIDATE_NUMBER = 28109

**Discover your unique assignment**

In [29]:
import sys
sys.path.append('../')

from lse import print_designated_lse_department

print_designated_lse_department(CANDIDATE_NUMBER)

Your designated department is: Department of International Relations
URL: http://www.lse.ac.uk/International-Relations


-----------------

# Generative AI acknowledgment.

In compliance with the DS105W 🤖 [Generative AI Policy](https://lse-dsi.github.io/DS105/2023/winter-term/generative-ai.html), I declare that...



# Task 1 - Collecting Menus


In [30]:
#Scraping the IR Webpage
my_url = 'https://www.lse.ac.uk/international-relations'
response = requests.get(my_url) #This sends a GET request to the website
response.status_code

if not response.status_code == 200: #Check the status code, 200 means OK, anything else means something went wrong
    print('Something went wrong, status code:', response.status_code)
else:
    print('Everything is OK, status code:', response.status_code)
    # print the HTML code
print(response.text[:100])


Everything is OK, status code: 200
<!DOCTYPE html>

<html lang="en-GB">
  
  
<head>
	<!-- Page-hiding snippet -->
	<style>.async-hide 


In [31]:
sel = Selector(text=response.text) #Parsing the HTML code using Scrapy Selector
sel.get() #This selector now holds all the webpage html
#print(sel.get())
print('Number of elements on the page with CSS:', len(sel.css('*'))) #Printing the number of elements on the IR webpage

sel.css('nav.departmentNav')
nav = (sel.css('nav.departmentNav'))

Number of elements on the page with CSS: 480


In [32]:
# Extract subheadings using CSS selector
subheadings = sel.css('nav.departmentNav ::text').extract()
subheadingnames = subheadings[2:24:2] # Extract even-indexed elements starting from the third one, to eliminate blank text
print(subheadingnames)
print(len(subheadingnames)) #checking the length

#Extracting the links
parsed_links = nav[0].css('a::attr(href)').extract() #this is for the parsed links
link_prefix = 'https://www.lse.ac.uk/' #link prefix for LSE website
complete_links = [link_prefix + link for link in parsed_links]
print(complete_links)
print(len(complete_links)) #checking the length of list is the same as the edited text (subheadings)

['About us', 'News', 'Study', 'Events', 'People', 'Research', 'Current Students', 'Alumni', 'PhD Job Market', 'Centres and units', 'Contact us']
11
['https://www.lse.ac.uk//international-relations/about-us', 'https://www.lse.ac.uk//international-relations/news', 'https://www.lse.ac.uk//international-relations/study/study', 'https://www.lse.ac.uk//international-relations/ir-events', 'https://www.lse.ac.uk//international-relations/people', 'https://www.lse.ac.uk//international-relations/research', 'https://www.lse.ac.uk//international-relations/current-students', 'https://www.lse.ac.uk//international-relations/alumni', 'https://www.lse.ac.uk//international-relations/phd-job-market', 'https://www.lse.ac.uk//international-relations/centres-and-units', 'https://www.lse.ac.uk//international-relations/contact-us']
11


In [33]:
##Creating a dictionary and saving as json file:
import json
def menu(subheadingnames, complete_links):
    result_dict = dict(zip(subheadingnames, complete_links))
    with open('menu.json', 'w') as json_file:
        json.dump(result_dict, json_file, indent =4)

menu(subheadingnames, complete_links)
file_path = "../notebooks/menu.json"

# Task 2 - Scraping the Schedule

In [37]:
#Reading the json file
file_path = "../notebooks/menu.json"
with open(file_path, 'r') as file:
    menu = json.load(file)
    print(menu)

{'About us': 'https://www.lse.ac.uk//international-relations/about-us', 'News': 'https://www.lse.ac.uk//international-relations/news', 'Study': 'https://www.lse.ac.uk//international-relations/study/study', 'Events': 'https://www.lse.ac.uk//international-relations/ir-events', 'People': 'https://www.lse.ac.uk//international-relations/people', 'Research': 'https://www.lse.ac.uk//international-relations/research', 'Current Students': 'https://www.lse.ac.uk//international-relations/current-students', 'Alumni': 'https://www.lse.ac.uk//international-relations/alumni', 'PhD Job Market': 'https://www.lse.ac.uk//international-relations/phd-job-market', 'Centres and units': 'https://www.lse.ac.uk//international-relations/centres-and-units', 'Contact us': 'https://www.lse.ac.uk//international-relations/contact-us'}


In [39]:
import requests
from scrapy import Selector  # Make sure you have the necessary import statement

def get_page_info(url):
    """
    Scrapes information from the International Relations LSE Webpage.

    Parameters:
    - url (str): The URL of the webpage to be scraped.

    Returns:
    list of dict: A list containing dictionaries with information extracted from the page.
    Each dictionary includes the following keys: Department, Page Title, Box Title, Box Subtitle, Box Text, Box Image URL and Box Image Alt.
    If any of these are not applicable / present, the dataframe entry is 'None'. 
    """
    response = requests.get(url)
    sel = Selector(text=response.text)  
    containers = sel.css('div.component__details').xpath('..')
    output = []
    for box in containers:
        #print(box.extract())
        titles = box.css('h2 span ::text').extract_first() 
        subtitles = box.css('h2').xpath('text()').extract_first()
        subtitles = subtitles if subtitles is not None and subtitles.strip() != '' else 'None'
        texts = box.css('p.component__summary ::text').extract_first() or 'None'
        parsed_links = box.css('img::attr(src)').extract()  # this is for the parsed links
        link_prefix = 'https://www.lse.ac.uk'  # link prefix for LSE website
        image_urls = link_prefix + parsed_links[0] if parsed_links else 'None'
        image_alts = box.css('img::attr(alt)').extract_first() or 'None'
        
        output.append({ 
            'Department': 'International Relations', 'Page Title': subheadingnames, 'Box Title': titles, 'Box Subtitle': subtitles, 'Box Text': texts, "Box Image Url": image_urls, 'Box Image Alt': image_alts})
    
    return output


In [40]:
def scrape_menu(menu):
    """
    Scrapes information for each link in the provided menu using the get_page_info function.

    Parameters:
    - menu (dict): A dictionary where keys are subheading names and values are link URLs, for each of the different subheadings.

    Returns:
    pd.DataFrame: A DataFrame containing the scraped information for each link in the menu, such as Department, Page Title, Box Title, etc.
    """

    all_data = []

    for subheading, link_url in menu.items():
        page_info = get_page_info(link_url)
        
        # Adding the page url to each entry
        for item in page_info:
            item['Page URL'] = link_url
            item['Page Title'] = subheading
            all_data.append(item)

    df = pd.DataFrame(all_data)
    return df

# Assuming you have the 'menu' dictionary already defined
menu = {'About us': 'https://www.lse.ac.uk//international-relations/about-us',
 'News': 'https://www.lse.ac.uk//international-relations/news',
 'Study': 'https://www.lse.ac.uk//international-relations/study/study',
 'Events': 'https://www.lse.ac.uk//international-relations/ir-events',
 'People': 'https://www.lse.ac.uk//international-relations/people',
 'Research': 'https://www.lse.ac.uk//international-relations/research',
 'Current Students': 'https://www.lse.ac.uk//international-relations/current-students',
 'Alumni': 'https://www.lse.ac.uk//international-relations/alumni',
 'PhD Job Market': 'https://www.lse.ac.uk//international-relations/phd-job-market',
 'Centres and units': 'https://www.lse.ac.uk//international-relations/centres-and-units',
 'Contact us': 'https://www.lse.ac.uk//international-relations/contact-us'}

# Call the function to scrape information for each link in the menu
result_df = scrape_menu(menu)
result_df.head()

Unnamed: 0,Department,Page Title,Box Title,Box Subtitle,Box Text,Box Image Url,Box Image Alt,Page URL
0,International Relations,About us,Foundation and history of the IR Department,Learn more about our history,,https://www.lse.ac.uk/international-relations/...,HoughtonStreetOld-16-9,https://www.lse.ac.uk//international-relations...
1,International Relations,About us,David Davies of Llandinam Research Fellowship,Find out more,,https://www.lse.ac.uk/international-relations/...,David_Davies_DINAM-16-9,https://www.lse.ac.uk//international-relations...
2,International Relations,About us,Martin Wight memorial lecture,Find out more about the life and works of Mar...,,https://www.lse.ac.uk/international-relations/...,wight-martin-747x420-16-9,https://www.lse.ac.uk//international-relations...
3,International Relations,News,IR events and podcasts,Find out what's happening in the IR Departmen...,,https://www.lse.ac.uk/international-relations/...,event-world-disorders-747x420-16-9,https://www.lse.ac.uk//international-relations...
4,International Relations,News,IR Alumni community,Find out more about our alumni community and ...,,https://www.lse.ac.uk/international-relations/...,alumni-arms-round-747x420-16-9,https://www.lse.ac.uk//international-relations...


In [41]:
result_df.to_csv('../data/boxes.csv', index=False) #saving df as a csv file called 'boxes

# Task 3 - Downloading Images


In [42]:
#Reading the menu dictionary from menu json file and reading the boxes.csv file
file_path = "../notebooks/menu.json"
with open(file_path, 'r') as file:
    menu = json.load(file)

file_path_2 = "../data/boxes.csv"
with open(file_path_2, 'r') as csv_file:
    csv_reader = csv.reader(csv_file)


In [None]:
# Create folders
images_folder_path = "../data/images"
menu_subheadings = ['About us', 'News', 'Study', 'Events', 'People', 'Research', 'Current Students', 'Alumni', 'PhD Job Market', 'Centres and units', 'Contact us']

for item in menu_subheadings:
    sub_folder_path = os.path.join(images_folder_path, item.lower().replace(' ', '_'))
    if not os.path.exists(sub_folder_path):
        os.makedirs(sub_folder_path)
        print(f"Sub-folder '{sub_folder_path}' created successfully.")
    else:
        print(f"Sub-folder '{sub_folder_path}' already exists.")

In [47]:
result_df

# Loop through each row in the DataFrame
for index, row in result_df.iterrows():
    subheading = row['Page Title']
    sub_folder_path = os.path.join(images_folder_path, subheading.lower().replace(' ', '_'))
    image_link = row['Box Image Url']

    # Check if the image link is not 'None' and the subfolder exists
    if image_link != 'None' and os.path.exists(sub_folder_path):
        # Create a filename based on the image link (you may need to customize this)
        filename = os.path.join(sub_folder_path, os.path.basename(image_link))

        # Download and save the image using shutil
        response = requests.get(image_link, stream=True)
        with open(filename, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)

        print(f"Image saved to '{filename}'")
    else:
        print(f"Skipping row {index + 1}: Image link is 'None' or subfolder does not exist.")


Image saved to '../data/images/about_us/HoughtonStreetOld-16-9.jpg'
Image saved to '../data/images/about_us/David-Davies-DINAM-16-9.jpg'
Image saved to '../data/images/about_us/wight-martin-747x420-16-9.jpg'
Image saved to '../data/images/news/event-world-disorders-747x420-16-9.jpg'
Image saved to '../data/images/news/alumni-arms-round-747x420-16-9.jpg'
Image saved to '../data/images/study/programmes-study-egi-747x420-16-9.jpg'
Image saved to '../data/images/study/globe-old-building-discover-lse-EGI-747x420.jpg'
Image saved to '../data/images/study/centre-building-747x420.jpg'
Image saved to '../data/images/study/video-camera-747x420-16-9.jpg'
Image saved to '../data/images/study/current-students-IRD-EGI-747x420-16-9.jpg'
Image saved to '../data/images/study/lightbulb-blackboard-747x420-16-9.jpg'
Image saved to '../data/images/events/podcast-mic-flowers-747x420-16-9.jpg'
Image saved to '../data/images/events/podcast-kit-747x420-16-9.jpg'
Image saved to '../data/images/events/podcast-mi