In [2]:
import requests as req
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
page= req.get("https://en.wikipedia.org/wiki/Samsung")
if page.status_code == 200:
    print("Page downloaded successfully!")


Page downloaded successfully!


In [4]:
soup = BeautifulSoup(page.content, 'html.parser')

In [6]:
link_url = [a['href'] for a in soup.find_all('a', href=True)]
link_text = [a.text for a in soup.find_all('a', href=True)]
print(link_url)
print(link_text)

text_and_links = pd.DataFrame(list(zip(link_text, link_url)), columns=['Text', 'Link'])
text_and_links

['#bodyContent', '/wiki/Main_Page', '/wiki/Wikipedia:Contents', '/wiki/Portal:Current_events', '/wiki/Special:Random', '/wiki/Wikipedia:About', '//en.wikipedia.org/wiki/Wikipedia:Contact_us', '/wiki/Special:SpecialPages', '/wiki/Help:Contents', '/wiki/Help:Introduction', '/wiki/Wikipedia:Community_portal', '/wiki/Special:RecentChanges', '/wiki/Wikipedia:File_upload_wizard', '/wiki/Main_Page', '/wiki/Special:Search', 'https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en', '/w/index.php?title=Special:CreateAccount&returnto=Samsung', '/w/index.php?title=Special:UserLogin&returnto=Samsung', 'https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en', '/w/index.php?title=Special:CreateAccount&returnto=Samsung', '/w/index.php?title=Special:UserLogin&returnto=Samsung', '/wiki/Help:Introduction', '/wiki/Special:MyContributions', '/wiki/Special:MyTalk', '#', '#Meaning_of_the_name', '#History

Unnamed: 0,Text,Link
0,Jump to content,#bodyContent
1,Main page,/wiki/Main_Page
2,Contents,/wiki/Wikipedia:Contents
3,Current events,/wiki/Portal:Current_events
4,Random article,/wiki/Special:Random
...,...,...
2072,\n\n,#
2073,\n\n,#
2074,\n\n,#
2075,\n\n,#


In [7]:
infobox = soup.find('table', class_='infobox')

if infobox:
    rows = infobox.find_all('tr')
    data = {}
    for row in rows: 
        header = row.find('th')
        value = row.find('td')
        if header and value:
            data[header.text.strip()] = value.text.strip()
    print(data)
else: print("No infobox found")

{'Native name': '삼성그룹', 'Company type': 'Private', 'Industry': 'Conglomerate', 'Founded': '1\xa0March 1938 (86 years ago)\xa0(1938-03-01) in Taikyu, Korea, Empire of Japan', 'Founder': 'Lee Byung-chul', 'Headquarters': 'Samsung Town, Seoul, South\xa0Korea', 'Area served': 'Worldwide', 'Key people': 'Lee Jae-yong (Chairman)', 'Subsidiaries': 'Cheil WorldwideSamsung Asset ManagementSamsung BiologicsSamsung C&T CorporationSamsung Electro-MechanicsSamsung ElectronicsSamsung EngineeringSamsung Fire & Marine InsuranceSamsung Heavy IndustriesSamsung Life InsuranceSamsung SDISamsung SDSSamsung Securities', 'Website': 'samsung.com'}


In [8]:
info_table = pd.DataFrame(list(data.items()), columns=['Attribute', 'Value'])
info_table

Unnamed: 0,Attribute,Value
0,Native name,삼성그룹
1,Company type,Private
2,Industry,Conglomerate
3,Founded,1 March 1938 (86 years ago) (1938-03-01) in Ta...
4,Founder,Lee Byung-chul
5,Headquarters,"Samsung Town, Seoul, South Korea"
6,Area served,Worldwide
7,Key people,Lee Jae-yong (Chairman)
8,Subsidiaries,Cheil WorldwideSamsung Asset ManagementSamsung...
9,Website,samsung.com


In [9]:
topics = [heading.text.strip() for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
topics

['Contents',
 'Samsung',
 'Meaning of the name',
 'History',
 '1938–1970',
 '1970–1990',
 '1990–2000',
 '2000–present',
 'Influence in South Korea',
 'Operations',
 'Leadership',
 'Affiliates',
 'Divested',
 'Defunct',
 'Joint ventures',
 'Partially owned companies',
 'Acquisitions and attempted acquisitions',
 'Major clients',
 'Shell plc',
 'United Arab Emirates government',
 'Ontario government',
 'Corporate image',
 'Audio logo',
 'Font',
 'Sponsorships',
 'In Vietnam',
 'Controversies',
 'Labor abuses',
 'Union-busting activity',
 '2007 slush fund scandal',
 "Lee Kun-hee's prostitution scandal",
 '2017 bribery scandal',
 'Supporting far-right groups',
 'Price fixing',
 'Misleading claims',
 'References',
 'External links']

In [10]:
samsung_topics = pd.DataFrame(topics, columns=['Topic'])
samsung_topics

Unnamed: 0,Topic
0,Contents
1,Samsung
2,Meaning of the name
3,History
4,1938–1970
5,1970–1990
6,1990–2000
7,2000–present
8,Influence in South Korea
9,Operations


In [11]:
excel_file = 'Web_Scraping_Samsung_Wiki.xlsx'
with pd.ExcelWriter(excel_file) as writer:
    text_and_links.to_excel(writer, sheet_name='Samsung Links', index=False)
    info_table.to_excel(writer, sheet_name='Samsung Info', index=False)
    samsung_topics.to_excel(writer, sheet_name='Samsung Topics', index=False)
print(f"Dataframes saved to {excel_file} successfully!")

Dataframes saved to Web_Scraping_Samsung_Wiki.xlsx successfully!
