## modulok importálása

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

## weboldal tartalmának beolvasása

In [2]:
# Wikipedia article that contains the table of Chicago community areas.
url = "https://en.wikipedia.org/wiki/Community_areas_in_Chicago"

# Wikipedia serves the page when a User-Agent header is sent;
# per the original author's note, the exact value does not matter.
headers = {"User-Agent": "Mozilla/5.0"}

In [3]:
# Download the wiki page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# (4xx/5xx) instead of silently parsing an error page below.
response = requests.get(url=url, headers=headers, timeout=30)
response.raise_for_status()

# Parse the complete page content into a searchable tree.
soup = BeautifulSoup(response.content, 'html.parser')

In [4]:
# prettify - szebben kiírja a beolvasott adatokat
# print(soup.prettify())

## táblázat kikeresése

In [5]:
# Collect every <table> element of the page (displayed as the cell output).
soup.find_all(name="table")

[<table class="wikitable sortable plainrowheaders mw-datatable" style="text-align:right">
 <caption>Chicago community areas by number, population, and area<sup class="reference" id="cite_ref-City_basics_9-0"><a href="#cite_note-City_basics-9"><span class="cite-bracket">[</span>8<span class="cite-bracket">]</span></a></sup>
 </caption>
 <tbody><tr>
 <th rowspan="2" scope="col">No.
 </th>
 <th rowspan="2" scope="col">Name
 </th>
 <th scope="col">Population
 </th>
 <th colspan="2" scope="col">Area<sup class="reference" id="cite_ref-CMAP_Area_10-0"><a href="#cite_note-CMAP_Area-10"><span class="cite-bracket">[</span>9<span class="cite-bracket">]</span></a></sup>
 </th>
 <th colspan="2" scope="col">Density
 </th></tr>
 <tr>
 <th><span style="font-size: 85%;"><style data-mw-deduplicate="TemplateStyles:r886047488">.mw-parser-output .nobold{font-weight:normal}</style><span class="nobold">(2023)<sup class="plainlinks noexcerpt noprint asof-tag update" style="display:none;"><a class="external te

In [6]:
# Print only the class attribute of each table: <table class="xyz xyz">...</table>.
# The loop variable is deliberately not named `table`: later cells read a
# global `table`, and re-running this cell would otherwise leave that name
# pointing at the last table of the page.
for candidate in soup.find_all("table"):
    print(candidate.get("class"))

['wikitable', 'sortable', 'plainrowheaders', 'mw-datatable']
['nowraplinks', 'mw-collapsible', 'autocollapse', 'navbox-inner']
['nowraplinks', 'mw-collapsible', 'autocollapse', 'navbox-inner']
['nowraplinks', 'hlist', 'mw-collapsible', 'autocollapse', 'navbox-inner']


In [7]:
# Look up the single table that holds the community-area data.
# A CSS selector matches the classes regardless of their order or of any extra
# classes, whereas passing the full string to class_ only matches that exact
# attribute value.
table = soup.select_one('table.wikitable.sortable.plainrowheaders.mw-datatable')
if table is None:
    # Fail here with a clear message rather than with an AttributeError
    # on `table.tbody` in a later cell.
    raise ValueError('community areas table not found on the page')

In [8]:
# Print the table row by row; each data row describes one community area.
# The first two rows are headers and the last row is a summary, so only the
# slice [2:-1] (third row through second-to-last) is printed.
for row in table.tbody.find_all("tr")[2:-1]:
    # prettify() renders the row's HTML in an indented, readable form;
    # a 100-character dashed line separates consecutive rows.
    print(row.prettify(), "-" * 100, sep="\n")

<tr>
 <td>
  01
 </td>
 <th scope="row">
  <a href="/wiki/Rogers_Park,_Chicago" title="Rogers Park, Chicago">
   Rogers Park
  </a>
 </th>
 <td style="text-align:right">
  54,388
 </td>
 <td style="text-align:right">
  1.84
 </td>
 <td style="text-align:right">
  4.77
 </td>
 <td style="text-align:right">
  29,558.7
 </td>
 <td style="text-align:right">
  11,412.61
 </td>
</tr>

----------------------------------------------------------------------------------------------------
<tr>
 <td>
  02
 </td>
 <th scope="row">
  <a href="/wiki/West_Ridge,_Chicago" title="West Ridge, Chicago">
   West Ridge
  </a>
 </th>
 <td style="text-align:right">
  78,227
 </td>
 <td style="text-align:right">
  3.53
 </td>
 <td style="text-align:right">
  9.14
 </td>
 <td style="text-align:right">
  22,160.62
 </td>
 <td style="text-align:right">
  8,556.22
 </td>
</tr>

----------------------------------------------------------------------------------------------------
<tr>
 <td>
  03
 </td>
 <th scope="ro

In [9]:
# Walk the data rows and print the text of the first <td> of each one:
# the area number.
for row in table.tbody.find_all("tr")[2:-1]:
    cells = row.find_all("td")
    number_text = cells[0].get_text(strip=True)  # strip=True drops surrounding whitespace
    print(number_text)

01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77


In [10]:
# Walk the data rows again, this time printing the text of the first <th>
# (the row header), which holds the community area name.
for row in table.tbody.find_all("tr")[2:-1]:
    h_cells = row.find_all("th")
    name_text = h_cells[0].get_text(strip=True)  # strip=True drops surrounding whitespace
    print(name_text)

Rogers Park
West Ridge
Uptown
Lincoln Square
North Center
Lake View
Lincoln Park
Near North Side
Edison Park
Norwood Park
Jefferson Park
Forest Glen
North Park
Albany Park
Portage Park
Irving Park
Dunning
Montclare
Belmont Cragin
Hermosa
Avondale
Logan Square
Humboldt Park
West Town
Austin
West Garfield Park
East Garfield Park
Near West Side
North Lawndale
South Lawndale
Lower West Side
Loop
Near South Side
Armour Square
Douglas
Oakland
Fuller Park
Grand Boulevard
Kenwood
Washington Park
Hyde Park
Woodlawn
South Shore
Chatham
Avalon Park
South Chicago
Burnside
Calumet Heights
Roseland
Pullman
South Deering
East Side
West Pullman
Riverdale
Hegewisch
Garfield Ridge
Archer Heights
Brighton Park
McKinley Park
Bridgeport
New City
West Elsdon
Gage Park
Clearing
West Lawn
Chicago Lawn
West Englewood
Englewood
Greater Grand Crossing
Ashburn
Auburn Gresham
Beverly
Washington Heights
Mount Greenwood
Morgan Park
O'Hare[11]
Edgewater


In [11]:
# Print number and name together for every data row, followed by a blank line.
for row in table.tbody.find_all("tr")[2:-1]:
    cells = row.find_all("td")    # numeric cells
    h_cells = row.find_all("th")  # text cells (row headers)
    print(cells[0].get_text(strip=True))  # strip=True drops surrounding whitespace
    # end="\n\n" terminates the name line and adds the empty separator line.
    print(h_cells[0].get_text(strip=True), end="\n\n")

01
Rogers Park

02
West Ridge

03
Uptown

04
Lincoln Square

05
North Center

06
Lake View

07
Lincoln Park

08
Near North Side

09
Edison Park

10
Norwood Park

11
Jefferson Park

12
Forest Glen

13
North Park

14
Albany Park

15
Portage Park

16
Irving Park

17
Dunning

18
Montclare

19
Belmont Cragin

20
Hermosa

21
Avondale

22
Logan Square

23
Humboldt Park

24
West Town

25
Austin

26
West Garfield Park

27
East Garfield Park

28
Near West Side

29
North Lawndale

30
South Lawndale

31
Lower West Side

32
Loop

33
Near South Side

34
Armour Square

35
Douglas

36
Oakland

37
Fuller Park

38
Grand Boulevard

39
Kenwood

40
Washington Park

41
Hyde Park

42
Woodlawn

43
South Shore

44
Chatham

45
Avalon Park

46
South Chicago

47
Burnside

48
Calumet Heights

49
Roseland

50
Pullman

51
South Deering

52
East Side

53
West Pullman

54
Riverdale

55
Hegewisch

56
Garfield Ridge

57
Archer Heights

58
Brighton Park

59
McKinley Park

60
Bridgeport

61
New City

62
West Elsdon

63
Ga

## list of dictionaries készítése a beolvasott adatokból

In [12]:
def _text_without_references(cell):
    """Return the stripped text of *cell*, leaving out footnote markers.

    Wikipedia renders footnote markers such as "[11]" inside
    <sup class="reference"> elements. Every text node located under such an
    element is skipped; the remaining nodes are stripped and concatenated,
    mirroring get_text(strip=True). The parsed tree is not modified.
    """
    return ''.join(
        text.strip()
        for text in cell.find_all(string=True)
        if text.find_parent('sup', class_='reference') is None
    )


data = []  # list of dictionaries, one per community area

for row in table.tbody.find_all('tr')[2:-1]:
    cells = row.find_all('td')
    h_cells = row.find_all('th')

    # Store the cleaned values in variables.
    area_code = cells[0].get_text(strip=True)  # strip=True drops surrounding whitespace
    # Plain get_text() would keep the footnote marker (e.g. "O'Hare[11]").
    community_area_name = _text_without_references(h_cells[0])

    # Append the extracted values to the list as key-value pairs.
    data.append({'area_code': area_code, 'community_area': community_area_name})

## dataframe készítése a list of dictionariesből

In [13]:
# Build the DataFrame from the list of dictionaries, then cast the area codes
# (scraped as text such as "01") to integers.
community_areas_df = pd.DataFrame(data)
community_areas_df = community_areas_df.astype({'area_code': 'int'})
# community_areas_df.head()

## kiíratás csv-be

In [14]:
# Write the table to a CSV file; index=False leaves out the DataFrame's row index.
community_areas_df.to_csv(path_or_buf='dim_community_areas.csv', index=False)