## XML 데이터 읽기

### 파이썬 표준 라이브러리 xml 모듈

In [None]:
import xml.etree.ElementTree as ET

In [None]:
# Parse the sample document from disk and keep a handle on its top-level element.
tree = ET.parse('sample.xml')
root = tree.getroot()
print('루트 엘리먼트 : {}'.format(root.tag))

# tostring() with a byte encoding returns bytes, so decode before printing
# to show the markup as readable text.
xml_bytes = ET.tostring(root, encoding='utf-8')
print(str(xml_bytes, 'utf-8'))

루트 엘리먼트 : customers
<customers>
    <customer>
        <name>홍길동</name>
        <address>서울 강남구</address>
    </customer>
    <customer>
        <name>고길동</name>
        <address>서울 강북구</address>
    </customer>
    <customer>
        <name>김길동</name>
        <address>서울 서초구</address>
    </customer>
</customers>


In [None]:
# find() hands back the first element matching the path as an Element object,
# so the tag and the text are read from its attributes.
name = root.find('.//name')
print(name)
print('{}:{}'.format(name.tag, name.text))

# findtext() skips the Element and returns the matching element's text directly.
name2 = root.findtext('.//name')
print(name2)

<Element 'name' at 0x0000026AE6D31C60>
name:홍길동
홍길동


In [None]:
# Gather every <name> element found anywhere below the root into a list.
names = list(root.iterfind('.//name'))
print(names)

[<Element 'name' at 0x0000026AE6D31C60>, <Element 'name' at 0x0000026AE6D30D60>, <Element 'name' at 0x0000026AE6D30630>]


In [None]:
# Report how many elements were collected, then show each one's type and content.
name_count = len(names)
print('len : ', name_count)
for name in names:
    print(type(name))
    print('{}:{}'.format(name.tag, name.text))

len :  3
<class 'xml.etree.ElementTree.Element'>
name:홍길동
<class 'xml.etree.ElementTree.Element'>
name:고길동
<class 'xml.etree.ElementTree.Element'>
name:김길동


### lxml 라이브러리

- lxml 라이브러리
    - 매우 빠른 파싱 속도를 제공하고, 대규모 XML 문서 처리에 적합
    - 설치
    ```
    pip install lxml
    ```


### 웹 API를 사용하여 XML 데이터 수신
- RSS (Really Simple Syndication)
    - Rich Site Summary 라고도 함.
    - 뉴스나 블로그와 같이 컨텐츠 업데이트가 자주 일어나는 웹사이트에서, 업데이트된 정보를 사용자에게 제공하기 위한 서비스
    - xml 데이터 형식

In [None]:
# Install lxml with the %pip magic (rather than !pip) so the package goes into
# the environment of the running kernel.
# NOTE(review): the version is not pinned; consider pinning it for reproducible runs.
%pip install lxml

Collecting lxml
  Downloading lxml-5.2.2-cp312-cp312-win_amd64.whl.metadata (3.5 kB)
Downloading lxml-5.2.2-cp312-cp312-win_amd64.whl (3.8 MB)
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.8 MB 653.6 kB/s eta 0:00:06
   - -------------------------------------- 0.1/3.8 MB 1.1 MB/s eta 0:00:04
   ----------------------------- ---------- 2.8/3.8 MB 20.0 MB/s eta 0:00:01
   ---------------------------------------- 3.8/3.8 MB 22.1 MB/s eta 0:00:00
Installing collected packages: lxml
Successfully installed lxml-5.2.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import requests
from lxml import etree
import xml.etree.ElementTree as ET

# RSS feed (XML) fetched for this demonstration.
url = 'https://fs.jtbc.co.kr/RSS/culture.xml'
# Always pass a timeout: without one, requests can wait forever on an
# unresponsive server and hang the kernel.
response = requests.get(url, timeout=10)

if response.status_code == 200:
    print(response.encoding)            # (ISO-8859-1) encoding requests derived from the HTTP headers
    print(response.apparent_encoding)   # (UTF-8-SIG) encoding guessed from the payload bytes

    # The header-derived encoding is only a fallback here, and decoding the
    # UTF-8 payload with it turns the Korean text into mojibake. Switch to
    # the detected encoding before touching response.text.
    response.encoding = response.apparent_encoding or 'utf-8'

    # Show only a short preview; dumping the whole feed twice floods the output.
    preview_len = 300
    print(type(response.content))       # <class 'bytes'>
    print(response.content[:preview_len])
    print(type(response.text))          # <class 'str'>
    print(response.text[:preview_len])

    # Parse from the raw bytes (not the decoded str) and let the XML parser
    # handle the document's own encoding.
    root = etree.fromstring(response.content)
else:
    print(response.status_code)
    # Do not leave the `root` of an earlier cell in scope: later cells would
    # silently work on the wrong tree instead of failing visibly.
    root = None

ISO-8859-1
UTF-8-SIG
<class 'bytes'>
b'\xef\xbb\xbf<?xml version="1.0" encoding="utf-8"?><rss version="2.0"><channel><title>JTBC News</title><link>https://fs.jtbc.co.kr/RSS/culture.xml</link><description>\xeb\xac\xb8\xed\x99\x94 RSS</description><language>ko</language><copyright>Copyright(C) JTBC All rights reserved.</copyright><category>\xeb\xac\xb8\xed\x99\x94</category><pubDate>2024\xeb\x85\x84 7\xec\x9b\x94 23\xec\x9d\xbc \xed\x99\x94\xec\x9a\x94\xec\x9d\xbc \xec\x98\xa4\xed\x9b\x84 3:15:00</pubDate><item><title>"\xeb\xaa\xbd\xea\xb3\xa8\xec\x9d\xb4 \xec\x9d\xb4\xeb\xaf\xb8 \xea\xb8\x88\xeb\xa9\x94\xeb\x8b\xac"\xe2\x80\xa6\xec\x98\xac\xeb\xa6\xbc\xed\x94\xbd \xec\x84\xa0\xec\x88\x98\xeb\x8b\xa8 \xec\x9d\x98\xec\x83\x81 \xeb\xb4\xa4\xeb\x8d\x94\xeb\x8b\x88 [\xec\x86\x8c\xec\x85\x9c\xed\x94\xbd]</title><link>https://news.jtbc.co.kr/article/article.aspx?news_id=NB12205819</link><description> 2024 \xed\x8c\x8c\xeb\xa6\xac \xec\x98\xac\xeb\xa6\xbc\xed\x94\xbd\xec\x9d\x84 \xec\x95\x9e\xe

In [None]:
# Details of the request object attached to the response
sent_request = response.request
print('response.request.method : ', sent_request.method)
print('response.request.url : ', sent_request.url)
print('response.request.headers : ', sent_request.headers)
print('response.request.body : ', sent_request.body)

response.request.method :  GET
response.request.url :  https://fs.jtbc.co.kr/RSS/culture.xml
response.request.headers :  {'User-Agent': 'python-requests/2.32.3', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
response.request.body :  None


In [None]:
# Details of the response
print('response.status_code : ', response.status_code)
print('response.headers : ', response.headers)
# Decode the body from the raw bytes with the detected encoding. Relying on
# the header-derived fallback encoding renders the Korean text as mojibake.
print(
    'response.text : ',
    response.content.decode(response.apparent_encoding or 'utf-8', errors='replace'),
)

response.status_code :  200
response.headers :  {'Date': 'Tue, 23 Jul 2024 06:19:53 GMT', 'Content-Type': 'text/xml', 'Content-Length': '10821', 'Connection': 'keep-alive', 'Last-Modified': 'Tue, 23 Jul 2024 06:15:00 GMT', 'ETag': '"669f4a64-2a45"', 'X-Cache-Status': 'BYPASS, BYPASS', 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Credentials': 'true', 'Server': 'nginx', 'Accept-Ranges': 'bytes, bytes', 'Keep-Alive': 'timeout=10'}
response.text :  ï»¿<?xml version="1.0" encoding="utf-8"?><rss version="2.0"><channel><title>JTBC News</title><link>https://fs.jtbc.co.kr/RSS/culture.xml</link><description>ë¬¸í RSS</description><language>ko</language><copyright>Copyright(C) JTBC All rights reserved.</copyright><category>ë¬¸í</category><pubDate>2024ë 7ì 23ì¼ íìì¼ ì¤í 3:15:00</pubDate><item><title>"ëª½ê³¨ì´ ì´ë¯¸ ê¸ë©ë¬"â¦ì¬ë¦¼í½ ì ìë¨ ìì ë´¤ëë [ììí½]</title><link>https://news.jtbc.co.kr/article/article.aspx?news_id=NB12205819</link><descrip

In [None]:
# Show the element's repr first, then the concrete class it is an instance of.
print(root)
root_type = type(root)
print('root type : ', root_type)   # <class 'lxml.etree._Element'>

<Element rss at 0x26af61f7340>
root type :  <class 'lxml.etree._Element'>


In [None]:
import pandas as pd

# Every <item> element found below the root becomes one table row.
items = root.findall('.//item')

# findtext() with a default returns '' when a child element is missing or
# empty, so an item without <title> or <link> no longer raises on `.text[:10]`.
# Only the first 10 characters are kept to keep the displayed table narrow.
lst = [
    [
        item.findtext('.//title', default='')[:10],
        item.findtext('.//link', default='')[:10],
    ]
    for item in items
]

print('lst : \r\n', lst)

# Build the frame once from the finished list of rows.
df = pd.DataFrame(lst, columns = ['제목', '링크'])
df

lst : 
 [['"몽골이 이미 금메', 'https://ne'], ['"하이브가 만들었다', 'https://ne'], ['"남고생들, 마치 ', 'https://ne'], ['중앙그룹 대학생 크', 'https://ne'], ['"던지지 말랬는데…', 'https://ne'], ['배우도 카메라도 없', 'https://ne'], ['야구보다 깜짝 "진', 'https://ne'], ["'서울국제도서전' ", 'https://ne'], ['충주맨 보고 있나?', 'https://ne'], ['"미성년자도 있는데', 'https://ne'], ['"어제 꿈에서 본 ', 'https://ne'], ['드디어 손흥민 만난', 'https://ne'], ['일본서 197년 만', 'https://ne'], ["BTS 진에 '기습", 'https://ne'], ['"아미, 안녕!" ', 'https://ne'], ["'표절' 반박한 아", 'https://ne'], ['조선 왕실 유물이 ', 'https://ne'], ["경복궁 땅 밑 '보", 'https://ne'], ['어른들도 "나 아직', 'https://ne'], ['서울퀴어문화축제 2', 'https://ne']]


Unnamed: 0,제목,링크
0,"""몽골이 이미 금메",https://ne
1,"""하이브가 만들었다",https://ne
2,"""남고생들, 마치",https://ne
3,중앙그룹 대학생 크,https://ne
4,"""던지지 말랬는데…",https://ne
5,배우도 카메라도 없,https://ne
6,"야구보다 깜짝 ""진",https://ne
7,'서울국제도서전',https://ne
8,충주맨 보고 있나?,https://ne
9,"""미성년자도 있는데",https://ne
