## Extract, Transform, Load Data with Python

### 1. CSV file format

In [55]:
import pandas as pd

# Extract: download the red-wine quality CSV from the UCI archive.
# The file separates its fields with semicolons, not commas.
wine_quality_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(
    filepath_or_buffer=wine_quality_url,
    sep=";",
)
# Show the first five rows as a quick sanity check.
df.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [38]:
# Persist the DataFrame (row index included) as a comma-separated file.
df.to_csv(path_or_buf='data.csv')

In [39]:
# Peek at the first five raw lines of the CSV file written above.
# The context manager closes the file even if reading or printing fails.
with open('data.csv') as f:
    for i in range(5):
        line = f.readline()
        # `line` keeps its trailing newline, so print() emits a blank line
        # after each row.
        print('line: ', i, line)

line:  0 ,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality

line:  1 0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5

line:  2 1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5

line:  3 2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5

line:  4 3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6



### 2. JSON format

In [40]:
# Serialize the DataFrame as JSON Lines text: one JSON record per row.
json_object = df.to_json(orient='records', lines=True)
# Preview only the first few records; printing the whole string would
# flood the cell output with one line per DataFrame row.
print('\n'.join(json_object.splitlines()[:5]))

{"fixed acidity":7.4,"volatile acidity":0.7,"citric acid":0.0,"residual sugar":1.9,"chlorides":0.076,"free sulfur dioxide":11.0,"total sulfur dioxide":34.0,"density":0.9978,"pH":3.51,"sulphates":0.56,"alcohol":9.4,"quality":5}
{"fixed acidity":7.8,"volatile acidity":0.88,"citric acid":0.0,"residual sugar":2.6,"chlorides":0.098,"free sulfur dioxide":25.0,"total sulfur dioxide":67.0,"density":0.9968,"pH":3.2,"sulphates":0.68,"alcohol":9.8,"quality":5}
{"fixed acidity":7.8,"volatile acidity":0.76,"citric acid":0.04,"residual sugar":2.3,"chlorides":0.092,"free sulfur dioxide":15.0,"total sulfur dioxide":54.0,"density":0.997,"pH":3.26,"sulphates":0.65,"alcohol":9.8,"quality":5}
{"fixed acidity":11.2,"volatile acidity":0.28,"citric acid":0.56,"residual sugar":1.9,"chlorides":0.075,"free sulfur dioxide":17.0,"total sulfur dioxide":60.0,"density":0.998,"pH":3.16,"sulphates":0.58,"alcohol":9.8,"quality":6}
{"fixed acidity":7.4,"volatile acidity":0.7,"citric acid":0.0,"residual sugar":1.9,"chlor

In [41]:
# Save the serialized JSON text to disk, replacing any existing file.
with open('data.json', 'w') as json_file:
    json_file.write(json_object)

In [46]:
# Load the file back; lines=True parses one JSON record per line.
pd.read_json(path_or_buf='data.json', lines=True)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


### 3. XML format

In [51]:
# Re-inspect the first five rows before exporting to XML.
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [56]:
# Demonstration: exporting fails while the column names still contain
# spaces. The error is caught and shown so that "Run All" can continue
# past this cell instead of halting here.
try:
    df.to_xml()
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Invalid tag name 'fixed acidity'

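- The `ValueError` with the message "Invalid tag name 'fixed acidity'" is raised because `to_xml` uses each DataFrame column name as an XML tag name, and XML tag names cannot contain spaces or certain other characters. The column names therefore have to be made valid (for example, by replacing spaces with underscores) before the DataFrame can be exported.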

In [62]:
# List the current column labels to see which ones contain spaces.
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [63]:
# Replace every space in the DataFrame's column names with an underscore
# so that each name can be used as an XML tag.
underscored_columns = df.columns.str.replace(' ', '_')
df.columns = underscored_columns
# Display the updated labels.
df.columns

Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [65]:
# Export the DataFrame to an XML file now that the column names are valid tags.
df.to_xml(path_or_buffer='data.xml')

In [67]:
def print_lines(n, file_name):
    """Print up to the first ``n`` lines of a text file.

    Each line is printed with its own trailing newline intact, so ``print``
    adds a second one and the output shows a blank line after every line.

    Args:
        n: Maximum number of lines to print.
        file_name: Path of the file to read.
    """
    # The context manager closes the file even if reading or printing raises.
    with open(file_name) as f:
        for _ in range(n):
            line = f.readline()
            if not line:
                # readline() returns '' at end of file; stop instead of
                # printing empty lines for a file shorter than n lines.
                break
            print(line)

# Show the first ten lines of the generated XML file.
print_lines(n=10, file_name='data.xml')


<?xml version='1.0' encoding='utf-8'?>

<data>

  <row>

    <index>0</index>

    <fixed_acidity>7.4</fixed_acidity>

    <volatile_acidity>0.7</volatile_acidity>

    <citric_acid>0.0</citric_acid>

    <residual_sugar>1.9</residual_sugar>

    <chlorides>0.076</chlorides>

    <free_sulfur_dioxide>11.0</free_sulfur_dioxide>



### 4. APIs

In [68]:
import requests
import pandas as pd

# Total-population indicator (SP.POP.TOTL) for the countries listed in the URL.
url = 'http://api.worldbank.org/v2/countries/br;cn;us;de/indicators/SP.POP.TOTL?format=json&per_page=1000'
# A timeout keeps the cell from hanging indefinitely if the API does not respond.
r = requests.get(url, timeout=30)
# Surface HTTP errors here rather than as a confusing failure while parsing.
r.raise_for_status()
# The payload is a list whose first item is paging metadata and whose second
# item holds the records; build the DataFrame from the records.
pd.DataFrame(r.json()[1])


Unnamed: 0,indicator,country,countryiso3code,date,value,unit,obs_status,decimal
0,"{'id': 'SP.POP.TOTL', 'value': 'Population, to...","{'id': 'BR', 'value': 'Brazil'}",BRA,2022,215313498,,,0
1,"{'id': 'SP.POP.TOTL', 'value': 'Population, to...","{'id': 'BR', 'value': 'Brazil'}",BRA,2021,214326223,,,0
2,"{'id': 'SP.POP.TOTL', 'value': 'Population, to...","{'id': 'BR', 'value': 'Brazil'}",BRA,2020,213196304,,,0
3,"{'id': 'SP.POP.TOTL', 'value': 'Population, to...","{'id': 'BR', 'value': 'Brazil'}",BRA,2019,211782878,,,0
4,"{'id': 'SP.POP.TOTL', 'value': 'Population, to...","{'id': 'BR', 'value': 'Brazil'}",BRA,2018,210166592,,,0
...,...,...,...,...,...,...,...,...
247,"{'id': 'SP.POP.TOTL', 'value': 'Population, to...","{'id': 'US', 'value': 'United States'}",USA,1964,191889000,,,0
248,"{'id': 'SP.POP.TOTL', 'value': 'Population, to...","{'id': 'US', 'value': 'United States'}",USA,1963,189242000,,,0
249,"{'id': 'SP.POP.TOTL', 'value': 'Population, to...","{'id': 'US', 'value': 'United States'}",USA,1962,186538000,,,0
250,"{'id': 'SP.POP.TOTL', 'value': 'Population, to...","{'id': 'US', 'value': 'United States'}",USA,1961,183691000,,,0


In [73]:
# The first item of the response payload is the paging metadata.
payload = r.json()
payload[0]

{'page': 1,
 'pages': 1,
 'per_page': 1000,
 'total': 252,
 'sourceid': '2',
 'lastupdated': '2024-02-21'}