<a href="https://colab.research.google.com/github/binhvd/Data-Management-2/blob/main/PELT/2-Extract-Load.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extract (Read) from different sources
**"from..."** functions extract a table from a file-like source or database

**"to..."** functions load data from a table into a file-like source or database

In [75]:
"""
@author: ashish
"""

# petl is a framework with the help of which we can create ETL job
# Documentation Link: https://petl.readthedocs.io/en/stable/intro.html#

# Command to install the petl package : pip install petl
# Demo job to demonstrate an ETL workflow with the help of petl
!pip install petl
!pip install pymysql

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymysql
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
     |████████████████████████████████| 43 kB 1.3 MB/s 
Installing collected packages: pymysql
Successfully installed pymysql-1.0.2


In [51]:
import petl as etl

# Build a table from a sequence of columns: every inner list is one column.
# No field names are supplied, so the displayed header is f0, f1.
cols = [
    [0, 1, 2],
    ['a', 'b', 'c'],
]

table = etl.fromcolumns(cols)
etl.look(table)

+----+-----+
| f0 | f1  |
+====+=====+
|  0 | 'a' |
+----+-----+
|  1 | 'b' |
+----+-----+
|  2 | 'c' |
+----+-----+

# Delimited Files

In [52]:
import csv

# Setup a csv file to demonstrate with
table1 = [['foo','bar'],
          ['a', 1],
          ['b', 2],
          ['c', 2]]

# newline='' lets the csv module control the line endings itself. Without it,
# platforms that translate '\n' on write (e.g. Windows) produce '\r\r\n' and
# the file shows a blank row after every record.
with open('example1.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(table1)

In [53]:
# Extract the table stored in the CSV file.
# IMPORTANT: every value read from CSV is a string by default; numeric
# columns have to be converted explicitly with convert() when needed.

table2 = etl.fromcsv('example1.csv')
etl.look(table2)

+-----+-----+
| foo | bar |
+=====+=====+
| 'a' | '1' |
+-----+-----+
| 'b' | '2' |
+-----+-----+
| 'c' | '2' |
+-----+-----+

In [54]:
# Load values to CSV File
# IMPORTANT: if a file already exists at the given location, it will be overwritten
etl.tocsv(table1, 'example2.csv')

# Result of above steps.
# The `with` block closes the file handle as soon as the content is read,
# instead of leaving it open until garbage collection.
with open('example2.csv') as f:
    print(f.read())

foo,bar
a,1
b,2
c,2



# Pickle Files

In [55]:
import pickle

# Header row first, then the data rows; each row is pickled as its own
# object, one after another, into the same file.
pickle_rows = (
    ['foo', 'bar'],
    ['a', 1],
    ['b', 2],
    ['c', 3],
)

with open('example.p', 'wb') as pickle_file:
    for row in pickle_rows:
        pickle.dump(row, pickle_file)

In [56]:
# Read the pickled rows back as a table with frompickle()
table2 = etl.frompickle('example.p')
etl.look(table2)

+-----+-----+
| foo | bar |
+=====+=====+
| 'a' |   1 |
+-----+-----+
| 'b' |   2 |
+-----+-----+
| 'c' |   3 |
+-----+-----+

# Text Files

In [57]:
# Sample text file: three comma-separated lines
text = 'a,1\nb,21\nc,2\n'

with open('example.txt', mode='w') as text_file:
    text_file.write(text)

In [58]:
# Read the text file as a table; the display shows one 'lines' column
# holding each line of the file.
table1 = etl.fromtext('example.txt')
etl.look(table1)

+--------+
| lines  |
+========+
| 'a,1'  |
+--------+
| 'b,21' |
+--------+
| 'c,2'  |
+--------+

In [59]:
# Split the contents in two columns
# The pattern is a raw string so that \w and \d reach the regex engine
# unchanged instead of being parsed as (invalid) Python string escapes,
# which triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
table2 = table1.capture('lines', r'(\w+),(\d+)', ['foo','bar'])
table2.look()

+-----+------+
| foo | bar  |
+=====+======+
| 'a' | '1'  |
+-----+------+
| 'b' | '21' |
+-----+------+
| 'c' | '2'  |
+-----+------+

Load into a text file

In [60]:
table1 = [['foo','bar'],
          ['a', 1],
          ['b', 2],
          ['c', 3]]

# Text written once, before the first row
prologue = '''{| class="wikitable"
|-
! foo
! bar
'''

# IMPORTANT : Template will be used to format each row;
# {foo} and {bar} are replaced by that row's field values
template = '''|-
| {foo}
| {bar}
'''

# Text written once, after the last row
epilogue = '|}'

etl.totext(table1, 'example.txt', template=template,
           prologue=prologue, epilogue=epilogue)

# Result.
# The `with` block closes the file handle as soon as the content is read,
# instead of leaving it open until garbage collection.
with open('example.txt') as f:
    print(f.read())

{| class="wikitable"
|-
! foo
! bar
|-
| a
| 1
|-
| b
| 2
|-
| c
| 3
|}


# XML Files

In [66]:
# Sample XML document: every <tr> is a row and every <td> holds one value
# as element text. The first <tr> carries the field names.
d = '''<table>
     <tr>
         <td>foo</td><td>bar</td>
     </tr>
     <tr>
         <td>a</td><td>1</td>
     </tr>
     <tr>
         <td>b</td><td>2</td>
     </tr>
     <tr>
         <td>c</td><td>2</td>
     </tr>
 </table>'''

with open('example1.xml', mode='w') as xml_file:
    xml_file.write(d)

# Extract the table from the XML file: 'tr' selects the rows, 'td' the values
table1 = etl.fromxml('example1.xml', 'tr', 'td')
etl.look(table1)

+-----+-----+
| foo | bar |
+=====+=====+
| 'a' | '1' |
+-----+-----+
| 'b' | '2' |
+-----+-----+
| 'c' | '2' |
+-----+-----+

In [67]:
# Variant: the values live in the `v` attribute of each <td>
# rather than in the element text.
d = '''<table>
     <tr>
         <td v='foo'/><td v='bar'/>
     </tr>
     <tr>
         <td v='a'/><td v='1'/>
     </tr>
     <tr>
         <td v='b'/><td v='2'/>
     </tr>
     <tr>
         <td v='c'/><td v='2'/>
     </tr>
 </table>'''

with open('example2.xml', mode='w') as xml_file:
    xml_file.write(d)

# The extra fourth argument names the attribute to read the values from
table2 = etl.fromxml('example2.xml', 'tr', 'td', 'v')
etl.look(table2)

+-----+-----+
| foo | bar |
+=====+=====+
| 'a' | '1' |
+-----+-----+
| 'b' | '2' |
+-----+-----+
| 'c' | '2' |
+-----+-----+

In [70]:
# Variant: a dict maps each field name to the element path holding its value
# (optionally paired with an attribute name).
d = '''<table>
     <row>
         <foo>a</foo><baz><bar v='1'/><bar v='3'/></baz>
     </row>
     <row>
         <foo>b</foo><baz><bar v='2'/></baz>
     </row>
     <row>
         <foo>c</foo><baz><bar v='2'/></baz>
     </row>
 </table>'''

with open('example3.xml', mode='w') as xml_file:
    xml_file.write(d)

# 'foo' is read from the <foo> element text, 'bar' from the `v` attribute of
# every baz/bar element. The first row has two such elements and is displayed
# as a tuple.
field_paths = {'foo': 'foo', 'bar': ('baz/bar', 'v')}
table3 = etl.fromxml('example3.xml', 'row', field_paths)
etl.look(table3)

+------------+-----+
| bar        | foo |
+============+=====+
| ('1', '3') | 'a' |
+------------+-----+
| '2'        | 'b' |
+------------+-----+
| '2'        | 'c' |
+------------+-----+

# JSON Files

In [71]:
# Sample JSON document: an array of objects, one object per row
data = '''
 [{"foo": "a", "bar": 1},
 {"foo": "b", "bar": 2},
 {"foo": "c", "bar": 2}]
 '''

with open('example.json', mode='w') as json_file:
    json_file.write(data)

# Read the JSON file as a table; `header` lists the fields to show, in order
table1 = etl.fromjson('example.json', header=['foo', 'bar'])
etl.look(table1)

+-----+-----+
| foo | bar |
+=====+=====+
| 'a' |   1 |
+-----+-----+
| 'b' |   2 |
+-----+-----+
| 'c' |   2 |
+-----+-----+

Extraction from a sequence of Python dicts (JSON-like records)

In [72]:
# Read from the sequence of Python dict as a table
import petl as etl

# One dict per row, keyed by field name
dicts = [
    {"foo": "a", "bar": 1},
    {"foo": "b", "bar": 2},
    {"foo": "c", "bar": 2},
]

# `header` lists the fields to show, in order
table1 = etl.fromdicts(dicts, header=['foo','bar'])
etl.look(table1)

+-----+-----+
| foo | bar |
+=====+=====+
| 'a' |   1 |
+-----+-----+
| 'b' |   2 |
+-----+-----+
| 'c' |   2 |
+-----+-----+

Loading into a JSON File

In [73]:
table1 = [['foo', 'bar'],
           ['a', 1],
           ['b', 2],
           ['c', 3]]

# Write the table as a JSON array of objects; with sort_keys=True the saved
# objects list "bar" before "foo".
etl.tojson(table1, 'example.json', sort_keys=True)

# Output of saved file.
# The `with` block closes the file handle as soon as the content is read,
# instead of leaving it open until garbage collection.
with open('example.json') as f:
    print(f.read())

[{"bar": 1, "foo": "a"}, {"bar": 2, "foo": "b"}, {"bar": 3, "foo": "c"}]


# Databases

In [78]:
import pymysql

# Extraction from a Database
# NOTE: the query reads the `foobar` table, which is created by the
# "Loading into a Database" cell further down - on a fresh database run that
# cell first.
import os
from getpass import getpass

# Connection settings are read from the environment so that no secret is
# stored in the notebook. Host, user and database fall back to the demo
# server; the password has no default and is prompted for when
# PETL_DB_PASSWORD is not set.
connection = pymysql.connect(
    host=os.environ.get('PETL_DB_HOST', 'sql11.freesqldatabase.com'),
    user=os.environ.get('PETL_DB_USER', 'sql11519659'),
    password=os.environ.get('PETL_DB_PASSWORD') or getpass('Database password: '),
    database=os.environ.get('PETL_DB_NAME', 'sql11519659'),
)

# NOTE(review): the connection is deliberately not closed in this cell -
# presumably the query only runs when the table is displayed; confirm before
# adding a close() here.
table = etl.fromdb(connection, 'SELECT * FROM foobar')
table.look()

+-----+-----+
| foo | bar |
+=====+=====+
| 'a' |   1 |
+-----+-----+
| 'b' |   2 |
+-----+-----+
| 'c' |   3 |
+-----+-----+

In [77]:
# Loading into a Database
import os
from getpass import getpass

table = [['foo','bar'],
         ['a', 1],
         ['b', 2],
         ['c', 3]]

# Connection settings are read from the environment so that no secret is
# stored in the notebook. Host, user and database fall back to the demo
# server; the password has no default and is prompted for when
# PETL_DB_PASSWORD is not set.
connection = pymysql.connect(
    host=os.environ.get('PETL_DB_HOST', 'sql11.freesqldatabase.com'),
    user=os.environ.get('PETL_DB_USER', 'sql11519659'),
    password=os.environ.get('PETL_DB_PASSWORD') or getpass('Database password: '),
    database=os.environ.get('PETL_DB_NAME', 'sql11519659'),
)
try:
    # Switch the session to ANSI_QUOTES before loading
    # (presumably so MySQL accepts the double-quoted identifiers petl emits - see the petl db docs).
    with connection.cursor() as cursor:
        cursor.execute('SET SQL_MODE=ANSI_QUOTES')
    # create=True asks petl to create the `foobar` table as part of the load
    etl.todb(table, connection, 'foobar', create=True)
finally:
    # Release the connection even if the load fails
    connection.close()

# Excel File

In [82]:
table = [
    ['foo', 'bar'],
    ['a', 1],
    ['b', 2],
    ['c', 3],
]

# Write the same table to both Excel formats, on a sheet named 'Example'
etl.toxls(table, 'example.xls', 'Example')
etl.toxlsx(table, 'example.xlsx', 'Example')

# Read the 'Example' sheet of the .xlsx workbook back as a table
table1 = etl.fromxlsx('example.xlsx', 'Example')
etl.look(table1)

+-----+-----+
| foo | bar |
+=====+=====+
| 'a' |   1 |
+-----+-----+
| 'b' |   2 |
+-----+-----+
| 'c' |   3 |
+-----+-----+

# Numpy Array

In [83]:
import numpy as np

# Structured array with three unnamed fields: text, 32-bit int, 32-bit float
a = np.array(
    [
        ('apples', 1, 2.5),
        ('oranges', 3, 4.4),
        ('pears', 7, 0.1),
    ],
    dtype='U8, i4, f4',
)

# Expose the array as a table; the header shows the field names f0, f1, f2
table = etl.fromarray(a)
etl.look(table)

+-----------+----+-----+
| f0        | f1 | f2  |
+===========+====+=====+
| 'apples'  | 1  | 2.5 |
+-----------+----+-----+
| 'oranges' | 3  | 4.4 |
+-----------+----+-----+
| 'pears'   | 7  | 0.1 |
+-----------+----+-----+

In [85]:
# Convert the table back into a numpy array
b = etl.toarray(table)
b

array([('apples', 1, 2.5), ('oranges', 3, 4.4), ('pears', 7, 0.1)],
      dtype=(numpy.record, [('f0', '<U7'), ('f1', '<i4'), ('f2', '<f4')]))

# Dataframe (Pandas)

In [87]:
# Convert the table into a pandas DataFrame
df = etl.todataframe(table)
df

Unnamed: 0,f0,f1,f2
0,apples,1,2.5
1,oranges,3,4.4
2,pears,7,0.1


In [88]:
# Wrap the DataFrame as a table again
table2 = etl.fromdataframe(df)
etl.look(table2)

+-----------+----+---------------------+
| f0        | f1 | f2                  |
+===========+====+=====================+
| 'apples'  |  1 |                 2.5 |
+-----------+----+---------------------+
| 'oranges' |  3 |   4.400000095367432 |
+-----------+----+---------------------+
| 'pears'   |  7 | 0.10000000149011612 |
+-----------+----+---------------------+