<a href="https://colab.research.google.com/github/binhvd/Data-Management-2/blob/main/PETL/3-Basic-Transformation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
"""
@author: ashish
"""

# petl is a framework with the help of which we can create ETL job
# Documentation Link: https://petl.readthedocs.io/en/stable/intro.html#

# Command to install the petl package : pip install petl
# Demo job to demonstrate an ETL workflow with the help of petl
!pip install petl

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting petl
  Downloading petl-1.7.11.tar.gz (408 kB)
[K     |████████████████████████████████| 408 kB 4.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: petl
  Building wheel for petl (PEP 517) ... [?25l[?25hdone
  Created wheel for petl: filename=petl-1.7.11-py3-none-any.whl size=226448 sha256=d2ed21e8acd74eb0af78623df25ac467ebc4f3a71dc78fb48a8508a9500314a3
  Stored in directory: /root/.cache/pip/wheels/bc/0f/ae/4f496e580063d9929bd46b9f4d97e8884ece77dc80cd0ccb79
Successfully built petl
Installing collected packages: petl
Successfully installed petl-1.7.11


# Transforming Tables

In [7]:
import petl as etl

table1 = [['foo', 'bar'],
           ['a', 1],
           ['b', 2],
           ['c', 5],
           ['d', 7],
           ['e', 42],
           ['f', 3],
           ['h', 90],
           ['k', 12],
           ['l', 77],
           ['q', 2]]

## Select the number of rows

In [11]:
# Select first n rows
table2 = etl.head(table1, 5) 
print(table2)

+-----+-----+
| foo | bar |
+=====+=====+
| a   |   1 |
+-----+-----+
| b   |   2 |
+-----+-----+
| c   |   5 |
+-----+-----+
| d   |   7 |
+-----+-----+
| e   |  42 |
+-----+-----+



In [12]:
# Select last n rows
table2 = etl.tail(table1, 5) 
print(table2)

+-----+-----+
| foo | bar |
+=====+=====+
| f   |   3 |
+-----+-----+
| h   |  90 |
+-----+-----+
| k   |  12 |
+-----+-----+
| l   |  77 |
+-----+-----+
| q   |   2 |
+-----+-----+



## Slice the Rows

In [13]:
# Slice first 2 rows from table
table3 = etl.rowslice(table1, 2) 
print(table3)

+-----+-----+
| foo | bar |
+=====+=====+
| a   |   1 |
+-----+-----+
| b   |   2 |
+-----+-----+



In [14]:
# Slice the rows between 1 and 4 records
table3 = etl.rowslice(table1, 1, 4) 
print(table3)

+-----+-----+
| foo | bar |
+=====+=====+
| b   |   2 |
+-----+-----+
| c   |   5 |
+-----+-----+
| d   |   7 |
+-----+-----+



In [15]:
# Slice the every 2nd row from selected range
table3 = etl.rowslice(table1, 0, 5, 2) 
print(table3)

+-----+-----+
| foo | bar |
+=====+=====+
| a   |   1 |
+-----+-----+
| c   |   5 |
+-----+-----+
| e   |  42 |
+-----+-----+



## Slice the columns or Choose the columns or Re-Order the Columns

In [16]:
data = [['foo', 'bar', 'baz'],
           ['A', 1, 2.7],
           ['B', 2, 3.4],
           ['B', 3, 7.8],
           ['D', 42, 9.0],
           ['E', 12]]

table4 = etl.cut(data, 'foo' , 'baz')
print(table4)

+-----+------+
| foo | baz  |
| A   |  2.7 |
+-----+------+
| B   |  3.4 |
+-----+------+
| B   |  7.8 |
+-----+------+
| D   |  9.0 |
+-----+------+
| E   | None |
+-----+------+



In [17]:
# Slice the columns by index
table4 = etl.cut(data, 0, 2)
print(table4)

+-----+------+
| foo | baz  |
| A   |  2.7 |
+-----+------+
| B   |  3.4 |
+-----+------+
| B   |  7.8 |
+-----+------+
| D   |  9.0 |
+-----+------+
| E   | None |
+-----+------+



In [18]:
# Slice the columns by names and indices
table4 = etl.cut(data, 'bar', 0) 
print(table4)

+-----+-----+
| bar | foo |
+=====+=====+
|   1 | A   |
+-----+-----+
|   2 | B   |
+-----+-----+
|   3 | B   |
+-----+-----+
|  42 | D   |
+-----+-----+
|  12 | E   |
+-----+-----+



In [19]:
# Slice the columns based on range of fields
table4 = etl.cut(data, *range(0,2)) 
print(table4)

+-----+-----+
| foo | bar |
+=====+=====+
| A   |   1 |
+-----+-----+
| B   |   2 |
+-----+-----+
| B   |   3 |
+-----+-----+
| D   |  42 |
+-----+-----+
| E   |  12 |
+-----+-----+



## Remove Columns

In [20]:
table4 = etl.cutout(data, 'bar')
print(table4)

+-----+------+
| foo | baz  |
| A   |  2.7 |
+-----+------+
| B   |  3.4 |
+-----+------+
| B   |  7.8 |
+-----+------+
| D   |  9.0 |
+-----+------+
| E   | None |
+-----+------+



In [21]:
# Move column to a new position
table4 = etl.movefield(data, 'baz', 0)
print(table4)

+------+-----+-----+
| baz  | foo | bar |
|  2.7 | A   |   1 |
+------+-----+-----+
|  3.4 | B   |   2 |
+------+-----+-----+
|  7.8 | B   |   3 |
+------+-----+-----+
|  9.0 | D   |  42 |
+------+-----+-----+
| None | E   |  12 |
+------+-----+-----+



## Concatenate two tables into one

In [22]:
table1 = [['foo', 'bar'],
           [1, 'A'],
           [2, 'B']]
table2 = [['bar', 'baz'],
           ['C', True],
           ['D', False]]

table3 = etl.cat(table1, table2)
print(table3)

+------+-----+-------+
| foo  | bar | baz   |
|    1 | A   | None  |
+------+-----+-------+
|    2 | B   | None  |
+------+-----+-------+
| None | C   | True  |
+------+-----+-------+
| None | D   | False |
+------+-----+-------+



In [23]:
table3 = etl.cat(table1, table2, header=['A','foo','B','bar', 'C'])
print(table3)

+------+------+------+-----+------+
| A    | foo  | B    | bar | C    |
| None |    1 | None | A   | None |
+------+------+------+-----+------+
| None |    2 | None | B   | None |
+------+------+------+-----+------+
| None | None | None | C   | None |
+------+------+------+-----+------+
| None | None | None | D   | None |
+------+------+------+-----+------+



In [24]:
# Concatenate tables without matching Headers
table4 = etl.stack(table1, table2)
print(table4)

+-----+-------+
| foo | bar   |
|   1 | A     |
+-----+-------+
|   2 | B     |
+-----+-------+
| C   | True  |
+-----+-------+
| D   | False |
+-----+-------+



## Skip Comments Rows

In [25]:
table1 = [['##aaa', 'bbb', 'ccc'],
           ['##mmm',],
           ['#foo', 'bar'],
           ['##nnn', 1],
           ['a', 1],
           ['b', 2]]

# Skip row where the first value is a string and starts with prefix
table2 = etl.skipcomments(table1, '##')
print(table2)

+------+-----+
| #foo | bar |
| a    |   1 |
+------+-----+
| b    |   2 |
+------+-----+



## Add Field with a fixed or Calculated Value

In [26]:
table1 = [['foo', 'bar'],
          ['M', 12],
          ['F', 34],
          ['-', 56]]

In [27]:
# Add a Column with fixed value
table2 = etl.addfield(table1, 'baz', 42)
print(table2)

+-----+-----+-----+
| foo | bar | baz |
+=====+=====+=====+
| M   |  12 |  42 |
+-----+-----+-----+
| F   |  34 |  42 |
+-----+-----+-----+
| -   |  56 |  42 |
+-----+-----+-----+



In [28]:
# Add a column with calculating value
table2 = etl.addfield(table1, 'baz', lambda rec: rec['bar'] * 2)
print(table2)

+-----+-----+-----+
| foo | bar | baz |
+=====+=====+=====+
| M   |  12 |  24 |
+-----+-----+-----+
| F   |  34 |  68 |
+-----+-----+-----+
| -   |  56 | 112 |
+-----+-----+-----+



## Add a column of data to a table 
(Difference is in addfield it was either constant or a calculated field but in addcolumn it can have vary data values)

In [29]:
table1 = [['foo', 'bar'],
          ['A', 1],
          ['B', 2]]
col = [True, False]

table2 = etl.addcolumn(table1, 'baz', col)
print(table2)

+-----+-----+-------+
| foo | bar | baz   |
| A   |   1 | True  |
+-----+-----+-------+
| B   |   2 | False |
+-----+-----+-------+



## Imputations based on previous or next values
Fill the data in the rows based on previous, current and next rows values

In [30]:
table1 = [['foo','bar'],
          ['A', 1],
          ['B', 4],
          ['C', 5],
          ['D', 9]]

def upstream(prv, cur, nxt):
    if prv is None:
        return None
    else:
        return cur.bar - prv.bar

def downstream(prv, cur, nxt):
    if nxt is None:
        return None
    else:
        return nxt.bar - cur.bar

table2 = etl.addfieldusingcontext(table1, 'baz', upstream) # Subtract a value from previous field and the current field
table3 = etl.addfieldusingcontext(table2, 'quux', downstream) # Subtract a value from next field and current field

print(table2) # Query parameter is the function operating on the current, previous and next rows and returning the value

+-----+-----+------+
| foo | bar | baz  |
| A   |   1 | None |
+-----+-----+------+
| B   |   4 |    3 |
+-----+-----+------+
| C   |   5 |    1 |
+-----+-----+------+
| D   |   9 |    4 |
+-----+-----+------+



## Join two or more tables by row order

In [31]:
table1 = [['foo', 'bar'],
          ['A', 9],
          ['C', 2],
          ['F', 1]]

table2 = [['foo', 'baz'],
           ['B', 3],
           ['D', 10]]

table3 = etl.annex(table1, table2)
print(table3)

+-----+-----+------+------+
| foo | bar | foo  | baz  |
| A   |   9 | B    |    3 |
+-----+-----+------+------+
| C   |   2 | D    |   10 |
+-----+-----+------+------+
| F   |   1 | None | None |
+-----+-----+------+------+

