# House ETL

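This notebook implements an ETL job that ingests the `house.csv` file into the data lake: it reads the raw CSV, upper-cases the text columns, and writes the cleaned result to the output directory.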

### Import libraries

In [1]:
import os
import sys
from platform import python_version

import findspark
# Must run before the pyspark imports in the next cell: init() locates the
# Spark installation and adds its Python libraries to sys.path so that
# `import pyspark` resolves.
findspark.init()

In [2]:
import pyspark
import pyspark.sql.functions as f

from pyspark.sql           import SparkSession
from pyspark.sql.functions import col, explode, regexp_replace, udf
from pyspark.sql.types     import Row, ArrayType, IntegerType, LongType, StringType


### Extract: read the source data

In [3]:
# Start the Spark session that drives this ETL, or reuse one that is already running.
spark = SparkSession.builder.appName( 'house_house_payments' ).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/15 17:49:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Base directory of the local data lake. Set the HPAY_DATA_DIR environment
# variable to run this notebook on another machine; when it is unset the
# original location is used, so existing runs behave exactly as before.
data_dir = os.environ.get( 'HPAY_DATA_DIR', '/home/art/data/hpay' )

# Source CSV to ingest and target directory for the cleaned output.
in_path  = os.path.join( data_dir, 'in', 'house.csv' )
out_path = os.path.join( data_dir, 'out', 'house' )

In [5]:
# Read the source CSV: the first line is the header, columns are
# comma-separated, and Spark infers each column's type from the data.
df = (
    spark.read
    .option( 'header', True )
    .option( 'inferSchema', True )
    .option( 'delimiter', ',' )
    .csv( in_path )
)

In [6]:
# Preview the raw rows exactly as they were read from the CSV.
df.show()

+--------+-------+------+-------+
|house_id|user_id|street|ext_num|
+--------+-------+------+-------+
| 100pino|      1|  pino|    100|
| 101pino|      2|  pino|    101|
| 102pino|      3|  pino|    102|
| 200caob|      4| caoba|    200|
| 201caob|      5| caoba|    201|
| 202caob|      6| caoba|    202|
| 100abed|      7|abedul|    100|
| 101abed|      8|abedul|    101|
| 102abed|      9|abedul|    102|
+--------+-------+------+-------+



In [7]:
# Check the column types that schema inference assigned to the raw data.
df.printSchema()

root
 |-- house_id: string (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- street: string (nullable = true)
 |-- ext_num: integer (nullable = true)



### Transform

In [8]:
# Register the raw DataFrame as the temporary view 'house' so it can be
# queried with Spark SQL.
df.createOrReplaceTempView( 'house' )

In [9]:
# Transform step as Spark SQL: house_id and street are upper-cased, while
# user_id and ext_num pass through unchanged. Reads from the 'house' view.
query = '''
SELECT 
  upper( house_id ) as house_id,
  user_id,
  upper( street ) as street,
  ext_num

FROM house

'''

In [10]:
# Run the transform query. Note that `df` is rebound here: from this cell on
# it holds the transformed rows, not the raw rows read from the CSV.
df = spark.sql( query )
# Preview the transformed rows.
df.show()

+--------+-------+------+-------+
|house_id|user_id|street|ext_num|
+--------+-------+------+-------+
| 100PINO|      1|  PINO|    100|
| 101PINO|      2|  PINO|    101|
| 102PINO|      3|  PINO|    102|
| 200CAOB|      4| CAOBA|    200|
| 201CAOB|      5| CAOBA|    201|
| 202CAOB|      6| CAOBA|    202|
| 100ABED|      7|ABEDUL|    100|
| 101ABED|      8|ABEDUL|    101|
| 102ABED|      9|ABEDUL|    102|
+--------+-------+------+-------+



### Load data into Data Lake

In [11]:
# Persist the transformed rows as CSV files (with a header line) under out_path.
# Spark's default save mode raises an error when the target directory already
# exists, so a second run of this notebook would fail at this cell.
# 'overwrite' replaces the previous output, which keeps Restart & Run All
# repeatable for this full-snapshot ingest.
(df
 .write
 .mode( 'overwrite' )
 .option( 'header', True )
 .csv( out_path )
)

In [12]:
# Tell the reader where the cleaned output was written.
done_message = 'check your clean data in: {}'.format( out_path )
print( done_message )

check your clean data in: /home/art/data/hpay/out/house
