# ETL - user dataset

This notebook performs the same ETL as `user_01` (load the raw user CSV, clean it, write the result), but implements the transformation step with Spark SQL.

###  import libraries

In [1]:
# findspark.init() runs in its own cell before the pyspark imports in the next cell.
# NOTE(review): presumably it locates the local Spark installation and makes
# pyspark importable — confirm against the findspark docs.
import findspark
findspark.init()

In [2]:
import os
import sys
from platform                import python_version

from IPython.display         import display, HTML

import pyspark
import pyspark.sql.functions as f

from   pyspark.sql           import SparkSession
from   pyspark.sql.functions import col, explode, regexp_replace, udf
from   pyspark.sql.types     import Row, IntegerType, LongType ,StringType, ArrayType





### version

In [3]:
# Record the Python version of the running kernel for reproducibility.
python_version()

'3.10.11'

In [4]:
# Path of the interpreter backing this kernel (shows which environment is in use).
sys.executable

'/home/art/git/data_engineer/venv/bin/python'

In [5]:
# Installed PySpark version.
pyspark.__version__

'3.4.0'

In [6]:
# Shell command: list the versions of the installed Jupyter packages.
!jupyter --version

Selected Jupyter core packages...
IPython          : 8.14.0
ipykernel        : 6.23.1
ipywidgets       : 8.0.6
jupyter_client   : 8.2.0
jupyter_core     : 5.3.0
jupyter_server   : 2.6.0
jupyterlab       : not installed
nbclient         : 0.8.0
nbconvert        : 7.4.0
nbformat         : 5.9.0
notebook         : 6.5.4
qtconsole        : 5.4.3
traitlets        : 5.9.0


### load data

In [7]:
# Input/output locations for this ETL run.
# The base directory defaults to the original local layout, but it can be
# overridden with the HPAY_DATA_DIR environment variable so the notebook runs
# on other machines without editing this cell.
data_dir = os.environ.get( 'HPAY_DATA_DIR', '/home/art/data/hpay' ).rstrip( '/' )

in_path  = f'{data_dir}/in/user.csv'   # raw user CSV read by Spark
out_path = f'{data_dir}/out/user'      # folder that receives the cleaned CSV output

In [8]:
# Obtain the Spark session for this notebook under the app name below
# (getOrCreate hands back an existing session when one is already active).
session_builder = SparkSession.builder.appName( 'user02_house_pay' )
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/06 10:48:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/06 10:48:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [9]:
# Read the raw user CSV: the first line is the header, column types are
# inferred from the data, and fields are comma-separated.
csv_options = dict( header = True, inferSchema = True, delimiter = ',' )
df = spark.read.options( **csv_options ).csv( in_path )

In [10]:
# Inspect the column types that inferSchema assigned to the raw file.
df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- cel: string (nullable = true)
 |-- email: string (nullable = true)



In [11]:
# Preview the raw rows before cleaning (mixed casing, spaces in `cel`,
# placeholder rows without a user_id).
df.show()

+-------+----------+---------+------+------------+------------------+
|user_id|first_name|last_name|   sex|         cel|             email|
+-------+----------+---------+------+------------+------------------+
|      1|    mickey|    mouse|  male|11 1111 1111| mickey@disney.com|
|      2|    Minnie|    Mouse|female|22 2222 2222| Minnie@disney.com|
|      3|    Donald|     Duck|  male|33 3333 3333| Donald@disney.com|
|      4|     daisY|     duck|female|44 4444 4444|  daisY@disney.com|
|      5|   aladdin|      ali|  male|55 5555 5555|aladdin@disney.com|
|      6|    jazmin|   bagdad|female|66 6666 6666| jazmin@disney.com|
|      7|     mulan|   gun fu|female|77 7777 7777|  mulan@disney.com|
|      8|      jack|  sparrow|  male|88 8888 8888|   jack@disney.com|
|      9|    merida|    brave|female|99 9999 9999| merida@disney.com|
|   null|      xxxx|     null|  null|        null|              null|
|   null|      yyyy|     null|  null|        null|              null|
|   null|      zzzz|

### Transform. Clean

In [14]:
# Expose the raw frame to Spark SQL under the name `df_user`,
# which is the table the cleaning query selects from.
df.createOrReplaceTempView( 'df_user' )

In [15]:
# Cleaning query run against the `df_user` temp view:
#   - first_name / last_name are upper-cased, email is lower-cased
#   - sex is encoded as an integer: 'female' -> 0, 'male' -> 1; the CASE has no
#     ELSE branch, so any other value (including different casing) becomes NULL
#   - spaces are stripped from cel
#   - rows whose user_id is NULL are dropped
query = '''
SELECT 
  user_id, 
  upper( first_name ) as first_name,
  upper( last_name  ) as last_name ,
  
  CASE
    WHEN sex = 'female' THEN 0
    WHEN sex = 'male'   THEN 1
  END 
  as sex,
  
  replace( cel, ' ', '' ) as cel,
  lower( email ) as email
  
FROM  df_user
WHERE user_id IS NOT NULL
'''

In [16]:
# Run the cleaning query. Note that `df` is rebound to the cleaned result here,
# so from this cell on the raw frame is only reachable through the `df_user` view.
df = spark.sql( query )
df.show()

+-------+----------+---------+---+----------+------------------+
|user_id|first_name|last_name|sex|       cel|             email|
+-------+----------+---------+---+----------+------------------+
|      1|    MICKEY|    MOUSE|  1|1111111111| mickey@disney.com|
|      2|    MINNIE|    MOUSE|  0|2222222222| minnie@disney.com|
|      3|    DONALD|     DUCK|  1|3333333333| donald@disney.com|
|      4|     DAISY|     DUCK|  0|4444444444|  daisy@disney.com|
|      5|   ALADDIN|      ALI|  1|5555555555|aladdin@disney.com|
|      6|    JAZMIN|   BAGDAD|  0|6666666666| jazmin@disney.com|
|      7|     MULAN|   GUN FU|  0|7777777777|  mulan@disney.com|
|      8|      JACK|  SPARROW|  1|8888888888|   jack@disney.com|
|      9|    MERIDA|    BRAVE|  0|9999999999| merida@disney.com|
+-------+----------+---------+---+----------+------------------+



In [17]:
# Check the schema after cleaning: `sex` should now be an integer column.
df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cel: string (nullable = true)
 |-- email: string (nullable = true)



### Load clean data to DataLake

In [18]:
# Persist the cleaned frame as CSV (with a header row) under out_path.
# mode('overwrite') replaces any output left by a previous run: without an
# explicit mode the writer raises an error when the target folder already
# exists, so this cell failed on every re-run of the notebook.
(df
 .write
 .mode( 'overwrite' )
 .option( 'header' , True )
 .csv( out_path )
)


In [19]:
# Tell the reader which folder now holds the cleaned output.
print( f'check your file in the next folder: \n {out_path}' )

check your file in the next folder: 
 /home/art/data/hpay/out/user
