# Curso Big Data #3 - SparkSession: crear y leer DataFrames en PySpark

In [4]:
# Import the libraries.

import findspark
# Locate the Spark installation so that `pyspark` can be imported in the following cells.
findspark.init()

In [5]:
from pyspark.sql import SparkSession

1. Crear la SparkSession

In [6]:
# Create the notebook's SparkSession, or reuse one that is already active.
spark = (
    SparkSession.builder
    .appName('firstSession')
    .config('spark.master', 'local[4]')            # local mode with 4 worker threads
    .config('spark.executor.memory', '1g')
    .config("spark.sql.shuffle.partitions", 1)     # a single partition after shuffles
    .config('spark.driver.memory', '1g')
    .getOrCreate()
)

In [7]:
# Display the session object to confirm it was created.
spark

In [8]:
# Read back the shuffle-partition setting; the value comes back as a string.
spark.conf.get('spark.sql.shuffle.partitions')

'1'

2. Crear tabla

## 2.1 A partir de una lista

In [9]:
# Column names and sample rows used to build the first DataFrame.
columnas = ["id", "nombre", "l"]
lista = [
    (1, "Errodringer", "a"),
    (2, "Paco", "b"),
    (3, "Hola", "c"),
    (4, "Adios", "d"),
]
lista

[(1, 'Errodringer', 'a'),
 (2, 'Paco', 'b'),
 (3, 'Hola', 'c'),
 (4, 'Adios', 'd')]

In [10]:
# Build a DataFrame from the list of tuples; only column names are given, so Spark infers the types.
df_1 = spark.createDataFrame(lista, schema=columnas)

In [11]:
# Number of rows in the DataFrame.
df_1.count()

4

In [14]:
# Print only the first 2 rows.
df_1.show(2)

+---+-----------+---+
| id|     nombre|  l|
+---+-----------+---+
|  1|Errodringer|  a|
|  2|       Paco|  b|
+---+-----------+---+
only showing top 2 rows



Columnas del DataFrame

In [15]:
# Column names as a Python list.
df_1.columns

['id', 'nombre', 'l']

Schema del DataFrame

In [16]:
# Print each column's name, type and nullability.
df_1.printSchema()

root
 |-- id: long (nullable = true)
 |-- nombre: string (nullable = true)
 |-- l: string (nullable = true)



Resumen estadístico del DataFrame

In [17]:
# Summary statistics (count, mean, stddev, min, max) for every column.
df_1.describe().show()

+-------+------------------+------+----+
|summary|                id|nombre|   l|
+-------+------------------+------+----+
|  count|                 4|     4|   4|
|   mean|               2.5|  null|null|
| stddev|1.2909944487358056|  null|null|
|    min|                 1| Adios|   a|
|    max|                 4|  Paco|   d|
+-------+------------------+------+----+



In [18]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType



In [19]:
# Explicit schema: `id` is declared as an integer and the second column is named "name";
# all three columns are nullable.
schema_1 = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("l", StringType(), True)
)



In [20]:
# Same rows as before, this time with the explicit schema instead of inferred types.
df_11 = spark.createDataFrame(lista, schema=schema_1)

In [21]:
# Check the resulting schema: `id` is integer and the second column is called `name`.
df_11.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- l: string (nullable = true)



In [22]:
# Print the DataFrame contents.
df_11.show()

+---+-----------+---+
| id|       name|  l|
+---+-----------+---+
|  1|Errodringer|  a|
|  2|       Paco|  b|
|  3|       Hola|  c|
|  4|      Adios|  d|
+---+-----------+---+



## 2.2 A partir de un csv

In [23]:
# Comma-separated file whose first line holds the column names.
# No schema is supplied and schema inference is not enabled, so every column is loaded as a string.
df = spark.read.options(sep=',', header=True).csv('data/business.csv')

In [24]:
# Number of data rows read from the CSV.
df.count()

1936

In [25]:
# First 5 rows; long cell values are truncated in the printed table.
df.show(5)

+----------------+-------+----------+----------+------+-------+---------+--------------------+--------------------+--------------------+--------------------+--------------+--------------+--------------+
|Series_reference| Period|Data_value|Suppressed|STATUS|  UNITS|Magnitude|             Subject|               Group|      Series_title_1|      Series_title_2|Series_title_3|Series_title_4|Series_title_5|
+----------------+-------+----------+----------+------+-------+---------+--------------------+--------------------+--------------------+--------------------+--------------+--------------+--------------+
|   BDCQ.SF1AA2CA|2016.06|  1116.386|      null|     F|Dollars|        6|Business Data Col...|Industry by finan...|Sales (operating ...|Forestry and Logging|Current prices|    Unadjusted|          null|
|   BDCQ.SF1AA2CA|2016.09|  1070.874|      null|     F|Dollars|        6|Business Data Col...|Industry by finan...|Sales (operating ...|Forestry and Logging|Current prices|    Unadjusted| 

In [26]:
# Same 5 rows with truncate=False so the full cell values are printed.
df.show(5, truncate=False)

+----------------+-------+----------+----------+------+-------+---------+------------------------------+------------------------------+------------------------+--------------------+--------------+--------------+--------------+
|Series_reference|Period |Data_value|Suppressed|STATUS|UNITS  |Magnitude|Subject                       |Group                         |Series_title_1          |Series_title_2      |Series_title_3|Series_title_4|Series_title_5|
+----------------+-------+----------+----------+------+-------+---------+------------------------------+------------------------------+------------------------+--------------------+--------------+--------------+--------------+
|BDCQ.SF1AA2CA   |2016.06|1116.386  |null      |F     |Dollars|6        |Business Data Collection - BDC|Industry by financial variable|Sales (operating income)|Forestry and Logging|Current prices|Unadjusted    |null          |
|BDCQ.SF1AA2CA   |2016.09|1070.874  |null      |F     |Dollars|6        |Business Data Colle

In [27]:
# Every column is reported as string because the CSV was read without a schema or schema inference.
df.printSchema()

root
 |-- Series_reference: string (nullable = true)
 |-- Period: string (nullable = true)
 |-- Data_value: string (nullable = true)
 |-- Suppressed: string (nullable = true)
 |-- STATUS: string (nullable = true)
 |-- UNITS: string (nullable = true)
 |-- Magnitude: string (nullable = true)
 |-- Subject: string (nullable = true)
 |-- Group: string (nullable = true)
 |-- Series_title_1: string (nullable = true)
 |-- Series_title_2: string (nullable = true)
 |-- Series_title_3: string (nullable = true)
 |-- Series_title_4: string (nullable = true)
 |-- Series_title_5: string (nullable = true)



In [28]:
# Row count again, right before the data is written out.
df.count()

1936

Escritura de un DataFrame.
En este caso se escribe en ficheros de tipo Parquet, un formato de fichero optimizado para trabajar en entornos big data con grandes volúmenes de datos.

In [29]:
# Write the DataFrame as Parquet under "parquet_example", replacing any previous output.
df.write.mode('overwrite').parquet("parquet_example")


## 2.3 A partir de un parquet

In [30]:
# Load the Parquet output back into a new DataFrame.
df_p = spark.read.parquet('parquet_example')

In [32]:
# First 4 rows of the DataFrame read from Parquet.
df_p.show(4)

+----------------+-------+----------+----------+------+-------+---------+--------------------+--------------------+--------------------+--------------------+--------------+--------------+--------------+
|Series_reference| Period|Data_value|Suppressed|STATUS|  UNITS|Magnitude|             Subject|               Group|      Series_title_1|      Series_title_2|Series_title_3|Series_title_4|Series_title_5|
+----------------+-------+----------+----------+------+-------+---------+--------------------+--------------------+--------------------+--------------------+--------------+--------------+--------------+
|   BDCQ.SF1AA2CA|2016.06|  1116.386|      null|     F|Dollars|        6|Business Data Col...|Industry by finan...|Sales (operating ...|Forestry and Logging|Current prices|    Unadjusted|          null|
|   BDCQ.SF1AA2CA|2016.09|  1070.874|      null|     F|Dollars|        6|Business Data Col...|Industry by finan...|Sales (operating ...|Forestry and Logging|Current prices|    Unadjusted| 

In [33]:
# Summary statistics for every column of the Parquet-backed DataFrame.
df_p.describe().show()

+-------+----------------+------------------+------------------+----------+------+-------+---------+--------------------+--------------------+--------------------+--------------------+--------------+--------------+--------------+
|summary|Series_reference|            Period|        Data_value|Suppressed|STATUS|  UNITS|Magnitude|             Subject|               Group|      Series_title_1|      Series_title_2|Series_title_3|Series_title_4|Series_title_5|
+-------+----------------+------------------+------------------+----------+------+-------+---------+--------------------+--------------------+--------------------+--------------------+--------------+--------------+--------------+
|  count|            1936|              1936|              1936|         0|  1936|   1936|     1936|                1936|                1936|                1936|                1936|          1936|          1936|             0|
|   mean|            null| 2018.217975206615|2704.3055568181853|      null|  nul

In [34]:
# Convert to pandas. This loads all rows into the driver's memory, so use it only on small data.
df_pandas = df_p.toPandas()

In [35]:
# First 5 rows of the pandas DataFrame.
df_pandas.head()

Unnamed: 0,Series_reference,Period,Data_value,Suppressed,STATUS,UNITS,Magnitude,Subject,Group,Series_title_1,Series_title_2,Series_title_3,Series_title_4,Series_title_5
0,BDCQ.SF1AA2CA,2016.06,1116.386,,F,Dollars,6,Business Data Collection - BDC,Industry by financial variable,Sales (operating income),Forestry and Logging,Current prices,Unadjusted,
1,BDCQ.SF1AA2CA,2016.09,1070.874,,F,Dollars,6,Business Data Collection - BDC,Industry by financial variable,Sales (operating income),Forestry and Logging,Current prices,Unadjusted,
2,BDCQ.SF1AA2CA,2016.12,1054.408,,F,Dollars,6,Business Data Collection - BDC,Industry by financial variable,Sales (operating income),Forestry and Logging,Current prices,Unadjusted,
3,BDCQ.SF1AA2CA,2017.03,1010.665,,F,Dollars,6,Business Data Collection - BDC,Industry by financial variable,Sales (operating income),Forestry and Logging,Current prices,Unadjusted,
4,BDCQ.SF1AA2CA,2017.06,1233.7,,F,Dollars,6,Business Data Collection - BDC,Industry by financial variable,Sales (operating income),Forestry and Logging,Current prices,Unadjusted,


In [None]:
# NOTE(review): the session is never stopped in this notebook; enable the call below
# to shut it down once you are done.
#spark.stop()