In [19]:
from pyspark.sql import SparkSession

1. Creamos una SparkSession

In [20]:
# Build (or reuse) the notebook's SparkSession, running locally with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Test')
    .getOrCreate()
)

1.1 Leemos un archivo .CSV para crear un __DataFrame__

In [21]:
# Location of the student CSV used by the reads below.
PATH = "/mnt/d/Proyectos/Tutorial-SparkAWS/data/StudentData.csv"

# Read the CSV, taking the column names from its first line.
df = (
    spark.read
    .option("header", "True")
    .csv(PATH)
)

1.2 Aplicamos una __Acción__ sobre el __DataFrame__ para mostrar su contenido

In [22]:
# Action: print the first 5 rows of the DataFrame as a text table.
df.show(5)

+---+------+----------------+------+-----+-----+--------------------+
|age|gender|            name|course| roll|marks|               email|
+---+------+----------------+------+-----+-----+--------------------+
| 28|Female| Hubert Oliveras|    DB|02984|   59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud|12899|   62|Margene Moores_Ma...|
| 28|  Male|  Celeste Lollis|    PF|21267|   45|Jeannetta Golden_...|
| 29|Female|    Elenore Choy|    DB|32877|   29|Billi Clore_Mitzi...|
| 28|  Male|  Sheryll Towler|   DSA|41487|   41|Claude Panos_Judi...|
+---+------+----------------+------+-----+-----+--------------------+
only showing top 5 rows



2. Trabajamos con Esquemas

2.1 Mostramos el esquema del DF anterior. __Vemos que todo es string__, y no es lo que queremos.

In [23]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- course: string (nullable = true)
 |-- roll: string (nullable = true)
 |-- marks: string (nullable = true)
 |-- email: string (nullable = true)



2.2 Inferimos el esquema usando __.option("inferSchema", True)__

In [24]:
# Re-read the CSV, this time asking Spark to infer each column's type.
df = (
    spark.read
    .option("Header", "True")
    .option("inferSchema", True)
    .csv(PATH)
)
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- course: string (nullable = true)
 |-- roll: integer (nullable = true)
 |-- marks: integer (nullable = true)
 |-- email: string (nullable = true)



2.3 Especificar esquema de forma manual.

In [25]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for the student CSV. `roll` is declared as a string so
# values such as "02984" keep their leading zero; `age` and `marks` are
# integers. Every field is nullable.
schema_a = StructType([
    StructField("age", IntegerType(), True),
    StructField("gender", StringType(), True),
    StructField("name", StringType(), True),
    StructField("course", StringType(), True),
    StructField("roll", StringType(), True),
    StructField("marks", IntegerType(), True),
    StructField("email", StringType(), True),
])

# Read the CSV with the schema above instead of inferring the types.
df = spark.read.schema(schema_a).option("header", True).csv(PATH)
df.show(5)
# printSchema() writes the schema tree itself and returns None, so it must
# not be wrapped in print() (that only appended a stray "None" to the output).
df.printSchema()

+---+------+----------------+------+-----+-----+--------------------+
|age|gender|            name|course| roll|marks|               email|
+---+------+----------------+------+-----+-----+--------------------+
| 28|Female| Hubert Oliveras|    DB|02984|   59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud|12899|   62|Margene Moores_Ma...|
| 28|  Male|  Celeste Lollis|    PF|21267|   45|Jeannetta Golden_...|
| 29|Female|    Elenore Choy|    DB|32877|   29|Billi Clore_Mitzi...|
| 28|  Male|  Sheryll Towler|   DSA|41487|   41|Claude Panos_Judi...|
+---+------+----------------+------+-----+-----+--------------------+
only showing top 5 rows

root
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- course: string (nullable = true)
 |-- roll: string (nullable = true)
 |-- marks: integer (nullable = true)
 |-- email: string (nullable = true)

None


3.1 Usar `**kwargs` en Spark

Se usa para no tener tantos __.option()__ encadenados, reemplazándolos por un único __.options()__

In [26]:
# Gather all reader settings in one mapping and hand them to .options()
# as keyword arguments, instead of chaining several .option() calls.
reader_opts = dict(inferSchema=True, Header=True, delimiter=",")
df = spark.read.options(**reader_opts).csv(PATH)
df.show(2)

+---+------+----------------+------+-----+-----+--------------------+
|age|gender|            name|course| roll|marks|               email|
+---+------+----------------+------+-----+-----+--------------------+
| 28|Female| Hubert Oliveras|    DB| 2984|   59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud|12899|   62|Margene Moores_Ma...|
+---+------+----------------+------+-----+-----+--------------------+
only showing top 2 rows



4. Spark DF from RDDs

Primero creamos un RDD y luego lo convertimos a DF

In [27]:
from pyspark import SparkContext, SparkConf

# Configuration for a local context named 'Test_rdd'.
conf = SparkConf()
conf.setAppName('Test_rdd')
conf.setMaster("local[*]")

# NOTE(review): a SparkSession was already created earlier in this notebook,
# so getOrCreate() presumably returns that session's existing context and
# ignores `conf` - confirm. `spark.sparkContext` would make that explicit.
spark_rdd = SparkContext.getOrCreate(conf=conf)

In [53]:
PATH = "/mnt/d/Proyectos/Tutorial-SparkAWS/data/StudentData.csv"

# Load the file as an RDD of raw text lines.
rdd = spark_rdd.textFile(PATH)

# The first line carries the column names: keep it aside, then drop every
# line equal to it so only data rows remain.
header = rdd.first()
rdd_data = rdd.filter(lambda line: line != header)

In [58]:
def _cast_fields(fields):
    """Return the first seven fields with positions 0 and 5 cast to int."""
    return [
        int(fields[0]),
        fields[1],
        fields[2],
        fields[3],
        fields[4],
        int(fields[5]),
        fields[6],
    ]


# Split each text line on commas, then cast the two numeric positions.
rdd_data_split = rdd_data.map(lambda line: line.split(","))
rdd_data_f = rdd_data_split.map(_cast_fields)
rdd_data_f.first()

[28,
 'Female',
 'Hubert Oliveras',
 'DB',
 '02984',
 59,
 'Annika Hoffman_Naoma Fritts@OOP.com']

4.1 Convertimos el __Rdd__ en un __DataFrame__ usando el header que sacamos del __Rdd__

In [62]:
# Turn the header line saved from the RDD into the list of column names,
# then build the DataFrame from the typed RDD using those names.
lista_headers = header.split(",")
df = rdd_data_f.toDF(schema=lista_headers)

In [63]:
# Print the first 5 rows of the DataFrame built from the RDD.
df.show(5)

+---+------+----------------+------+-----+-----+--------------------+
|age|gender|            name|course| roll|marks|               email|
+---+------+----------------+------+-----+-----+--------------------+
| 28|Female| Hubert Oliveras|    DB|02984|   59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud|12899|   62|Margene Moores_Ma...|
| 28|  Male|  Celeste Lollis|    PF|21267|   45|Jeannetta Golden_...|
| 29|Female|    Elenore Choy|    DB|32877|   29|Billi Clore_Mitzi...|
| 28|  Male|  Sheryll Towler|   DSA|41487|   41|Claude Panos_Judi...|
+---+------+----------------+------+-----+-----+--------------------+
only showing top 5 rows



In [64]:
# Print the schema that toDF() produced for the RDD-based DataFrame.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- course: string (nullable = true)
 |-- roll: string (nullable = true)
 |-- marks: long (nullable = true)
 |-- email: string (nullable = true)



4.2 Como el esquema inferido no es el que queremos (los enteros de Python se infieren como __long__), creamos nuestro propio esquema; en lugar de usar __rdd.toDF()__ usamos __spark.createDataFrame(rdd, schema=...)__

In [65]:

# Explicit schema for the RDD rows: `age` and `marks` as integers, every
# other column (including `roll`) as a string; all fields are nullable.
schema_rdd = StructType([
    StructField("age", IntegerType(), True),
    StructField("gender", StringType(), True),
    StructField("name", StringType(), True),
    StructField("course", StringType(), True),
    StructField("roll", StringType(), True),
    StructField("marks", IntegerType(), True),
    StructField("email", StringType(), True)
])

# Build the DataFrame from the typed RDD with the schema above.
mi_df = spark.createDataFrame(rdd_data_f, schema=schema_rdd)
# printSchema() writes the schema tree itself and returns None, so it must
# not be wrapped in print() (that only appended a stray "None" to the output).
mi_df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- course: string (nullable = true)
 |-- roll: string (nullable = true)
 |-- marks: integer (nullable = true)
 |-- email: string (nullable = true)

None


5. Seleccion de columnas

5.1 Notación directa, especificando la columna

In [67]:
# Select columns by passing their names as strings.
mi_df.select("age", "gender").show(2)

+---+------+
|age|gender|
+---+------+
| 28|Female|
| 29|Female|
+---+------+
only showing top 2 rows



5.2 Usando __dot notation__

In [68]:
# Select columns through attribute access on the DataFrame (dot notation).
df.select(df.age, df.email).show(2)

+---+--------------------+
|age|               email|
+---+--------------------+
| 28|Annika Hoffman_Na...|
| 29|Margene Moores_Ma...|
+---+--------------------+
only showing top 2 rows



5.3 Usando __col()__

Es una función del módulo __pyspark.sql.functions__

In [70]:
from pyspark.sql.functions import col

# Select columns through col(), which refers to a column by name.
df.select(col("age"), col("gender")).show(5)

+---+------+
|age|gender|
+---+------+
| 28|Female|
| 29|Female|
| 28|  Male|
| 29|Female|
| 28|  Male|
+---+------+
only showing top 5 rows



5.4 Mostrar todas las columnas usando __*__

In [72]:
# "*" selects every column of the DataFrame.
df.select("*").show(4)

+---+------+----------------+------+-----+-----+--------------------+
|age|gender|            name|course| roll|marks|               email|
+---+------+----------------+------+-----+-----+--------------------+
| 28|Female| Hubert Oliveras|    DB|02984|   59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud|12899|   62|Margene Moores_Ma...|
| 28|  Male|  Celeste Lollis|    PF|21267|   45|Jeannetta Golden_...|
| 29|Female|    Elenore Choy|    DB|32877|   29|Billi Clore_Mitzi...|
+---+------+----------------+------+-----+-----+--------------------+
only showing top 4 rows



5.5 Select de las columnas usando Index

In [76]:
# df.columns is a plain list of column names, so slicing it ([3:]) selects
# the columns from position 3 onwards.
df.select(df.columns[3:]).show(4)

+------+-----+-----+--------------------+
|course| roll|marks|               email|
+------+-----+-----+--------------------+
|    DB|02984|   59|Annika Hoffman_Na...|
| Cloud|12899|   62|Margene Moores_Ma...|
|    PF|21267|   45|Jeannetta Golden_...|
|    DB|32877|   29|Billi Clore_Mitzi...|
+------+-----+-----+--------------------+
only showing top 4 rows



5.6 Podemos combinar todo lo anterior