In [None]:
!pip install pyspark

In [None]:
# Only needed when running with spark-submit (or when running on Google Colab)
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('Ejemplo2')
    .getOrCreate()
)

In [None]:
# Load the data:
#   * the first row is treated as the header
#   * the schema is inferred from the data
df = spark.read.csv(
    '/content/appl_stock.csv',
    header=True,
    inferSchema=True,
)

# Display the inferred schema
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [None]:
# Display the first 5 rows
df.show(n=5)

+----------+----------+----------+------------------+------------------+---------+------------------+
|      Date|      Open|      High|               Low|             Close|   Volume|         Adj Close|
+----------+----------+----------+------------------+------------------+---------+------------------+
|2010-01-04|213.429998|214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|214.599998|215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|214.379993|    215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|    211.75|212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
+----------+----------+----------+------------------+------------------+---------+------------------+
only showing top 5 rows



## Filtraje de datos

In [None]:
# Filter rows by a condition on a single column (Close), written as a SQL expression
(
    df
    .filter("Close < 100")
    .show(n=5)
)

+----------+---------+---------+---------+---------+--------+---------+
|      Date|     Open|     High|      Low|    Close|  Volume|Adj Close|
+----------+---------+---------+---------+---------+--------+---------+
|2014-06-09|92.699997|93.879997|    91.75|93.699997|75415000|88.906324|
|2014-06-10|94.730003|95.050003|    93.57|    94.25|62777000|89.428189|
|2014-06-11|94.129997|94.760002|93.470001|93.860001|45681000|89.058142|
|2014-06-12|94.040001|94.120003|91.900002|92.290001|54749000|87.568463|
|2014-06-13|92.199997|92.440002|90.879997|91.279999|54525000|86.610132|
+----------+---------+---------+---------+---------+--------+---------+
only showing top 5 rows



In [None]:
# Filter rows by a column condition and display only a few columns
(
    df
    .filter("Close < 100")
    .select('Date', 'Close', 'Volume')
    .show(n=5)
)

+----------+---------+--------+
|      Date|    Close|  Volume|
+----------+---------+--------+
|2014-06-09|93.699997|75415000|
|2014-06-10|    94.25|62777000|
|2014-06-11|93.860001|45681000|
|2014-06-12|92.290001|54749000|
|2014-06-13|91.279999|54525000|
+----------+---------+--------+
only showing top 5 rows



In [None]:
# Filter on two conditions combined with a logical AND (&);
# each condition needs its own parentheses
df2 = (
    df
    .filter((df["Close"] < 100) & (df["Volume"] > 100000000))
    .select('Date', 'Close', 'Volume')
)
df2.show()

+----------+-----------------+---------+
|      Date|            Close|   Volume|
+----------+-----------------+---------+
|2014-06-20|        90.910004|100898000|
|2014-09-03|        98.940002|125421000|
|2014-09-09|        97.989998|189846300|
|2014-09-25|        97.870003|100092000|
|2014-10-15|        97.540001|100933600|
|2016-01-27|93.41999799999999|133369700|
|2016-04-27|            97.82|114602100|
+----------+-----------------+---------+



In [None]:
# Summary statistics (count, mean, stddev, min, max) of the filtered table
df2.describe().show()

+-------+----------+------------------+--------------------+
|summary|      Date|             Close|              Volume|
+-------+----------+------------------+--------------------+
|  count|         7|                 7|                   7|
|   mean|      null| 96.35571514285714|1.2359467142857143E8|
| stddev|      null|2.9850559698837844|3.2015803143120546E7|
|    min|2014-06-20|         90.910004|           100092000|
|    max|2016-04-27|         98.940002|           189846300|
+-------+----------+------------------+--------------------+



In [None]:
# Filter on two conditions combined with a logical OR (|), the second one
# negated with a logical NOT (~)
df3 = (
    df
    .filter((df["Close"] < 100) | ~(df["Volume"] > 100000000))
    .select('Date', 'Close', 'Volume')
)
df3.show(n=5)

+----------+------------------+--------+
|      Date|             Close|  Volume|
+----------+------------------+--------+
|2010-02-10|195.12000700000002|92590400|
|2010-02-22|        200.419994|97640900|
|2010-03-03|        209.329998|93013200|
|2010-03-04|210.71000299999997|91510300|
|2010-03-18|        224.650002|85527400|
+----------+------------------+--------+
only showing top 5 rows



### Acceso a los datos

In [None]:
# Retrieve the data: take(5) brings the first 5 rows back as a list of Row objects
datos = df3.take(5)
datos

[Row(Date='2010-02-10', Close=195.12000700000002, Volume=92590400),
 Row(Date='2010-02-22', Close=200.419994, Volume=97640900),
 Row(Date='2010-03-03', Close=209.329998, Volume=93013200),
 Row(Date='2010-03-04', Close=210.71000299999997, Volume=91510300),
 Row(Date='2010-03-18', Close=224.650002, Volume=85527400)]

In [None]:
# First Row of the retrieved list
datos[0]

Row(Date='2010-02-10', Close=195.12000700000002, Volume=92590400)

In [None]:
# Retrieve a single row using an equality condition; collect() returns the
# matching rows as a list of Row objects.
# NOTE(review): this is an exact equality test on a double column; it matches
# here, but confirm exact comparison (rather than a tolerance) is intended.
fila = df.filter(df["Low"]==197.16).collect()
fila

[Row(Date='2010-01-22', Open=206.78000600000001, High=207.499996, Low=197.16, Close=197.75, Volume=220441900, Adj Close=25.620401)]

In [None]:
# The only Row in the collected list
fila[0]

Row(Date='2010-01-22', Open=206.78000600000001, High=207.499996, Low=197.16, Close=197.75, Volume=220441900, Adj Close=25.620401)

In [None]:
# Access a single field (High) of the row: first by attribute name, then by
# position; each value is printed on its own line
print(fila[0].High, fila[0][2], sep="\n")

207.499996
207.499996


In [None]:
# Convert the Row into a dictionary keyed by column name
f = fila[0].asDict()
print(f)

{'Date': '2010-01-22', 'Open': 206.78000600000001, 'High': 207.499996, 'Low': 197.16, 'Close': 197.75, 'Volume': 220441900, 'Adj Close': 25.620401}


In [None]:
# Look up a value in the dictionary by its column name
f['High']

207.499996

## Fechas y estampas de tiempo

In [None]:
from pyspark.sql.functions import (dayofmonth, hour, dayofyear,
                                   month, year, weekofyear,
                                   format_number, date_format)

In [None]:
# Display only the Date column
df.select('Date').show(n=5)

+----------+
|      Date|
+----------+
|2010-01-04|
|2010-01-05|
|2010-01-06|
|2010-01-07|
|2010-01-08|
+----------+
only showing top 5 rows



In [None]:
# Display only the year of each date
df.select(year('Date')).show(n=5)

+----------+
|year(Date)|
+----------+
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
+----------+
only showing top 5 rows



In [None]:
# Display only the month of each date
df.select(month('Date')).show(n=5)

+-----------+
|month(Date)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
+-----------+
only showing top 5 rows



In [None]:
# Display only the day of the month of each date
df.select(dayofmonth('Date')).show(n=5)

+----------------+
|dayofmonth(Date)|
+----------------+
|               4|
|               5|
|               6|
|               7|
|               8|
+----------------+
only showing top 5 rows



In [None]:
# Display only the week of the year of each date
df.select(weekofyear('Date')).show(n=10)

+----------------+
|weekofyear(Date)|
+----------------+
|               1|
|               1|
|               1|
|               1|
|               1|
|               2|
|               2|
|               2|
|               2|
|               2|
+----------------+
only showing top 10 rows



In [None]:
# Keep the rows from April onwards (month number greater than 3)
df4 = df.filter(month('Date') > 3)
df4.show(n=3)

+----------+------------------+------------------+------------------+------------------+---------+------------------+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-04-01|            237.41|238.73000299999998|            232.75|235.96999399999999|150786300|30.572165999999996|
|2010-04-05|        234.980011|        238.509998|234.76999300000003|238.48999799999999|171126900|         30.898657|
|2010-04-06|238.20000499999998|240.23999799999999|        237.000004|        239.540009|111754300|31.034696000000004|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
only showing top 3 rows



In [None]:
# Display the day of the year for the rows kept in df4
df4.select(dayofyear('Date')).show(n=5)

+---------------+
|dayofyear(Date)|
+---------------+
|             91|
|             95|
|             96|
|             97|
|             98|
+---------------+
only showing top 5 rows

