In [1]:
import os
import seaborn as sns
import pandas as pd
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list, col, when, udf, avg, count, sum
from pyspark.sql.types import IntegerType, FloatType, StringType

pyspark.__version__

'3.5.0'

In [2]:

session = SparkSession.builder.appName('notebook1').getOrCreate()
session

In [3]:
df = session.createDataFrame(sns.load_dataset('tips'))
df.head()

Row(total_bill=16.99, tip=1.01, sex='Female', smoker='No', day='Sun', time='Dinner', size=2)

In [None]:
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: long (nullable = true)



In [5]:
df.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [6]:
df.limit(10).toPandas()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [7]:
df.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [8]:
df.select('total_bill', 'tip', 'sex').show(5)

+----------+----+------+
|total_bill| tip|   sex|
+----------+----+------+
|     16.99|1.01|Female|
|     10.34|1.66|  Male|
|     21.01| 3.5|  Male|
|     23.68|3.31|  Male|
|     24.59|3.61|Female|
+----------+----+------+
only showing top 5 rows



In [9]:
df[['total_bill', 'tip']].show(5) # esto es en pandas


+----------+----+
|total_bill| tip|
+----------+----+
|     16.99|1.01|
|     10.34|1.66|
|     21.01| 3.5|
|     23.68|3.31|
|     24.59|3.61|
+----------+----+
only showing top 5 rows



In [10]:
df.describe().toPandas()

Unnamed: 0,summary,total_bill,tip,sex,smoker,day,time,size
0,count,244.0,244.0,244,244,244,244,244.0
1,mean,19.785942622950813,2.9982786885245907,,,,,2.569672131147541
2,stddev,8.902411954856856,1.383638189001182,,,,,0.9510998047322344
3,min,3.07,1.0,Female,No,Fri,Dinner,1.0
4,max,50.81,10.0,Male,Yes,Thur,Lunch,6.0


In [11]:
df.dtypes

[('total_bill', 'double'),
 ('tip', 'double'),
 ('sex', 'string'),
 ('smoker', 'string'),
 ('day', 'string'),
 ('time', 'string'),
 ('size', 'bigint')]

In [12]:
df.schema['total_bill']

StructField('total_bill', DoubleType(), True)

In [13]:
#df_cast = df.withColumn('total_bill', col('total_bill').cast('float'))\
 #           .withColumn('tip', col('tip').cast('int'))
df_cast = df.withColumn('total_bill', col('total_bill').cast(FloatType()))\
            .withColumn('tip', col('tip').cast(IntegerType()))
df_cast.printSchema()

root
 |-- total_bill: float (nullable = true)
 |-- tip: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: long (nullable = true)



In [14]:
#agregaciones
df.select('total_bill', 'tip', 'size').summary('count', 'min', 'max', 'mean').show() #agregar show()para verlo

+-------+------------------+------------------+-----------------+
|summary|        total_bill|               tip|             size|
+-------+------------------+------------------+-----------------+
|  count|               244|               244|              244|
|    min|              3.07|               1.0|                1|
|    max|             50.81|              10.0|                6|
|   mean|19.785942622950813|2.9982786885245907|2.569672131147541|
+-------+------------------+------------------+-----------------+



In [15]:
df.summary().show() #hace todos los calculos, hay que agregarle el show() para verlo

+-------+------------------+------------------+------+------+----+------+------------------+
|summary|        total_bill|               tip|   sex|smoker| day|  time|              size|
+-------+------------------+------------------+------+------+----+------+------------------+
|  count|               244|               244|   244|   244| 244|   244|               244|
|   mean|19.785942622950813|2.9982786885245907|  NULL|  NULL|NULL|  NULL| 2.569672131147541|
| stddev| 8.902411954856856| 1.383638189001182|  NULL|  NULL|NULL|  NULL|0.9510998047322344|
|    min|              3.07|               1.0|Female|    No| Fri|Dinner|                 1|
|    25%|             13.28|               2.0|  NULL|  NULL|NULL|  NULL|                 2|
|    50%|             17.78|              2.88|  NULL|  NULL|NULL|  NULL|                 2|
|    75%|             24.08|              3.55|  NULL|  NULL|NULL|  NULL|                 3|
|    max|             50.81|              10.0|  Male|   Yes|Thur| Lun

In [16]:
#Filtros
#Forma 1
df.filter(df['total_bill']> 20).show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     20.65|3.35|  Male|    No|Sat|Dinner|   3|
|     20.29|2.75|Female|    No|Sat|Dinner|   2|
|     39.42|7.58|  Male|    No|Sat|Dinner|   4|
|      21.7| 4.3|  Male|    No|Sat|Dinner|   2|
|     20.69|2.45|Female|    No|Sat|Dinner|   4|
|     24.06| 3.6|  Male|    No|Sat|Dinner|   3|
|     31.27| 5.0|  Male|    No|Sat|Dinner|   3|
|      30.4| 5.6|  Male|    No|Sun|Dinner|   4|
|     22.23| 5.0|  Male|    No|Sun|Dinner|   2|
|      32.4| 6.0|  Male|    No|Sun|Dinner|   4|
|     28.55|2.05|  Male|    No|Sun|Dinne

In [17]:
#forma 2
#df.filter(df['total_bill'] > 20).collect()[0]['total_bill']
#forma 3
#df.filter(col('total_bill') > 20).show()

In [18]:
df.filter((df['total_bill'] > 20) & (df['tip'] > 3)).show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [19]:
#crear una nueva columna con el 10% de total_bill para el iva
df_new = df.withColumn('total_bill_iva_10', df['total_bill'] * 0.10)
df_new.show(2)

+----------+----+------+------+---+------+----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size| total_bill_iva_10|
+----------+----+------+------+---+------+----+------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|1.6989999999999998|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|             1.034|
+----------+----+------+------+---+------+----+------------------+
only showing top 2 rows



In [20]:
# en pandas solemos aplicar una transformacion utilizando apply()
#crear una columna categorica a partir de una numérica
df.withColumn(
    'tip_category',
    when(df['tip'] <= 1, 'baja')
    .when((df['tip'] > 1)& (df['tip'] <= 3), 'media')
    .otherwise('alta')).show()

+----------+----+------+------+---+------+----+------------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_category|
+----------+----+------+------+---+------+----+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|       media|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|       media|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        alta|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        alta|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        alta|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|        alta|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|       media|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|        alta|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|       media|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|        alta|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|       media|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|        alta|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|       media|
|     18.43| 3.0|  Male|

In [21]:
# esto es mejor solo para los casos avanzados en los que no nos sirve con las
#funciones que ya hay en finction
def categorize_tip(tip):
  if tip <= 1:
    return 'baja'
  elif tip > 1 and tip <= 3:
    return 'media'
  else:
    return 'alta'

In [22]:
udf_categorize_tip = udf(categorize_tip, StringType())
df.withColumn('tip_category', udf_categorize_tip('tip')).show()

+----------+----+------+------+---+------+----+------------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_category|
+----------+----+------+------+---+------+----+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|       media|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|       media|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        alta|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        alta|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        alta|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|        alta|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|       media|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|        alta|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|       media|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|        alta|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|       media|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|        alta|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|       media|
|     18.43| 3.0|  Male|

In [23]:
# renombrar columnas
df_renamed = df.withColumnRenamed('sex', 'genre')
df_renamed.show(4)

+----------+----+------+------+---+------+----+
|total_bill| tip| genre|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 4 rows



In [24]:
#borrar una columna
df_dropped = df.drop('sex', 'smoker')
df_dropped.show(4)

+----------+----+---+------+----+
|total_bill| tip|day|  time|size|
+----------+----+---+------+----+
|     16.99|1.01|Sun|Dinner|   2|
|     10.34|1.66|Sun|Dinner|   3|
|     21.01| 3.5|Sun|Dinner|   3|
|     23.68|3.31|Sun|Dinner|   2|
+----------+----+---+------+----+
only showing top 4 rows



In [25]:
#ordenar por una columna, en pandas es sort_values()
df.sort('total_bill').show(4) # asc

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|      3.07| 1.0|Female|   Yes|Sat|Dinner|   1|
|      5.75| 1.0|Female|   Yes|Fri|Dinner|   2|
|      7.25|5.15|  Male|   Yes|Sun|Dinner|   2|
|      7.25| 1.0|Female|    No|Sat|Dinner|   1|
+----------+----+------+------+---+------+----+
only showing top 4 rows



In [26]:
df.sort(col('total_bill').desc()).show(4) # descendente hay que usal col y desc

+----------+----+----+------+---+------+----+
|total_bill| tip| sex|smoker|day|  time|size|
+----------+----+----+------+---+------+----+
|     50.81|10.0|Male|   Yes|Sat|Dinner|   3|
|     48.33| 9.0|Male|    No|Sat|Dinner|   4|
|     48.27|6.73|Male|    No|Sat|Dinner|   4|
|     48.17| 5.0|Male|    No|Sun|Dinner|   6|
+----------+----+----+------+---+------+----+
only showing top 4 rows



In [27]:
df.orderBy(col('total_bill').desc()).show(4)

+----------+----+----+------+---+------+----+
|total_bill| tip| sex|smoker|day|  time|size|
+----------+----+----+------+---+------+----+
|     50.81|10.0|Male|   Yes|Sat|Dinner|   3|
|     48.33| 9.0|Male|    No|Sat|Dinner|   4|
|     48.27|6.73|Male|    No|Sat|Dinner|   4|
|     48.17| 5.0|Male|    No|Sun|Dinner|   6|
+----------+----+----+------+---+------+----+
only showing top 4 rows



In [28]:
#agrupar datos
# equivalente a value_counts de pandas
df.groupBy('sex').count().show()

+------+-----+
|   sex|count|
+------+-----+
|Female|   87|
|  Male|  157|
+------+-----+



In [29]:
# similar a pandas pordemos usar una funcion de agregacion para pedir varias agregaciones
df.groupby('sex').agg(
    count('*').alias('count_rows'),
    avg('total_bill').alias('avg_total_bill'),
    sum('tip').alias('sum_tips')
).show()

+------+----------+------------------+------------------+
|   sex|count_rows|    avg_total_bill|          sum_tips|
+------+----------+------------------+------------------+
|Female|        87| 18.05689655172414|            246.51|
|  Male|       157|20.744076433121016|485.07000000000005|
+------+----------+------------------+------------------+



In [30]:
# elimina filas donde hay al menos un valor nulo:
df_no_nulls = df.dropna()
#eliminar filas donde hay nulos solo en algunas columnas especificadas
df.dropna(subset=['tip'])

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint]

In [41]:
df_imputed = df_no_nulls.fillna({
    'total_bill': 0,
    'smoker': 'desconocido'
})
df_imputed.show(4)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 4 rows



In [32]:
#cargar CSV desde pandas y luego a pyspark
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/tips.csv'
df_pandas = pd.read_csv(url)
df_spark = session.createDataFrame(df_pandas)
df_spark.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [33]:
# Cargar CSV directamente con pyspark (más recomendable)
import requests
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/tips.csv'
csv_path = '/tmp/tips.csv'
csv_path = 'tips.csv'
with open(csv_path, 'wb') as file:
    file.write(requests.get(url).content)
    
df_spark = session.read.csv(csv_path, header=True, inferSchema=True)
df_spark.show(5)


+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [34]:
# Cargar CSV directamente con pyspark + schema(más recomendable)  
import requests
from pyspark.sql.types import StructType, StructField, FloatType, IntegerType, StringType
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/tips.csv'
csv_path = '/tmp/tips.csv'
csv_path = 'tips.csv'
with open(csv_path, 'wb') as file:
    file.write(requests.get(url).content)
    
schema = StructType([
    # columnas del dataset y su tipo de dato
    StructField('total_bill', FloatType(), True),
    StructField('tip', FloatType(), True),
    StructField('sex', StringType(), True),
    StructField('smoker', StringType(), True),
    StructField('day', StringType(), True),
    StructField('time', StringType(), True),
    StructField('size', IntegerType(), True)
])
  
df_spark = session.read.csv(csv_path, header=True, inferSchema=True)
df_spark.show(5)
df_spark.printSchema()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [None]:
#guardar datos a un CSV
# por defecto se particiona en múltiples archivos para procesado distribuido y repartirlos en nodos(maquinas)
# df.coalesce(1).write.csv('tips_clean.csv', header=True, mode='overwrite') # para reducir a una sola particion ( no recomendable)
df.write.csv('tips_clean.csv', header=True, mode='overwrite')

In [36]:
#verificar que aparece el archivo guardado:
import os

files = os.listdir('.')
for file in files:
    print(file)

.bashrc
.profile
.bash_logout
tips_clean.csv
.npm
.ipython
.cache
tips.csv
.local
.conda
.config
.jupyter
.wget-hsts
work


In [38]:
df_tips_clean = session.read.csv('tips_clean.csv', header=True, inferSchema=True)
df_tips_clean.show(3)

+----------+----+----+------+---+------+----+
|total_bill| tip| sex|smoker|day|  time|size|
+----------+----+----+------+---+------+----+
|     30.06| 2.0|Male|   Yes|Sat|Dinner|   3|
|     25.89|5.16|Male|   Yes|Sat|Dinner|   4|
|     48.33| 9.0|Male|    No|Sat|Dinner|   4|
+----------+----+----+------+---+------+----+
only showing top 3 rows



In [None]:
# # se puede conectar con otras fuentes de datos, como MySQL
# spark = SparkSession.builder.appName('mysqlapp').config('spark.jars', '/opt/mysql-connector-java-8.0.41.jar').getOrCreate()

# # java database connectivity
# df_mysql = spark.read.format('jdbc') \
#            .option('url', 'jdbc:mysql://localhost:3306/testing_db') \
#            .option('query', 'SELECT * FROM customers WHERE salary > 1000') \
#            .option('usser', 'root') \
#            .option('password', 'admin') \
#            .load()