# Apache Spark en Amazon EMR
Ejercicios de WordCount, DataFrame API, MLlib (clasificación) y GraphFrames (PageRank)

## Ejemplo 1: WordCount con RDD

In [1]:
from pyspark.sql import SparkSession

# This works on EMR; nothing needs to be installed.
spark = (
    SparkSession.builder
    .appName("WordCount S3")
    .getOrCreate()
)

# The SparkContext is ready to use through the session.
sc = spark.sparkContext


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2,application_1763342042710_0003,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

# Read every .txt file under the Gutenberg prefix as an RDD of text lines.
text = sc.textFile("s3://yasirblandon-datalake/datasets/gutenberg-small/*.txt")
# Alternative without S3: simulate the text file with an in-memory RDD.
# text = sc.parallelize(["Hola Spark Hola Big Data", "Spark es rápido y poderoso"])

# split() with no argument splits on any run of whitespace, so blank lines,
# repeated spaces and tabs do not yield empty-string "words" the way
# split(" ") does.
counts = (
    text.flatMap(lambda line: line.split())
        .map(lambda word: (word, 1))
        .reduceByKey(lambda a, b: a + b)
)

# Bring only the most frequent words back to the driver. collect() would
# return one (word, count) pair per distinct word in the corpus and dump the
# whole list as cell output.
TOP_N = 20
counts.takeOrdered(TOP_N, key=lambda pair: -pair[1])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…



## Ejemplo 2: Análisis con DataFrame API

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

# Simulated sales records, one (product, amount) tuple per sale.
data = [
    ("martillo", 12000),
    ("taladro", 45000),
    ("martillo", 15000),
]
columns = ["producto", "valor"]
df = spark.createDataFrame(data, schema=columns)

# Print the summed amount for each product.
(
    df.groupBy("producto")
      .sum("valor")
      .show()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+----------+
|producto|sum(valor)|
+--------+----------+
|martillo|     27000|
| taladro|     45000|
+--------+----------+

## Ejemplo 3: Clasificación con MLlib

In [6]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

# Load the customers CSV from S3; the first row is the header and column
# types are inferred from the data.
df = spark.read.csv(
    "s3://yasirblandon-datalake/datasets/clientes.csv",
    header=True,
    inferSchema=True,
)

# Pack the two predictor columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=["edad", "ingresos"], outputCol="features")
# Keep only the feature vector and the target, exposing "comprador" as "label".
data = assembler.transform(df).selectExpr("features", "comprador AS label")

# 80/20 train/test split with a fixed seed.
train, test = data.randomSplit([0.8, 0.2], seed=42)

# The column names are spelled out; they match LogisticRegression's defaults.
lr = LogisticRegression(featuresCol="features", labelCol="label")
model = lr.fit(train)

# Show the predictions next to the true labels for the held-out rows.
model.transform(test).select("features", "label", "prediction").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------+-----+----------+
|     features|label|prediction|
+-------------+-----+----------+
|[34.0,4500.0]|    1|       0.0|
+-------------+-----+----------+

## Ejemplo 4: Grafos con GraphFrames (PageRank)

In [9]:
from pyspark.sql import SparkSession

# Request the GraphFrames package (build for Spark 3.5 / Scala 2.12) when
# obtaining the session.
# NOTE(review): getOrCreate() returns the already-running session if there is
# one, and spark.jars.packages is presumably only honoured when the session is
# first started. Confirm that GraphFrames is actually importable after this
# cell; on a sparkmagic/EMR kernel the package may have to be requested with
# `%%configure -f` before the session is created.
spark = (
    SparkSession.builder
    .appName("GraphFrames PageRank")
    .config("spark.jars.packages", "graphframes:graphframes:0.8.3-spark3.5-s_2.12")
    .getOrCreate()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…