# Apache Spark en Google Colab
Ejercicios de WordCount, DataFrame API y MLlib (clasificación)

In [1]:
#configuración en google colab de spark y pyspark
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!apt-get install openjdk-17-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-4.0.1/spark-4.0.1-bin-hadoop3.tgz
!tar xf spark-4.0.1-bin-hadoop3.tgz
!pip install -q findspark
!pip install -q pyspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-4.0.1-bin-hadoop3"
import findspark
findspark.init()

## Ejemplo 1: WordCount con RDD

In [3]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

text = sc.textFile("/content/gdrive/MyDrive/datasets/gutenberg-small/*.txt")
# Simular archivo de texto
# text = sc.parallelize(["Hola Spark Hola Big Data", "Spark es rápido y poderoso"])
counts = text.flatMap(lambda x: x.split(" ")) \
             .map(lambda x: (x, 1)) \
             .reduceByKey(lambda a, b: a + b)
counts.collect()

[('', 27298),
 ('Published', 3),
 ('themselves', 192),
 ('were', 1450),
 ('sheet', 4),
 ('despatched', 4),
 ('most', 551),
 ('turbulent', 2),
 ('A.', 1456),
 ('ORIGINALS', 1),
 ('IN', 84),
 ('BROTHER]', 2),
 ('more', 1211),
 ('forget', 46),
 ('prove', 113),
 ('Give', 14),
 ('Johnston:--', 1),
 ('request', 53),
 ('comply', 10),
 ('times', 117),
 ('know.', 24),
 ('doubt', 134),
 ("day's", 8),
 ('does', 445),
 ('what', 1162),
 ('nail,"', 3),
 ('Let', 249),
 ('fair', 127),
 ('own', 664),
 ('dollar.', 5),
 ('yourself', 67),
 ('County.', 16),
 ('ever.', 20),
 ("months'", 7),
 ('it?', 167),
 ('mine.', 24),
 ('power', 430),
 ('give,', 10),
 ('cause,', 44),
 ('APPROVAL', 2),
 ('First', 28),
 ('States,', 567),
 ('consider', 165),
 ('seems', 147),
 ('accession', 8),
 ('Republican', 186),
 ('cause', 184),
 ('believe', 421),
 ('so,', 180),
 ('Those', 38),
 ('acceptance,', 7),
 ('rights', 124),
 ('control', 125),
 ('invasion', 23),
 ('under', 741),
 ('pretext,', 4),
 ('sentiments;', 6),
 ('susceptib

## Ejemplo 2: Análisis con DataFrame API

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

# Simular DataFrame de ventas
data = [("martillo", 12000), ("taladro", 45000), ("martillo", 15000)]
columns = ["producto", "valor"]
df = spark.createDataFrame(data, columns)
df.groupBy("producto").sum("valor").show()

+--------+----------+
|producto|sum(valor)|
+--------+----------+
|martillo|     27000|
| taladro|     45000|
+--------+----------+



## Ejemplo 3: Clasificación con MLlib

In [6]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

df = spark.read.csv("/content/gdrive/MyDrive/datasets/clientes.csv", header=True, inferSchema=True)

assembler = VectorAssembler(inputCols=["edad", "ingresos"], outputCol="features")
data = assembler.transform(df).select("features", df["comprador"].alias("label"))
train, test = data.randomSplit([0.8, 0.2], seed=42)
lr = LogisticRegression()
model = lr.fit(train)
model.transform(test).select("features", "label", "prediction").show()

+-------------+-----+----------+
|     features|label|prediction|
+-------------+-----+----------+
|[34.0,4500.0]|    1|       0.0|
+-------------+-----+----------+



## Ejemplo 4: Spark GraphX

In [7]:
!pip install -q pyspark
!pyspark --packages graphframes:graphframes:0.8.3-spark3.5-s_2.12

Python 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
:: loading settings :: url = jar:file:/content/spark-4.0.1-bin-hadoop3/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2.5.2/cache
The jars for the packages stored in: /root/.ivy2.5.2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5510e7fd-415a-4982-a1a4-d52ae35506a1;1.0
	confs: [default]
	found graphframes#graphframes;0.8.3-spark3.5-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
downloading https://repos.spark-packages.org/graphframes/graphframes/0.8.3-spark3.5-s_2.12/graphframes-0.8.3-spark3.5-s_2.12.jar ...
	[SUCCESSFUL ] graphframes#graphframes;0.8.3-spark3.5-s_2.12!graphframes.jar (136ms)
downloading https://repo1.maven.org/maven2/org/slf4j/slf4j-api/1.7.16/slf4j-api-1.7.16.jar ...
	[SUCCESSFUL ] org.s

In [13]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("GraphFrames PageRank") \
    .config("spark.jars.packages", "graphframes:graphframes:0.8.3-spark3.5-s_2.12") \
    .getOrCreate()