#### Launch Spark session

In [1]:
import os
import re
from datetime import datetime, date

import numpy as np
import pandas as pd
import pyspark
from bs4 import BeautifulSoup
from IPython.display import display
from pyspark import StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Show every column and every row when a pandas frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Object-store connection settings. They are read from the environment when
# present so that real secrets never have to be committed with the notebook.
# The fallbacks are the values this notebook previously hard-coded.
# NOTE(review): presumably throwaway local-MinIO credentials — confirm, and
# drop the fallbacks if they are not.
# NOTE(review): the endpoint carries no http:// or https:// scheme — confirm
# the S3A connector resolves it the way the MinIO deployment expects.
S3A_ENDPOINT = os.environ.get("S3A_ENDPOINT", "minio:9010")
S3A_ACCESS_KEY = os.environ.get("S3A_ACCESS_KEY", "root")
S3A_SECRET_KEY = os.environ.get("S3A_SECRET_KEY", "root12345")

# Create (or reuse) the SparkSession against the standalone master.
# NOTE(review): a fixed executor count is combined with dynamic allocation,
# and spark.deploy.defaultCores is documented as a master-side setting —
# confirm both are intended before relying on them.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("MySparkApp")
    # Executor sizing.
    .config("spark.executor.instances", 2)
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", "1")
    # Driver sizing.
    .config("spark.driver.cores", "4")
    .config("spark.driver.memory", "6g")
    .config("spark.driver.maxResultSize", "3g")
    # S3A access to the object store.
    .config("spark.hadoop.fs.s3a.endpoint", S3A_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", S3A_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3A_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # Extra JVM packages: S3A filesystem support and the Excel data source.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1,com.crealytics:spark-excel_2.12:0.13.5")
    # Scheduling.
    .config("spark.deploy.defaultCores", 1)
    .config("spark.dynamicAllocation.enabled", True)
    .getOrCreate()
)

### RDD programming guide

In [2]:
%%time
l = [('Alice', 1)]

l = spark.createDataFrame(l, ['name', 'age'])

l.collect()

print(l.is_cached)
l.persist(storageLevel=StorageLevel.MEMORY_AND_DISK)
print(l.is_cached)
l.count()
l.unpersist()

False
True
CPU times: user 19.9 ms, sys: 516 µs, total: 20.4 ms
Wall time: 11.6 s


DataFrame[name: string, age: bigint]

In [3]:
# spark.stop()

In [4]:
# Count how many times each distinct line occurs in data.txt.
lines = spark.sparkContext.textFile("data.txt")
pairs = lines.map(lambda line: (line, 1))  # one (line, 1) pair per input line
counts = pairs.reduceByKey(lambda total, one: total + one)  # sum the 1s per key
counts.collect()

[('asdasdas', 2), ('asdr23rsaa', 4), ('asdasdaswe', 1)]

In [5]:
# counts = pairs.reduce(lambda a, b: a + b)
# counts

In [6]:
# reduceByKey: the values are summed separately for each key.
l = spark.sparkContext.parallelize(
    [('a', 1), ('b', 2), ('a', 3), ('c', 3), ('c', 4), ('c', 5)]
)
print(l.reduceByKey(lambda left, right: left + right).collect())

# reduce: every element is folded into one single value.
l = spark.sparkContext.parallelize([1, 2, 3, 4, 5])
l.reduce(lambda left, right: left + right)

[('b', 2), ('c', 12), ('a', 4)]


15

In [7]:
# `+` on two tuples concatenates them; it does not add them element-wise.
('a', 1) + ('a', 1)

('a', 1, 'a', 1)

In [8]:
sentences = spark.sparkContext.parallelize(["Hello world", "How are you"])

# flatMap splits each sentence on whitespace and flattens the per-sentence
# word lists into a single RDD of words.
words = sentences.flatMap(lambda text: text.split())

# Bring the words back to the driver and display them.
result = words.collect()
result

['Hello', 'world', 'How', 'are', 'you']

In [9]:
# Shared counter, starting at zero, that g increments on every call.
accum = spark.sparkContext.accumulator(0)


def g(x):
    """Return ``x + x`` and add one to the ``accum`` accumulator."""
    accum.add(1)
    return x + x


data = spark.sparkContext.parallelize([1, 2, 3])

# g is applied through a map transformation; collect() is the action that
# triggers it and returns the mapped values.
doubled = data.map(g)
doubled.collect()

[2, 4, 6]

In [10]:
# Run the same map + collect a second time: g is invoked again for every
# element, so the accumulator keeps growing rather than staying at the number
# of elements (the displayed value is 6 for the three-element RDD).
data.map(g).collect()
# Last expression: display the accumulator and its current value.
accum

Accumulator<id=0, value=6>

### Data skewness

In [None]:
def show_partition_sizes(sdf):
    """Print how many rows of the Spark DataFrame ``sdf`` sit in each partition.

    This cell originally ran the query against a bare ``df``. No Spark
    DataFrame of that name exists at this point in the notebook, and the only
    ``df`` visible in it is a pandas frame created further down, which has no
    ``groupBy`` method, so the cell could not run. Taking the DataFrame as a
    parameter keeps the recipe available without depending on an undefined
    name.

    Parameters
    ----------
    sdf : a Spark DataFrame whose partition sizes should be shown.
    """
    sdf.groupBy(F.spark_partition_id()).count().show()

In [2]:
import pandas as pd
from faker import Faker
import numpy as np

# Fixed seed so the synthetic data set is identical on every run.
SEED = 42
Faker.seed(SEED)
rng = np.random.default_rng(SEED)

# Initialize Faker
fake = Faker()

# Define the size of the DataFrame
num_records = 10**5

# Generate names
names = [fake.name() for _ in range(4)]  # 4 names

# Probability of drawing each name: 85% for the first name, and the
# remaining 15% split evenly across the other three. Defined once so the
# sampling and the expected counts below cannot drift apart.
name_probs = [0.85, 0.05, 0.05, 0.05]

# Expected number of rows per name (informational only; the sampling below
# uses name_probs directly).
int_distribution = [p * num_records for p in name_probs]

# Draw one name index per record according to the skewed distribution
random_ints = rng.choice(len(names), size=num_records, p=name_probs)

# Create the DataFrame: the sampled name plus a random integer in [1000, 10000)
df = pd.DataFrame({'name': [names[i] for i in random_ints],
                   'random_int': rng.integers(1000, 10000, size=num_records)})

# Hand the pandas frame over to Spark
spark_df = spark.createDataFrame(df)

In [3]:
%%time
spark_df = spark_df.repartition(4)
spark_df.groupBy(F.spark_partition_id()).count().show()

+--------------------+-----+
|SPARK_PARTITION_ID()|count|
+--------------------+-----+
|                   0|25000|
|                   2|25000|
|                   1|25000|
|                   3|25000|
+--------------------+-----+

CPU times: user 7.27 ms, sys: 186 µs, total: 7.46 ms
Wall time: 6.73 s


In [None]:
%%time
spark_df.repartition(4).write.partitionBy('name').mode('overwrite').parquet('s3a://spark/spark_df.parquet')