# Big Data Analytics — Assignment 02
> Author : Badr TAJINI - Big Data Analytics - ESIEE 2025-2026


**Chapter 3 :** From MapReduce → Spark patterns  
**Chapter 4 :** Text analysis in PySpark

**Tools :** Spark or PySpark.   
**Advice:** Keep evidence and reproducibility.

## 0. Bootstrap

In [1]:
import subprocess
print(subprocess.check_output(["which", "python"]).decode())
print(subprocess.check_output(["python", "--version"]).decode())


/home/aurel/miniconda3/envs/bda-env/bin/python

Python 3.10.19



In [2]:
# write some code here
# - create SparkSession('BDA-A02') with UTC timezone
# - print Spark/PySpark/Python versions
# - set spark.sql.shuffle.partitions to a small value for local runs

import os, sys

import sys
import platform
from pyspark.sql import SparkSession
import pyspark

# python_path = "/home/aurel/miniconda3/envs/bda-env/bin/python"
# os.environ["PYSPARK_PYTHON"] = python_path
# os.environ["PYSPARK_DRIVER_PYTHON"] = python_path

spark = (
    SparkSession.builder
    .appName("BDA-02")
    .config("spark.sql.session.timeZone", "GMT+1")
    # .config("spark.executorEnv.PYSPARK_PYTHON", python_path)
    # .config("spark.executorEnv.PYSPARK_DRIVER_PYTHON", python_path)
    # .config("spark.yarn.appMasterEnv.PYSPARK_PYTHON", python_path)
    # .config("spark.yarn.appMasterEnv.PYSPARK_DRIVER_PYTHON", python_path)
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")

print(f"Spark version: {spark.version}")
print(f"PySpark version: {pyspark.__version__}")
print(f"Python version: {sys.version.split()[0]}")
print(f"Session timezone: {spark.conf.get('spark.sql.session.timeZone')}")
print(f"Shuffle partitions: {spark.conf.get('spark.sql.shuffle.partitions')}")


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/23 16:03:34 WARN Utils: Your hostname, PCPORTABLEAUR, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/11/23 16:03:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/23 16:03:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark version: 4.0.1
PySpark version: 4.0.1
Python version: 3.10.19
Session timezone: GMT+1
Shuffle partitions: 4


## 1. Dataset acquisition

In [3]:
# write some code here
# - ensure data/shakespeare.txt exists; if missing, download from the URL in the overview
# - create (a) an RDD of lines and (b) a DataFrame with column 'line'
# - show a few lines

from pathlib import Path
import urllib.request
import re

BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"
OUTPUTS_DIR = BASE_DIR / "outputs"
PROOF_DIR = BASE_DIR / "proof"
for directory in (DATA_DIR, OUTPUTS_DIR, PROOF_DIR):
    directory.mkdir(exist_ok=True)

SOURCE_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
TEXT_PATH = DATA_DIR / "shakespeare.txt"

if not TEXT_PATH.exists():
    print(f"Downloading Shakespeare text from {SOURCE_URL}...")
    urllib.request.urlretrieve(SOURCE_URL, TEXT_PATH)
    print(f"Downloaded to {TEXT_PATH}")

# Create RDD and DataFrame
lines_rdd = spark.sparkContext.textFile(str(TEXT_PATH)).cache()
lines_df = spark.read.text(str(TEXT_PATH)).withColumnRenamed("value", "line").cache()

num_lines = lines_df.count()
print(f"\n=== Dataset loaded: {num_lines} lines from {TEXT_PATH} ===")
lines_df.show(5, truncate=False)


                                                                                


=== Dataset loaded: 122458 lines from /home/aurel/bda_labs/bda_assignment02/data/shakespeare.txt ===
+----------------------+
|line                  |
+----------------------+
|1609                  |
|                      |
|THE SONNETS           |
|                      |
|by William Shakespeare|
+----------------------+
only showing top 5 rows


## 2. Tokenization helper

In [4]:
# write some code here
# - implement a tokenizer: lowercase, split on non-letters, drop empties
# - implement truncate(tokens, n=40) for PMI

from pyspark import StorageLevel

pattern = re.compile(r"[a-z]+")

def tokenize(text: str):
    """Lowercase, split on non-letters, drop empties"""
    return pattern.findall(text.lower())

def truncate(tokens, n=40):
    """Truncate tokens to first n tokens for PMI computation"""
    return tokens[:n]

# Cache tokenized lines for reuse
tokenized_lines_rdd = lines_rdd.map(tokenize).map(lambda toks: [t for t in toks if t])
tokenized_lines_rdd = tokenized_lines_rdd.persist(StorageLevel.MEMORY_ONLY)

print(f"\n=== Tokenization complete ===")
print("Sample tokenized line:", tokenized_lines_rdd.take(1))



=== Tokenization complete ===


[Stage 5:>                                                          (0 + 1) / 1]

Sample tokenized line: [[]]


                                                                                

## 3. Part A — Bigram relative frequency (pairs)

In [5]:
# write some code here
# - emit ((w_i, w_{i+1}), 1) and ((w_i, '*'), 1)
# - reduceByKey to counts; compute relative frequency
# - write outputs/bigram_pairs_top.csv (top N)
# - save explain('formatted') from a DF stage to proof/plan_bigrams.txt

from operator import add
from io import StringIO
from contextlib import redirect_stdout
from pyspark.sql import functions as F

# Bigram counts (pairs design) : Emit ((w_i, w_{i+1}), 1) and ((w_i, '*'), 1)
pair_counts = (
    tokenized_lines_rdd
    .flatMap(lambda tokens: [((tokens[i], tokens[i + 1]), 1) for i in range(len(tokens) - 1)])
    .reduceByKey(add)
)

marginal_counts = (
    tokenized_lines_rdd
    .flatMap(lambda tokens: [((tokens[i], '*'), 1) for i in range(len(tokens) - 1)])
    .reduceByKey(add)
    .map(lambda kv: (kv[0][0], kv[1]))
)

# Compute relative frequency
relative_freq_rdd = (
    pair_counts
    .map(lambda kv: (kv[0][0], (kv[0][1], kv[1])))
    .join(marginal_counts)
    .map(lambda kv: (kv[0], kv[1][0][0], kv[1][0][1] / kv[1][1], kv[1][0][1]))
)

bigram_pairs_df = spark.createDataFrame(relative_freq_rdd, schema=["w1", "w2", "rel_freq", "count"])

pairs_top_df = (
    bigram_pairs_df
    .orderBy(F.desc("rel_freq"), F.desc("count"), F.asc("w1"), F.asc("w2"))
    .limit(50)
)

print("Top bigrams (pairs approach):")
pairs_top_df.show(20, truncate=False)


(pairs_top_df
    .toPandas()
    .to_csv(OUTPUTS_DIR / "bigram_pairs_top.csv", index=False)
)
print(f"Saved to {OUTPUTS_DIR / 'bigram_pairs_top.csv'}")

# Save execution plan
plan_buffer = StringIO()
with redirect_stdout(plan_buffer):
    pairs_top_df.explain("formatted")
(PROOF_DIR / "plan_bigrams.txt").write_text(plan_buffer.getvalue())
print(f"Execution plan saved to {PROOF_DIR / 'plan_bigrams.txt'}")



                                                                                

Top bigrams (pairs approach):


                                                                                

+--------+--------+--------+-----+
|w1      |w2      |rel_freq|count|
+--------+--------+--------+-----+
|pleas   |d       |1.0     |69   |
|fac     |d       |1.0     |44   |
|deceiv  |d       |1.0     |41   |
|assur   |d       |1.0     |37   |
|dramatis|personae|1.0     |36   |
|chang   |d       |1.0     |34   |
|reveng  |d       |1.0     |34   |
|resolv  |d       |1.0     |33   |
|belov   |d       |1.0     |31   |
|prepar  |d       |1.0     |30   |
|abus    |d       |1.0     |29   |
|advis   |d       |1.0     |29   |
|amaz    |d       |1.0     |29   |
|cam     |st      |1.0     |29   |
|rul     |d       |1.0     |28   |
|forc    |d       |1.0     |27   |
|prov    |d       |1.0     |27   |
|asham   |d       |1.0     |26   |
|begg    |d       |1.0     |26   |
|charg   |d       |1.0     |26   |
+--------+--------+--------+-----+
only showing top 20 rows
Saved to /home/aurel/bda_labs/bda_assignment02/outputs/bigram_pairs_top.csv
Execution plan saved to /home/aurel/bda_labs/bda_assignment

                                                                                

## 4. Part A — Bigram relative frequency (stripes)

In [6]:
# write some code here
# - build stripes: w_i -> dict{ w_{i+1}: count }, merge with reduceByKey
# - normalize inside each stripe; write outputs/bigram_stripes_top.csv

print("\n=== Part A: Bigram Relative Frequency (Stripes) ===")

def emit_stripes(tokens):
    """Emit (w_i, {w_{i+1}: 1}) for each bigram"""
    result = []
    for i in range(len(tokens) - 1):
        result.append((tokens[i], {tokens[i + 1]: 1}))
    return result

def merge_dicts(dict1, dict2):
    """Merge two dictionaries by summing values"""
    result = dict1.copy()
    for key, value in dict2.items():
        result[key] = result.get(key, 0) + value
    return result

# Build stripes
stripes_rdd = (
    tokenized_lines_rdd
    .flatMap(emit_stripes)
    .reduceByKey(merge_dicts)
)

# Normalize each stripe to get relative frequencies
def normalize_stripe(kv):
    word, stripe = kv
    total = sum(stripe.values())
    normalized = {w: count / total for w, count in stripe.items()}
    return (word, normalized)

stripes_normalized = stripes_rdd.map(normalize_stripe)

# Convert to flat format for output
stripes_flat = (
    stripes_normalized
    .flatMap(lambda kv: [(kv[0], w2, freq) for w2, freq in kv[1].items()])
    .map(lambda x: (x[0], x[1], x[2]))
)

stripes_df = spark.createDataFrame(stripes_flat, schema=["w1", "w2", "rel_freq"])

stripes_top_df = (
    stripes_df
    .orderBy(F.desc("rel_freq"), F.asc("w1"), F.asc("w2"))
    .limit(50)
)

print("Top bigrams (stripes approach):")
stripes_top_df.show(20, truncate=False)

stripes_top_df.toPandas().to_csv(OUTPUTS_DIR / "bigram_stripes_top.csv", index=False)
print(f"Saved to {OUTPUTS_DIR / 'bigram_stripes_top.csv'}")




=== Part A: Bigram Relative Frequency (Stripes) ===


                                                                                

Top bigrams (stripes approach):


                                                                                

+------------+----------+--------+
|w1          |w2        |rel_freq|
+------------+----------+--------+
|abaissiez   |votre     |1.0     |
|abash       |d         |1.0     |
|abatements  |and       |1.0     |
|abates      |the       |1.0     |
|abbeys      |and       |1.0     |
|abbominable |it        |1.0     |
|abbots      |imprisoned|1.0     |
|abbreviated |ne        |1.0     |
|aberga      |ny        |1.0     |
|abet        |him       |1.0     |
|abetting    |him       |1.0     |
|abhominable |which     |1.0     |
|abjects     |and       |1.0     |
|abjur       |d         |1.0     |
|abler       |than      |1.0     |
|abodements  |must      |1.0     |
|aboding     |luckless  |1.0     |
|abominations|turns     |1.0     |
|abortives   |presages  |1.0     |
|abounding   |valour    |1.0     |
+------------+----------+--------+
only showing top 20 rows
Saved to /home/aurel/bda_labs/bda_assignment02/outputs/bigram_stripes_top.csv


                                                                                

## 5. Part B — PMI with threshold K

In [7]:
# write some code here
# - keep only first 40 tokens per line
# - compute counts for x and (x,y); PMI = log10( P(x,y) / (P(x)*P(y)) )
# - --threshold K to filter low-frequency pairs
# - write outputs/pmi_pairs_sample.csv (or stripes version)
# - save proof/plan_pmi.txt if any DF stages are used

# ============================================================================
# 5. PART B - PMI WITH THRESHOLD K
# ============================================================================
import math

print("\n=== Part B: PMI with Threshold ===")

K = 10  # Threshold for minimum frequency

# Truncate to first 40 tokens per line
truncated_lines_rdd = tokenized_lines_rdd.map(lambda tokens: truncate(tokens, 40))

# Count total pairs (for normalization)
total_pairs = truncated_lines_rdd.map(lambda tokens: max(0, len(tokens) - 1)).reduce(add)

# Count individual words
word_counts = (
    truncated_lines_rdd
    .flatMap(lambda tokens: [(w, 1) for w in tokens])
    .reduceByKey(add)
)

# Count bigrams (only pairs, not all co-occurrences)
bigram_counts = (
    truncated_lines_rdd
    .flatMap(lambda tokens: [((tokens[i], tokens[i + 1]), 1) for i in range(len(tokens) - 1)])
    .reduceByKey(add)
    .filter(lambda kv: kv[1] >= K)  # Apply threshold
)

# Calculate PMI
# PMI(x,y) = log10(P(x,y) / (P(x) * P(y)))
word_probs = word_counts.map(lambda kv: (kv[0], kv[1] / total_pairs)).collectAsMap()
word_probs_bc = spark.sparkContext.broadcast(word_probs)

def calculate_pmi(kv):
    (w1, w2), count = kv
    p_xy = count / total_pairs
    p_x = word_probs_bc.value.get(w1, 0)
    p_y = word_probs_bc.value.get(w2, 0)
    
    if p_x > 0 and p_y > 0:
        pmi = math.log10(p_xy / (p_x * p_y))
        return (w1, w2, pmi, count)
    return None

pmi_rdd = (
    bigram_counts
    .map(calculate_pmi)
    .filter(lambda x: x is not None)
)

pmi_df = spark.createDataFrame(pmi_rdd, schema=["w1", "w2", "pmi", "count"])

pmi_sample_df = (
    pmi_df
    .orderBy(F.desc("pmi"), F.desc("count"))
    .limit(100)
)

print(f"PMI results (threshold K={K}):")
pmi_sample_df.show(20, truncate=False)

pmi_sample_df.toPandas().to_csv(OUTPUTS_DIR / "pmi_pairs_sample.csv", index=False)
print(f"Saved to {OUTPUTS_DIR / 'pmi_pairs_sample.csv'}")

# Save PMI execution plan
plan_buffer_pmi = StringIO()
with redirect_stdout(plan_buffer_pmi):
    pmi_df.explain("formatted")
(PROOF_DIR / "plan_pmi.txt").write_text(plan_buffer_pmi.getvalue())
print(f"PMI execution plan saved to {PROOF_DIR / 'plan_pmi.txt'}")



=== Part B: PMI with Threshold ===


                                                                                

PMI results (threshold K=10):
+--------+-----------+------------------+-----+
|w1      |w2         |pmi               |count|
+--------+-----------+------------------+-----+
|l       |envoy      |4.540248931571097 |16   |
|milford |haven      |4.434166930415811 |11   |
|dramatis|personae   |4.345674266821402 |36   |
|boist   |rous       |4.299916776260727 |10   |
|desp    |rate       |4.258524091102502 |10   |
|le      |beau       |4.123825517205046 |22   |
|willow  |willow     |4.106096750244614 |10   |
|wat     |ry         |4.091640833833645 |13   |
|threat  |ning       |3.9409746745020677|17   |
|murd    |rous       |3.878911463519996 |11   |
|william |shakespeare|3.8085550824264542|38   |
|saint   |albans     |3.7913870572894406|19   |
|ta      |en         |3.7301118050897926|95   |
|pow     |rs         |3.698422641015296 |24   |
|stol    |n          |3.6932804045267384|59   |
|pow     |r          |3.643789523478166 |33   |
|alarum  |excursions |3.6352748007046016|10   |
|flow    |

## 6. Part C — Inverted index build

In [8]:
# write some code here
# - assign doc_id = line_number // 10
# - compute term frequencies ((term, doc_id), tf)
# - aggregate into postings per term; compute df
# - write Parquet to outputs/index_parquet/

print("\n=== Part C: Inverted Index Build ===")

# Assign doc_id = line_number // 10
lines_with_id_rdd = lines_rdd.zipWithIndex().map(lambda x: (x[1] // 10, x[0]))

# Tokenize and compute term frequencies per document
doc_term_freq = (
    lines_with_id_rdd
    .flatMap(lambda x: [((term, x[0]), 1) for term in tokenize(x[1])])
    .reduceByKey(add)
)

# Build postings: term -> list of (doc_id, tf)
postings = (
    doc_term_freq
    .map(lambda kv: (kv[0][0], (kv[0][1], kv[1])))
    .groupByKey()
    .mapValues(list)
)

# Compute document frequency (df) for each term
def compute_df(kv):
    term, posting_list = kv
    df = len(posting_list)
    return (term, posting_list, df)

index_rdd = postings.map(compute_df)

# Convert to DataFrame
index_df = spark.createDataFrame(
    index_rdd.map(lambda x: (x[0], str(x[1]), x[2])),
    schema=["term", "postings", "df"]
)

print("Sample of inverted index:")
index_df.show(10, truncate=False)

# Save as Parquet
index_parquet_path = OUTPUTS_DIR / "index_parquet"
index_df.write.mode("overwrite").parquet(str(index_parquet_path))
print(f"Inverted index saved to {index_parquet_path}")





=== Part C: Inverted Index Build ===


                                                                                

Sample of inverted index:
+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[Stage 42:>                                                         (0 + 2) / 2]

Inverted index saved to /home/aurel/bda_labs/bda_assignment02/outputs/index_parquet


                                                                                

## 7. Part C — Boolean retrieval (AND / OR)

In [9]:
# write some code here
# - implement evaluate_and(terms) and evaluate_or(terms) using postings
# - ranking: sum(tf) or df-normalized tf
# - run 3–5 queries; write outputs/queries_and_results.md

print("\n=== Part C: Boolean Retrieval ===")

# Load index
index_loaded = spark.read.parquet(str(index_parquet_path))
index_dict = index_loaded.rdd.map(lambda row: (row.term, eval(row.postings))).collectAsMap()

def evaluate_and(terms):
    """Retrieve documents containing ALL terms"""
    if not terms:
        return []
    
    # Get postings for each term
    postings_lists = []
    for term in terms:
        if term not in index_dict:
            return []  # If any term is missing, no results
        postings_lists.append(set(doc_id for doc_id, _ in index_dict[term]))
    
    # Intersection of all posting lists
    result_docs = postings_lists[0]
    for postings in postings_lists[1:]:
        result_docs = result_docs.intersection(postings)
    
    # Compute scores (sum of tf)
    scored_docs = []
    for doc_id in result_docs:
        score = sum(
            tf for term in terms 
            for d, tf in index_dict[term] 
            if d == doc_id
        )
        scored_docs.append((doc_id, score))
    
    return sorted(scored_docs, key=lambda x: -x[1])

def evaluate_or(terms):
    """Retrieve documents containing ANY term"""
    if not terms:
        return []
    
    doc_scores = {}
    for term in terms:
        if term in index_dict:
            for doc_id, tf in index_dict[term]:
                doc_scores[doc_id] = doc_scores.get(doc_id, 0) + tf
    
    return sorted(doc_scores.items(), key=lambda x: -x[1])

# Run sample queries
queries = [
    (["love", "death"], "AND"),
    (["king", "queen"], "OR"),
    (["thou", "thy"], "AND"),
    (["war", "peace"], "OR"),
    (["romeo", "juliet"], "AND")
]

results_md = "# Boolean Retrieval Results\n\n"

for terms, op in queries:
    print(f"\nQuery: {' {op} '.join(terms)}")
    
    if op == "AND":
        results = evaluate_and(terms)
    else:
        results = evaluate_or(terms)
    
    results_md += f"## Query: {' {op} '.join(terms)}\n\n"
    results_md += f"Total documents found: {len(results)}\n\n"
    results_md += "Top 10 results:\n\n"
    results_md += "| Rank | Doc ID | Score |\n"
    results_md += "|------|--------|-------|\n"
    
    for rank, (doc_id, score) in enumerate(results[:10], 1):
        print(f"  {rank}. Doc {doc_id}: score = {score}")
        results_md += f"| {rank} | {doc_id} | {score} |\n"
    
    results_md += "\n"

# Save query results
(OUTPUTS_DIR / "queries_and_results.md").write_text(results_md)
print(f"\nQuery results saved to {OUTPUTS_DIR / 'queries_and_results.md'}")




=== Part C: Boolean Retrieval ===


                                                                                


Query: love {op} death
  1. Doc 4709: score = 7
  2. Doc 7882: score = 5
  3. Doc 9533: score = 5
  4. Doc 9761: score = 4
  5. Doc 5728: score = 4
  6. Doc 121: score = 4
  7. Doc 124: score = 4
  8. Doc 4384: score = 4
  9. Doc 9586: score = 4
  10. Doc 9726: score = 4

Query: king {op} queen
  1. Doc 9338: score = 11
  2. Doc 9329: score = 9
  3. Doc 9339: score = 8
  4. Doc 4698: score = 7
  5. Doc 5334: score = 7
  6. Doc 5035: score = 7
  7. Doc 3830: score = 6
  8. Doc 4234: score = 6
  9. Doc 4566: score = 6
  10. Doc 4724: score = 6

Query: thou {op} thy
  1. Doc 10564: score = 21
  2. Doc 9647: score = 14
  3. Doc 6970: score = 14
  4. Doc 10017: score = 12
  5. Doc 10624: score = 11
  6. Doc 10784: score = 11
  7. Doc 2917: score = 11
  8. Doc 6226: score = 11
  9. Doc 130: score = 10
  10. Doc 8877: score = 10

Query: war {op} peace
  1. Doc 1819: score = 7
  2. Doc 5371: score = 7
  3. Doc 3326: score = 5
  4. Doc 11438: score = 5
  5. Doc 3977: score = 4
  6. Doc 10629: 

## 8. Part D — Performance study

In [10]:
# write some code here
# - vary spark.sql.shuffle.partitions and compare runtime and UI metrics
# - discuss pairs vs stripes trade-offs; include one explain('formatted') text in proof/

print("\n=== Part D: Performance Study ===")

performance_md = "# Performance Study\n\n"
performance_md += "## Shuffle Partitions Experiment\n\n"

# Test different partition counts
partition_counts = [8, 50, 200]

for num_partitions in partition_counts:
    print(f"\nTesting with {num_partitions} shuffle partitions...")
    spark.conf.set("spark.sql.shuffle.partitions", num_partitions)
    
    import time
    start = time.time()
    
    # Re-run bigram computation
    test_df = (
        bigram_pairs_df
        .groupBy("w1")
        .agg(F.sum("count").alias("total_count"))
        .orderBy(F.desc("total_count"))
        .limit(20)
    )
    test_df.count()  # Force execution
    
    elapsed = time.time() - start
    
    performance_md += f"### Partitions: {num_partitions}\n"
    performance_md += f"- Execution time: {elapsed:.2f}s\n\n"
    print(f"  Execution time: {elapsed:.2f}s")

performance_md += "\n## Pairs vs Stripes Trade-offs\n\n"
performance_md += "### Pairs Approach\n"
performance_md += "- **Pros**: Simple implementation, easy to understand\n"
performance_md += "- **Cons**: More shuffling overhead (two passes needed)\n\n"
performance_md += "### Stripes Approach\n"
performance_md += "- **Pros**: Reduced shuffling (one pass), better memory locality\n"
performance_md += "- **Cons**: More complex reducer logic, potential memory issues with large vocabularies\n\n"

# Save formatted plan for performance analysis
plan_buffer_perf = StringIO()
with redirect_stdout(plan_buffer_perf):
    bigram_pairs_df.explain("formatted")
performance_md += "\n## Execution Plan (Pairs Approach)\n\n```\n"
performance_md += plan_buffer_perf.getvalue()
performance_md += "\n```\n"

(PROOF_DIR / "performance_study.md").write_text(performance_md)
print(f"\nPerformance study saved to {PROOF_DIR / 'performance_study.md'}")




=== Part D: Performance Study ===

Testing with 8 shuffle partitions...


                                                                                

  Execution time: 1.66s

Testing with 50 shuffle partitions...


                                                                                

  Execution time: 0.82s

Testing with 200 shuffle partitions...


                                                                                

  Execution time: 0.98s

Performance study saved to /home/aurel/bda_labs/bda_assignment02/proof/performance_study.md


## 9. Spark UI evidence
Open http://localhost:4040 during runs. Capture Files Read, Input Size, Shuffle Read/Write and save screenshots under `proof/`.

## 10. Environment and reproducibility

In [11]:
# write some code here
# - print Java version, Spark conf of interest, and OS info
# - write ENV.md with versions + key configs


print("\n=== Environment Information ===")

import platform
import subprocess

env_md = "# Environment Configuration\n\n"

# Java version
try:
    java_version = subprocess.check_output(['java', '-version'], stderr=subprocess.STDOUT).decode()
    env_md += "## Java Version\n```\n" + java_version + "\n```\n\n"
    print("Java version:", java_version.split('\n')[0])
except:
    env_md += "## Java Version\nNot available\n\n"

# Spark configuration
env_md += "## Spark Configuration\n\n"
spark_conf = spark.sparkContext.getConf().getAll()
for key, value in sorted(spark_conf):
    env_md += f"- `{key}`: {value}\n"
    print(f"  {key}: {value}")

# OS info
env_md += "\n## System Information\n\n"
env_md += f"- **OS**: {platform.system()} {platform.release()}\n"
env_md += f"- **Python**: {platform.python_version()}\n"
env_md += f"- **Architecture**: {platform.machine()}\n"

print(f"\nOS: {platform.system()} {platform.release()}")
print(f"Python: {platform.python_version()}")

(BASE_DIR / "ENV.md").write_text(env_md)
print(f"\nEnvironment info saved to {BASE_DIR / 'ENV.md'}")

print("\n" + "="*70)
print("Vérifier les dossiers : outputs/, proof/, and ENV.md pour les résultats.")
print("="*70)




=== Environment Information ===
Java version: openjdk version "21.0.6-internal" 2025-01-21
  spark.app.id: local-1763910217319
  spark.app.name: BDA-02
  spark.app.startTime: 1763910215811
  spark.app.submitTime: 1763910214991
  spark.driver.extraJavaOptions: -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-modules=jdk.incubator.vector --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --a