In [0]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (an already-running one is reused).
spark = (
    SparkSession.builder
    .appName("DataFrame Partitions Example")
    .getOrCreate()
)

# Sample values 1..12
data = list(range(1, 13))

# Wrap each value in a 1-tuple so it becomes a row of the single "value" column.
df = spark.createDataFrame([(value,) for value in data], ["value"])

# Redistribute the rows across exactly 5 partitions.
df_repartitioned = df.repartition(5)

# Report how many partitions the repartitioned DataFrame has.
num_partitions = df_repartitioned.rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")


Number of partitions: 5


In [0]:
# Manual data input: 25 hand-typed records, one pipe-delimited string per row.
# Every record has 7 '|'-separated fields; the second is an ISO-style date
# (YYYY-MM-DD) and the last is a TRUE/FALSE flag. The strings are kept raw
# (not split into columns) because the cell below only counts and displays them.
#
# NOTE(review): the first field is not unique -- 1837, 1838, 1839, 1842 and
# 1851 each appear twice -- and the dates 2008-01-14 and 2008-01-24 are
# repeated. The first row also uses "WEB" where every other row has a
# two-letter code. These may be typos in the source data; confirm before
# treating the first field as a key.
data = [
    "1827|2008-01-01|WEB|JUAN|NH|2008|TRUE",
    "1828|2008-01-02|TH|JUAN|NH|2008|FALSE",
    "1829|2008-01-03|FR|JUAN|NH|2008|FALSE",
    "1830|2008-01-04|SA|JUAN|NH|2008|FALSE",
    "1831|2008-01-05|SU|JUAN|NH|2008|FALSE",
    "1832|2008-01-06|MO|JUAN|NH|2008|FALSE",
    "1833|2008-01-07|TU|JUAN|NH|2008|FALSE",
    "1837|2008-01-08|WE|JUAN|NH|2008|FALSE",
    "1838|2008-01-09|TH|JUAN|NH|2008|FALSE",
    "1839|2008-01-10|FR|JUAN|NH|2008|FALSE",
    "1837|2008-01-11|SA|JUAN|NH|2008|FALSE",
    "1838|2008-01-12|SU|JUAN|NH|2008|FALSE",
    "1839|2008-01-13|MO|JUAN|NH|2008|FALSE",
    "1840|2008-01-14|TU|JUAN|NH|2008|FALSE",
    "1841|2008-01-14|WE|JUAN|NH|2008|FALSE",
    "1842|2008-01-16|TH|JUAN|NH|2008|FALSE",
    "1842|2008-01-17|FR|JUAN|NH|2008|FALSE",
    "1843|2008-01-18|SA|JUAN|NH|2008|FALSE",
    "1845|2008-01-19|SU|JUAN|NH|2008|FALSE",
    "1846|2008-01-20|MO|JUAN|NH|2008|FALSE",
    "1847|2008-01-21|TU|JUAN|NH|2008|FALSE",
    "1848|2008-01-24|WE|JUAN|NH|2008|FALSE",
    "1849|2008-01-24|TH|JUAN|NH|2008|FALSE",
    "1851|2008-01-25|FR|JUAN|NH|2008|FALSE",
    "1851|2008-01-26|SA|JUAN|NH|2008|FALSE"
]

# Build a single-column DataFrame: each raw record string becomes one row.
df = spark.createDataFrame(
    [(record,) for record in data],
    schema=["value"],
)

# Drop down to the underlying RDD to count records and inspect partitioning.
rdd = df.rdd
record_count = rdd.count()
print(f"Total number of records: {record_count}")

# Render the DataFrame as a table.
display(df)

# Report how many partitions back the RDD.
partition_count = rdd.getNumPartitions()
print(f"\nNumber of partitions: {partition_count}")

Total number of records: 25


value
1827|2008-01-01|WEB|JUAN|NH|2008|TRUE
1828|2008-01-02|TH|JUAN|NH|2008|FALSE
1829|2008-01-03|FR|JUAN|NH|2008|FALSE
1830|2008-01-04|SA|JUAN|NH|2008|FALSE
1831|2008-01-05|SU|JUAN|NH|2008|FALSE
1832|2008-01-06|MO|JUAN|NH|2008|FALSE
1833|2008-01-07|TU|JUAN|NH|2008|FALSE
1837|2008-01-08|WE|JUAN|NH|2008|FALSE
1838|2008-01-09|TH|JUAN|NH|2008|FALSE
1839|2008-01-10|FR|JUAN|NH|2008|FALSE



Number of partitions: 4


In [0]:
# Input paragraph
text = """Python Lists allow us to hold items of heterogeneous types. In this article, we will learn how to create a list in Python; access the list items; find the number of items in the list, how to add an item to list; how to remove an item from the list; loop through list items; sorting a list, reversing a list, and many more transformation and aggregation actions on Python Lists."""

# Create a single-element RDD holding the whole paragraph
text_rdd = spark.sparkContext.parallelize([text])

# Tokenise on whitespace, keep the original case, and trim leading/trailing
# punctuation from every token. A token made up solely of punctuation
# (e.g. a standalone ';') collapses to '' after stripping, so empty tokens
# are filtered out instead of being counted as a word.
words_rdd = (
    text_rdd
    .flatMap(lambda line: line.split())
    .map(lambda word: word.strip('.,;:!?()[]{}'))
    .filter(lambda word: word)
)

# Classic word count: emit (word, 1) pairs and sum the ones per word.
# Counting is case-sensitive, so 'In' and 'in' are separate keys.
word_counts = (
    words_rdd
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)

# Bring the (word, count) pairs to the driver and order them by word.
# Python's default string ordering puts uppercase letters before lowercase.
results = word_counts.collect()
results_sorted = sorted(results, key=lambda x: x[0])

# Print each (word, count) tuple on its own line
for item in results_sorted:
    print(item)

('In', 1)
('Lists', 2)
('Python', 3)
('a', 3)
('access', 1)
('actions', 1)
('add', 1)
('aggregation', 1)
('allow', 1)
('an', 2)
('and', 2)
('article', 1)
('create', 1)
('find', 1)
('from', 1)
('heterogeneous', 1)
('hold', 1)
('how', 3)
('in', 2)
('item', 2)
('items', 4)
('learn', 1)
('list', 8)
('loop', 1)
('many', 1)
('more', 1)
('number', 1)
('of', 2)
('on', 1)
('remove', 1)
('reversing', 1)
('sorting', 1)
('the', 4)
('this', 1)
('through', 1)
('to', 5)
('transformation', 1)
('types', 1)
('us', 1)
('we', 1)
('will', 1)


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Sample users: (firstname, middlename, lastname, id, gender, salary)
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1)
]

# Flat schema: every column is a nullable string except the integer salary.
schema = (
    StructType()
    .add("firstname", StringType(), True)
    .add("middlename", StringType(), True)
    .add("lastname", StringType(), True)
    .add("id", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", IntegerType(), True)
)

# Materialise the rows as a DataFrame with the explicit schema.
df = spark.createDataFrame(data, schema)

# a) Expose the DataFrame to Spark SQL under the view name 'Users'.
df.createOrReplaceTempView("Users")

# b) DataFrame API: keep only users whose salary exceeds 3000.
df_filtered = df.where(df["salary"] > 3000)
display(df_filtered)

# b) SQL: the same salary filter expressed as a query against the view.
result_sql = spark.sql("SELECT * FROM Users WHERE salary > 3000")
display(result_sql)

firstname,middlename,lastname,id,gender,salary
Michael,Rose,,40288,M,4000
Robert,,Williams,42114,M,4000
Maria,Anne,Jones,39192,F,4000


firstname,middlename,lastname,id,gender,salary
Michael,Rose,,40288,M,4000
Robert,,Williams,42114,M,4000
Maria,Anne,Jones,39192,F,4000


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Sample users with a nested name: ((firstname, middlename, lastname), id, gender, salary)
structureData = [
    (("James", "", "Smith"), "36636", "M", 3100),
    (("Michael", "Rose", ""), "40288", "M", 4300),
    (("Robert", "", "Williams"), "42114", "M", 1400),
    (("Maria", "Anne", "Jones"), "39192", "F", 5500),
    (("Jen", "Mary", "Brown"), "", "F", -1)
]

# Schema with 'name' as a nested struct of three nullable string fields,
# followed by the flat id / gender / salary columns.
structureSchema = (
    StructType()
    .add(
        'name',
        StructType()
        .add('firstname', StringType(), True)
        .add('middlename', StringType(), True)
        .add('lastname', StringType(), True),
    )
    .add('id', StringType(), True)
    .add('gender', StringType(), True)
    .add('salary', IntegerType(), True)
)

# Materialise the nested rows as a DataFrame with the explicit schema.
df = spark.createDataFrame(structureData, structureSchema)

# c) Expose the DataFrame to Spark SQL under the view name 'Users'
#    (this replaces any existing temp view with the same name).
df.createOrReplaceTempView("Users")

# d) SQL: return 'firstname' for rows whose nested 'lastname' is 'Rose'.
result_sql = spark.sql("SELECT name.firstname FROM Users WHERE name.lastname = 'Rose'")
display(result_sql)

# The result is empty: in this data 'Rose' only appears as a middlename
# (Michael's), never as a lastname, so no row matches the predicate.

firstname
