In [2]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 310.8/310.8 MB 3.1 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285398 sha256=59d4a23d96bd198189a6b6bdd68e05fabfbc57e855675b8678f9bf566ef2bdb2
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


1. Working with RDDs:
   a) Write a Python program to create an RDD from a local data source.




```
from pyspark import SparkContext

# Start a Spark driver with master "local" and the application name
# "RDD Example".
sc = SparkContext(master="local", appName="RDD Example")

# Build an RDD from a text file on the local filesystem.
source_path = "path/to/local/file.txt"
rdd = sc.textFile(source_path)

# count() is an action: it runs the job and returns the number of
# elements (lines) in the RDD.
line_count = rdd.count()
print("Number of lines in the file:", line_count)

# Release the context once the work is done.
sc.stop()
```



   b) Implement transformations and actions on the RDD to perform data processing tasks.



```
from pyspark import SparkContext

# Create a SparkContext
sc = SparkContext("local", "RDD Example")

# try/finally guarantees the context is stopped even when an action fails
# (e.g. the input file is missing); otherwise the next SparkContext(...)
# call in the same process would find a context still running.
try:
    # Create an RDD from a local data source
    rdd = sc.textFile("path/to/local/file.txt")

    # --- Transformations (lazy: nothing runs until an action is called) ---

    # 1. filter: keep lines containing the word "error" (case-insensitive)
    error_lines = rdd.filter(lambda line: "error" in line.lower())

    # 2. flatMap: extract the words of every line. split() without an
    #    argument splits on any whitespace run and never yields empty
    #    strings, so blank lines and double spaces do not become "words".
    words = rdd.flatMap(lambda line: line.split())

    # 3. map + reduceByKey: count the occurrences of each word
    word_counts = words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

    # --- Actions (these trigger the actual computation) ---

    # 4. count: evaluate the filter defined in step 1
    print("Lines containing 'error':", error_lines.count())

    # 5. take: print the first 10 word counts. take() makes no ordering
    #    guarantee, so these are not necessarily the most frequent words.
    first_word_counts = word_counts.take(10)
    for word, count in first_word_counts:
        print(word, count)
finally:
    # Stop the SparkContext
    sc.stop()
```






   c) Analyze and manipulate data using RDD operations such as map, filter, reduce, or aggregate.





```
from pyspark import SparkContext

# Create a SparkContext
sc = SparkContext("local", "RDD Example")

# try/finally guarantees the context is stopped even if an action fails.
try:
    # Create an RDD from a local data source
    rdd = sc.textFile("path/to/local/file.txt")

    # Map operation: extract the length of each line
    line_lengths = rdd.map(len)

    # Filter operation: keep lines with length greater than 50
    long_lines = rdd.filter(lambda line: len(line) > 50)

    # Aggregate operation: compute the total length and the number of lines
    # in a single pass. Unlike reduce(), aggregate() starts from a zero
    # value, so it also works on an empty RDD instead of raising.
    total_length, total_count = line_lengths.aggregate(
        (0, 0),
        lambda acc, length: (acc[0] + length, acc[1] + 1),
        lambda left, right: (left[0] + right[0], left[1] + right[1]),
    )

    # Print the analysis results. An empty file has no average; guard the
    # division instead of failing with ZeroDivisionError.
    if total_count:
        average_length = total_length / total_count
        print("Average line length:", average_length)
    else:
        print("Average line length: n/a (the file is empty)")

    print("Long lines:")
    for line in long_lines.collect():
        print(line)
finally:
    # Stop the SparkContext
    sc.stop()
```



2. Spark DataFrame Operations:
   a) Write a Python program to load a CSV file into a Spark DataFrame.




```
from pyspark.sql import SparkSession

# Reuse the active SparkSession if there is one, otherwise build a new one
# named "DataFrame Example".
session_builder = SparkSession.builder.appName("DataFrame Example")
spark = session_builder.getOrCreate()

# Configure the CSV reader: the first row holds the column names, and the
# column types are inferred from the data.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv("path/to/csv/file.csv")

# Print the DataFrame's rows to stdout.
df.show()

# Release the session.
spark.stop()
```



   b) Perform common DataFrame operations such as filtering, grouping, or joining.



```
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("DataFrame Operations").getOrCreate()

# Load a CSV file into a DataFrame
df = spark.read.csv("path/to/csv/file.csv", header=True, inferSchema=True)

# Perform DataFrame operations

# Filtering: Keep rows where the age is greater than 30
filtered_df = df.filter(df.age > 30)

# Grouping: Calculate the average salary for each department
grouped_df = df.groupBy("department").avg("salary")

# Joining: Perform an inner join with another DataFrame.
# Joining on the column *name* (rather than `df.id == other_df.id`) keeps a
# single `id` column in the result; the expression form keeps both copies,
# which makes any later reference to `id` ambiguous.
other_df = spark.read.csv("path/to/other/csv/file.csv", header=True, inferSchema=True)
joined_df = df.join(other_df, on="id", how="inner")

# Show the results
print("Filtered DataFrame:")
filtered_df.show()

print("Grouped DataFrame:")
grouped_df.show()

print("Joined DataFrame:")
joined_df.show()

# Stop the SparkSession
spark.stop()
```



   c) Apply Spark SQL queries on the DataFrame to extract insights from the data.



```
from pyspark.sql import SparkSession

# Obtain (or build) the SparkSession for this example.
spark = SparkSession.builder.appName("Spark SQL Example").getOrCreate()

# Load the CSV with a header row and inferred column types.
df = spark.read.csv("path/to/csv/file.csv", header=True, inferSchema=True)

# Expose the DataFrame to SQL under the view name "employees".
df.createOrReplaceTempView("employees")

# Each entry pairs the heading printed above a result with the SQL that
# produces it.
labelled_queries = [
    # Example 1: retrieve all records
    ("Query 1 - All records:", "SELECT * FROM employees"),
    # Example 2: calculate the average salary
    ("Query 2 - Average salary:", "SELECT AVG(salary) AS avg_salary FROM employees"),
    # Example 3: filter and sort the records
    (
        "Query 3 - Filtered and sorted records:",
        "SELECT name, age, department FROM employees WHERE age > 30 ORDER BY age DESC",
    ),
]

# Build every result DataFrame first, then display them in order, so all
# queries are submitted before any output is printed.
labelled_results = [(heading, spark.sql(sql_text)) for heading, sql_text in labelled_queries]

for heading, result in labelled_results:
    print(heading)
    result.show()

# Release the session.
spark.stop()
```



3. Spark Streaming:
  a) Write a Python program to create a Spark Streaming application.




```
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Local context with two worker threads, named "Spark Streaming Example".
sc = SparkContext(master="local[2]", appName="Spark Streaming Example")

# Streaming context that groups incoming data into 1-second batches.
ssc = StreamingContext(sc, batchDuration=1)

# DStream of text lines read from a TCP socket on localhost:9999.
lines = ssc.socketTextStream(hostname="localhost", port=9999)

# Word count, built one step at a time:
#   lines -> individual words -> (word, 1) pairs -> summed count per word
words = lines.flatMap(lambda text: text.split(" "))
word_pairs = words.map(lambda token: (token, 1))
word_counts = word_pairs.reduceByKey(lambda left, right: left + right)

# Output operation: print the word counts of each batch.
word_counts.pprint()

# Start processing, then block until the computation terminates.
ssc.start()
ssc.awaitTermination()

# Shut down the underlying SparkContext.
sc.stop()
```



   b) Configure the application to consume data from a streaming source (e.g., Kafka or a socket).



```
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split

# NOTE: the DStream-based `pyspark.streaming.kafka.KafkaUtils` API is not
# part of PySpark 3.x (the version installed above is 3.4.1), so importing it
# fails. Kafka is consumed through the Structured Streaming "kafka" source
# instead, which needs the spark-sql-kafka connector on the classpath.
# Adjust the artifact's Scala/Spark versions to match your installation.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Spark Streaming Example")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1",
    )
    .getOrCreate()
)

# Configure the application to consume data from a streaming source

# Option 1: Consume data from a Kafka topic.
# Structured Streaming manages its own consumer group, so no group id is set.
kafka_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "my-topic")
    .load()
)
# Kafka records arrive as binary key/value columns; decode the value as text.
lines = kafka_stream.select(col("value").cast("string").alias("value"))

# Option 2: Consume data from a socket (swap in for `lines` above).
# lines = (
#     spark.readStream
#     .format("socket")
#     .option("host", "localhost")
#     .option("port", 9999)
#     .load()
# )

# Perform operations on the stream

# Example: Word count. Unlike a DStream's per-batch counts, this is a
# running total over everything received so far.
words = lines.select(explode(split(col("value"), " ")).alias("word"))
word_counts = words.groupBy("word").count()

# Print the word counts to the console about once per second.
query = (
    word_counts.writeStream
    .outputMode("complete")
    .format("console")
    .trigger(processingTime="1 second")
    .start()
)

try:
    # Wait for the streaming computation to finish
    query.awaitTermination()
finally:
    # Stop the SparkSession even if waiting is interrupted
    spark.stop()
```



   c) Implement streaming transformations and actions to process and analyze the incoming data stream.



```
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a SparkContext
sc = SparkContext("local[2]", "Spark Streaming Example")

# Create a StreamingContext with a batch interval of 1 second
ssc = StreamingContext(sc, 1)

# Configure the application to consume data from a streaming source
socket_stream = ssc.socketTextStream("localhost", 9999)

# Implement streaming transformations and actions

# Example: Word count
word_counts = socket_stream.flatMap(lambda line: line.split(" ")) \
                          .map(lambda word: (word, 1)) \
                          .reduceByKey(lambda a, b: a + b)

# Print the word counts
word_counts.pprint()

# Example: Compute average length of lines (per batch).
# DStream.reduce() and DStream.count() return DStreams, not numbers, so the
# two cannot be divided on the driver. Instead, carry (total_length, count)
# through a single reduce and divide inside the stream. A batch with no lines
# produces no element, so there is no division by zero.
length_and_count = socket_stream.map(lambda line: (len(line), 1)) \
                                .reduce(lambda a, b: (a[0] + b[0], a[1] + b[1]))
average_length = length_and_count.map(
    lambda totals: "Average line length: {}".format(totals[0] / totals[1])
)

# Print the average line length of each batch (an output operation, so it
# runs once the stream is started rather than once at definition time)
average_length.pprint()

# Start the streaming computation
ssc.start()

try:
    # Wait for the streaming computation to finish
    ssc.awaitTermination()
finally:
    # Stop the streaming context and the underlying SparkContext, even if
    # waiting is interrupted
    ssc.stop(stopSparkContext=True, stopGraceFully=False)
```



4. Spark SQL and Data Source Integration:
   a) Write a Python program to connect Spark with a relational database (e.g., MySQL, PostgreSQL).




```
from pyspark.sql import SparkSession

# Obtain (or build) the SparkSession for this example.
spark = SparkSession.builder.appName("Spark SQL Example").getOrCreate()

# Connection settings for the relational database.

# MySQL: JDBC URL plus the driver class and login passed to the driver.
url = "jdbc:mysql://localhost:3306/mydatabase"
properties = dict(
    driver="com.mysql.jdbc.Driver",
    user="username",
    password="password",
)

# PostgreSQL alternative:
# url = "jdbc:postgresql://localhost:5432/mydatabase"
# properties = dict(
#     driver="org.postgresql.Driver",
#     user="username",
#     password="password",
# )

# Read the whole table over JDBC into a DataFrame.
table_name = "my_table"
jdbc_reader = spark.read
df = jdbc_reader.jdbc(url, table_name, properties=properties)

# Print the DataFrame's rows to stdout.
df.show()

# Release the session.
spark.stop()
```



   b) Perform SQL operations on the data stored in the database using Spark SQL.



```
from pyspark.sql import SparkSession

# Obtain (or build) the SparkSession for this example.
spark = SparkSession.builder.appName("Spark SQL Example").getOrCreate()

# Connection settings for the relational database.

# MySQL: JDBC URL plus the driver class and login passed to the driver.
url = "jdbc:mysql://localhost:3306/mydatabase"
properties = dict(
    driver="com.mysql.jdbc.Driver",
    user="username",
    password="password",
)

# PostgreSQL alternative:
# url = "jdbc:postgresql://localhost:5432/mydatabase"
# properties = dict(
#     driver="org.postgresql.Driver",
#     user="username",
#     password="password",
# )

# Read the table over JDBC and expose it to SQL as "my_table_view".
table_name = "my_table"
df = spark.read.jdbc(url, table_name, properties=properties)
df.createOrReplaceTempView("my_table_view")

# Each entry pairs the heading printed above a result with the SQL that
# produces it.
labelled_queries = [
    # Example 1: retrieve all records
    ("Query 1 - All records:", "SELECT * FROM my_table_view"),
    # Example 2: calculate the average value
    ("Query 2 - Average value:", "SELECT AVG(value) AS avg_value FROM my_table_view"),
    # Example 3: filter the records
    ("Query 3 - Filtered records:", "SELECT * FROM my_table_view WHERE category = 'A'"),
]

# Build every result DataFrame first, then display them in order, so all
# queries are submitted before any output is printed.
labelled_results = [(heading, spark.sql(sql_text)) for heading, sql_text in labelled_queries]

for heading, result in labelled_results:
    print(heading)
    result.show()

# Release the session.
spark.stop()
```



   c) Explore the integration capabilities of Spark with other data sources, such as Hadoop Distributed File System (HDFS) or Amazon S3.



```
from pyspark.sql import SparkSession

# Create a SparkSession with the AWS credentials supplied at build time.
# Properties prefixed with "spark.hadoop." are copied into the Hadoop
# configuration when the context is created, so they are set on the builder;
# setting them afterwards through spark.conf.set() is not reliably picked up
# by the S3A filesystem.
# NOTE: replace the placeholders with real credentials from a secure source
# (environment variables, an instance profile, ...) rather than committing
# them. The s3a:// scheme also requires the hadoop-aws connector on the
# classpath.
spark = (
    SparkSession.builder
    .appName("Spark S3 Example")
    .config("spark.hadoop.fs.s3a.access.key", "your_access_key")
    .config("spark.hadoop.fs.s3a.secret.key", "your_secret_key")
    .getOrCreate()
)

# try/finally guarantees the session is stopped even if the read fails
# (bad credentials, missing bucket, ...).
try:
    # Read data from Amazon S3
    s3_path = "s3a://bucket-name/path/to/data"
    df = spark.read.csv(s3_path, header=True, inferSchema=True)

    # Perform operations on the DataFrame
    # ...
finally:
    # Stop the SparkSession
    spark.stop()
```

