In [1]:
# findspark.init() must run before pyspark is imported so the import resolves.
import findspark

findspark.init()

from pyspark.sql import SparkSession

# Fetch the active session, or start a new one named "MySparkApp".
spark = (
    SparkSession.builder
    .appName("MySparkApp")
    .getOrCreate()
)
spark  # bare last expression so the notebook renders the session summary

In [2]:
# Bring SparkContext into scope.
from pyspark import SparkContext

# getOrCreate() is used instead of the constructor so that re-running this
# cell does not fail with a "multiple contexts" error.
sc = SparkContext.getOrCreate()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum

In [4]:
# Obtain a session via the builder, requesting the app name "My Spark" and a
# local master using all available cores.
spark = (
    SparkSession.builder
    .appName("My Spark")
    .master("local[*]")
    .getOrCreate()
)

In [21]:
# Read the rural population CSV (first row treated as the header) and
# relabel its D_2025 column as Rural_2025.
ruralDF = (
    spark.read
    .option("header", "true")
    .csv("C:/Users/ajaypalu/Downloads/FINAL_POPULATION_RURAL 4.csv")
    .withColumnRenamed("D_2025", "Rural_2025")
)

In [22]:
# Read the urban population CSV (first row treated as the header) and
# relabel its D_2025 column as Urban_2025.
urbanDF = (
    spark.read
    .option("header", "true")
    .csv("C:/Users/ajaypalu/Downloads/FINAL_POPULATION_URBAN 3.csv")
    .withColumnRenamed("D_2025", "Urban_2025")
)

In [31]:
# Remove CONTINENT from the rural frame.
# NOTE(review): presumably done so the later join does not end up with two
# CONTINENT columns (one per side) — confirm against the CSV schemas.
ruralDF = ruralDF.drop(
    "CONTINENT"
)

In [32]:
# Inner join: keep only rows whose "Country ID" appears in both the urban
# and the rural frame.
joinDF = urbanDF.join(
    ruralDF,
    on="Country ID",
    how="inner",
)

In [33]:
# Convert both 2025 population columns to long so they can be added
# numerically, then derive the combined total per row.
updatedDF = (
    joinDF
    .withColumn("Urban_2025", col("Urban_2025").cast("long"))
    .withColumn("Rural_2025", col("Rural_2025").cast("long"))
    .withColumn("Total_2025", col("Urban_2025") + col("Rural_2025"))
)

In [34]:
# Total 2025 population per continent.
# `sum` here is pyspark.sql.functions.sum (imported in an earlier cell),
# not Python's builtin.
result1 = (
    updatedDF
    .select("CONTINENT", "Total_2025")
    .groupBy("CONTINENT")
    .agg(sum("Total_2025").alias("Total_Population_2025"))
)

In [35]:
# Print the continent totals (show()'s defaults spelled out: up to 20 rows,
# long strings truncated).
result1.show(n=20, truncate=True)

+--------------------+---------------------+
|           CONTINENT|Total_Population_2025|
+--------------------+---------------------+
|              Europe|               743826|
|Latin America and...|               678697|
|              Africa|              1417060|
|    Northern America|               388405|
|             Oceania|                44626|
|                Asia|              4715469|
+--------------------+---------------------+



In [38]:
# Save the continent-wise result as Parquet.
# The writer's default save mode raises if the target directory already
# exists, which makes this cell fail on every re-run of the notebook.
# "overwrite" replaces the previous output of this same cell instead.
result1.write.mode("overwrite").parquet("C:/Users/ajaypalu/OneDrive - Capgemini/Desktop/pySparkOutput1")

In [39]:
# Total 2025 population per development category.
# `sum` here is pyspark.sql.functions.sum (imported in an earlier cell),
# not Python's builtin.
result3 = (
    updatedDF
    .select("DEVELOPMENT", "Total_2025")
    .groupBy("DEVELOPMENT")
    .agg(sum("Total_2025").alias("Total_Population_2025"))
)

# Print the aggregated rows.
result3.show()

+--------------+---------------------+
|   DEVELOPMENT|Total_Population_2025|
+--------------+---------------------+
|LESS DEVELOPED|              6701473|
|MORE DEVELOPED|              1286610|
+--------------+---------------------+



In [40]:
# Save the development-wise result as CSV with a header row.
# The writer's default save mode raises if the target directory already
# exists, which makes this cell fail on every re-run of the notebook.
# "overwrite" replaces the previous output of this same cell instead.
result3.write.mode("overwrite").option("header", "true").csv("C:/Users/ajaypalu/OneDrive - Capgemini/Desktop/pySparkOutput3")