In [1]:
import findspark

# Make the local Spark installation importable; this has to run before
# anything is imported from pyspark.
findspark.init()

from pyspark.sql import SparkSession

# Create the notebook's Spark session with the application name and master
# that the analysis intends to use. Because getOrCreate() reuses a running
# session, the first cell that builds one decides these settings; building it
# here under a placeholder name would leave the later "World Population
# Analysis" / local[*] request without effect on the running context.
spark = (
    SparkSession.builder
    .appName("World Population Analysis")
    .master("local[*]")
    .getOrCreate()
)

# Last expression of the cell: display the session summary.
spark

In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import coalesce, col, desc, lit, sum as _sum
from pyspark.sql.types import LongType

In [3]:
# Obtain the Spark session used by the rest of the notebook.
# NOTE(review): getOrCreate() hands back an already-running session when an
# earlier cell has created one; in that case the app name and master requested
# here are presumably not applied to the running context. Confirm, and keep
# session setup in a single cell.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("World Population Analysis")
    .getOrCreate()
)

In [4]:
# Directory that holds the input CSV files. Set the POPULATION_DATA_DIR
# environment variable to run the notebook on another machine; the default is
# the location the notebook was originally written against.
DATA_DIR = os.environ.get("POPULATION_DATA_DIR", "C:/Users/kchangde/Downloads")


def _read_population_csv(file_name):
    """Read one CSV file from DATA_DIR into a Spark DataFrame.

    The first row is used as the header and column types are inferred by Spark.

    Args:
        file_name: name of the CSV file inside DATA_DIR.

    Returns:
        The DataFrame produced by spark.read.csv for that file.
    """
    return (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(f"{DATA_DIR}/{file_name}")
    )


urban_df = _read_population_csv("FINAL_POPULATION_URBAN 2.csv")

rural_df = _read_population_csv("FINAL_POPULATION_RURAL 2.csv")


In [5]:

# Rename the urban frame's DEVELOPMENT_CATEGORY column to DEVELOPMENT
# (presumably the name used in the rural file) so the two frames can later be
# combined by column name.
# NOTE(review): withColumnRenamed does nothing when the source column is
# absent, so a typo here would only surface at the union step.
urban_renamed = urban_df.withColumnRenamed(
    existing="DEVELOPMENT_CATEGORY",
    new="DEVELOPMENT",
)



In [6]:
# Column expressions D_1950, D_1955, ..., D_2050: one per five-year step
# (range's end bound of 2051 is exclusive, so 2050 is included).
year_cols = list(map(col, (f"D_{year}" for year in range(1950, 2051, 5))))


In [7]:
def _with_total_population(df, value_cols):
    """Return ``df`` with a TOTAL_POPULATION column holding the row-wise sum of ``value_cols``.

    Args:
        df: Spark DataFrame containing every column referenced by ``value_cols``.
        value_cols: list of Column expressions to add together.

    Returns:
        A new DataFrame with the "TOTAL_POPULATION" column set to the sum.
    """
    # In Spark, `a + b` is null as soon as one operand is null. A single
    # missing year would therefore null out the whole row total, and that row
    # would then be skipped by the later per-country aggregation. Treat a
    # missing value as contributing 0 instead.
    null_safe_cols = [coalesce(value_col, lit(0)) for value_col in value_cols]
    # `sum` is Python's builtin here (the Spark aggregate is imported as
    # `_sum`); it folds the Column expressions together with `+`.
    return df.withColumn("TOTAL_POPULATION", sum(null_safe_cols))


# Add the per-row total to both the urban and the rural frame.
urban_with_total = _with_total_population(urban_renamed, year_cols)
rural_with_total = _with_total_population(rural_df, year_cols)


In [8]:
# Stack the urban and rural rows into a single frame, matching columns by
# name rather than by position.
merged_df = urban_with_total.unionByName(other=rural_with_total)

In [11]:
# Per-country total: add up TOTAL_POPULATION over all rows that share a
# COUNTRY value in the merged frame, then cast the result to a long integer.
country_total = (
    merged_df
    .groupBy("COUNTRY")
    .agg(_sum("TOTAL_POPULATION").alias("TOTAL_POPULATION"))
    .withColumn("TOTAL_POPULATION", col("TOTAL_POPULATION").cast(LongType()))
)


In [12]:
# Sort countries from largest to smallest total and keep only the first row.
top_country = country_total.orderBy(col("TOTAL_POPULATION").desc()).limit(1)


In [13]:
# Print a heading, then the one-row result without truncating cell values.
heading = "🏆 Country with Highest Total Population:"
print(heading)
top_country.show(truncate=False)


🏆 Country with Highest Total Population:
+-------+----------------+
|COUNTRY|TOTAL_POPULATION|
+-------+----------------+
|China  |23490064        |
+-------+----------------+



In [15]:
# Destination of the result. Set the POPULATION_OUTPUT_PATH environment
# variable to write somewhere else; the default is the location the notebook
# was originally written against.
OUTPUT_PATH = os.environ.get("POPULATION_OUTPUT_PATH", "C:/Users/kchangde/Desktop/PY9.orc")

# Persist the top-country row in ORC format. mode("overwrite") replaces any
# existing output at OUTPUT_PATH.
# NOTE(review): Spark normally writes a directory of part files at this path
# rather than a single .orc file. Confirm downstream readers expect that.
top_country.write.mode("overwrite").orc(OUTPUT_PATH)

