In [7]:
# findspark is used to locate a local Spark installation (presumably by making
# pyspark importable — confirm against the findspark docs).
import findspark

# Called before the pyspark import below; keep this ordering.
findspark.init()

from pyspark.sql import SparkSession

# Get the current Spark session, or start one named "MySparkApp" if none exists.
spark = SparkSession.builder.appName("MySparkApp").getOrCreate()

# Bare expression so the notebook renders the session object as the cell output.
spark

In [8]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, desc
from pyspark.sql.types import LongType




In [9]:
# Obtain the Spark session used by the population analysis below.
# NOTE(review): an earlier cell already builds a session named "MySparkApp".
# getOrCreate() hands back an existing session when there is one, so the app
# name and master requested here may not take effect — confirm, and consider
# keeping a single session-creation cell.
spark = (
    SparkSession.builder
    .appName("World Population Analysis")
    .master("local[*]")
    .getOrCreate()
)

In [10]:

# Read the urban and rural population CSVs. The first row of each file supplies
# the column names, and Spark infers every column's type from the data.
def _read_population_csv(path):
    """Load one CSV file into a DataFrame using its header row and schema inference."""
    return spark.read.options(header="true", inferSchema="true").csv(path)


urban_df = _read_population_csv("C:/Users/kchangde/Downloads/FINAL_POPULATION_URBAN 2.csv")
rural_df = _read_population_csv("C:/Users/kchangde/Downloads/FINAL_POPULATION_RURAL 2.csv")


In [13]:

# Rename the urban frame's DEVELOPMENT_CATEGORY column to DEVELOPMENT.
# Presumably this matches the rural file's column name so the two frames can be
# unioned by name later — verify against the rural schema.
urban_renamed = urban_df.withColumnRenamed(
    "DEVELOPMENT_CATEGORY",
    "DEVELOPMENT",
)


In [14]:


# Column references D_1950, D_1955, ..., D_2050 — one per five-year step.
# The range stops at 2051 so that 2050 itself is included.
year_cols = [
    col(f"D_{year}")
    for year in range(1950, 2051, 5)
]



In [15]:

# Row-wise total across all year columns. This is deliberately Python's built-in
# sum() (the Spark aggregate is imported as _sum): it folds the Column objects
# together with "+", producing one expression that is attached to both frames.
# NOTE(review): Spark's "+" is expected to yield null when an operand is null,
# so a row missing any year would get a null total — confirm the inputs have
# no gaps in the D_<year> columns.
total_population_expr = sum(year_cols)

urban_with_total = urban_renamed.withColumn("TOTAL_POPULATION", total_population_expr)
rural_with_total = rural_df.withColumn("TOTAL_POPULATION", total_population_expr)


In [16]:


# Stack the urban and rural rows into a single frame, matching columns by name
# rather than by position.
merged_df = urban_with_total.unionByName(rural_with_total)


In [18]:

# Total population per continent, with the summed value cast to a 64-bit integer.
continent_total = merged_df.groupBy("CONTINENT").agg(
    _sum("TOTAL_POPULATION").cast(LongType()).alias("TOTAL_POPULATION")
)


In [19]:

# Keep only the continent with the largest total: sort descending, take one row.
top_continent = continent_total.orderBy("TOTAL_POPULATION", ascending=False).limit(1)


In [20]:

# Print a heading, then show the one-row result without truncating cell values.
print("🌍 Continent with Highest Total Population:")
top_continent.show(truncate=False)



🌍 Continent with Highest Total Population:
+---------+----------------+
|CONTINENT|TOTAL_POPULATION|
+---------+----------------+
|Asia     |73949554        |
+---------+----------------+



In [22]:
# Persist the top-continent row in ORC format, overwriting any output left at
# this path by a previous run.
top_continent.write.orc("C:/Users/kchangde/Desktop/PY8.orc", mode="overwrite")
