In [2]:
# Step 1: Install Java and PySpark.
# Each line starting with "!" is a notebook shell escape, not Python.
# apt-get installs the OpenJDK 11 JDK; -qq silences it and its remaining
# standard output is discarded via /dev/null. pip installs pyspark quietly (-q).
!apt-get install openjdk-11-jdk -qq > /dev/null
!pip install -q pyspark

# Step 2: Environment setup.
# Export JAVA_HOME for this process so that it points at the OpenJDK 11
# directory used by the JDK installed in Step 1.
import os
java_home_dir = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ.update(JAVA_HOME=java_home_dir)

# Step 3: Start the SparkSession.
# getOrCreate() hands back an already-running session when one exists,
# otherwise it builds a new one with the application name set below.
from pyspark.sql import SparkSession
session_builder = SparkSession.builder
session_builder = session_builder.appName("COVID19 Big Data Analysis")
spark = session_builder.getOrCreate()

# Step 4: Upload the dataset (CSV).
# files.upload() shows the Colab upload widget and returns a mapping of
# {uploaded filename: file contents as bytes}; it is empty if nothing was
# uploaded.
from google.colab import files
uploaded = files.upload()  # Upload the CSV file here

# Step 5: Load the data into a Spark DataFrame.
# Colab stores a re-uploaded file under a de-duplicated name (for example
# "Data Science for COVID-19 (1).csv"), so reading the hard-coded path alone
# would silently pick up an older copy, or fail when the uploaded file has a
# different name. Write the bytes that were just uploaded to the expected
# path first. If nothing was uploaded, fall back to whatever is already on
# disk at that path, as before.
csv_path = "Data Science for COVID-19.csv"
if uploaded:
    csv_bytes = uploaded.get(csv_path)
    if csv_bytes is None:
        # The upload used another filename: take the first uploaded file.
        csv_bytes = next(iter(uploaded.values()))
    with open(csv_path, "wb") as csv_file:
        csv_file.write(csv_bytes)

# header=True uses the first row as column names; inferSchema=True lets
# Spark infer each column's type from the data.
df = spark.read.csv(csv_path, header=True, inferSchema=True)
df.printSchema()
df.show(5)

# Step 6: Check column names.
column_names = df.columns
print("Columns:", column_names)

# Step 7: Basic info - number of rows and number of columns.
# count() triggers a Spark job over the whole DataFrame.
record_total = df.count()
print("Total Records:", record_total)
print("Total Columns:", len(column_names))

# Step 8: Optional cleanup - discard every row that has a null in any column.
df = df.na.drop(how="any")

# Steps 9 and 10: Top 10 provinces, then top 10 cities, ranked by their
# total number of confirmed cases (largest first).
for grouping_column in ("province", "city"):
    confirmed_totals = df.groupBy(grouping_column).sum("confirmed")
    ranked_totals = confirmed_totals.orderBy("sum(confirmed)", ascending=False)
    ranked_totals.show(10)

# Step 11: Correlation between confirmed cases and latitude (sample idea).
# Both columns are cast to double before the correlation is computed.
from pyspark.sql.functions import col
confirmed_as_double = col("confirmed").cast("double")
latitude_as_double = col("latitude").cast("double")
df_corr = df.select(confirmed_as_double, latitude_as_double)
correlation = df_corr.corr("confirmed", "latitude")
print("Correlation between confirmed and latitude:", correlation)

# Step 12: Group-wise stats (summary by city).
# For each city: total confirmed cases plus the mean latitude and longitude,
# listing the 10 cities with the most confirmed cases first.
city_aggregations = {"confirmed": "sum", "latitude": "avg", "longitude": "avg"}
city_summary = df.groupBy("city").agg(city_aggregations)
city_summary.sort("sum(confirmed)", ascending=False).show(10)


Saving Data Science for COVID-19.csv to Data Science for COVID-19 (1).csv
root
 |--  case_id: integer (nullable = true)
 |-- province: string (nullable = true)
 |-- city: string (nullable = true)
 |-- group: boolean (nullable = true)
 |-- infection_case: string (nullable = true)
 |-- confirmed: integer (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)

+--------+--------+------------+-----+--------------------+---------+---------+----------+
| case_id|province|        city|group|      infection_case|confirmed| latitude| longitude|
+--------+--------+------------+-----+--------------------+---------+---------+----------+
| 1000001|   Seoul|  Yongsan-gu| true|       Itaewon Clubs|      139|37.538621|126.992652|
| 1000002|   Seoul|   Gwanak-gu| true|             Richway|      119| 37.48208|126.901384|
| 1000003|   Seoul|     Guro-gu| true| Guro-gu Call Center|       95|37.508163|126.884387|
| 1000004|   Seoul|Yangcheon-gu| true|Yangcheon Ta