In [1]:
# PySpark and visualization libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, desc

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Post-COVID Conditions Analysis") \
    .getOrCreate()


In [2]:
# Load CSV file into PySpark DataFrame
df = spark.read.csv("Post-COVID_Conditions.csv", header=True, inferSchema=True)

# Display schema and few rows
df.printSchema()
df.show(5)


root
 |-- Indicator: string (nullable = true)
 |-- Group: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Subgroup: string (nullable = true)
 |-- Phase: double (nullable = true)
 |-- Time Period: integer (nullable = true)
 |-- Time Period Label: string (nullable = true)
 |-- Time Period Start Date: string (nullable = true)
 |-- Time Period End Date: string (nullable = true)
 |-- Value: double (nullable = true)
 |-- LowCI: double (nullable = true)
 |-- HighCI: double (nullable = true)
 |-- Confidence Interval: string (nullable = true)
 |-- Quartile range: string (nullable = true)
 |-- Quartile number: integer (nullable = true)
 |-- Suppression Flag: integer (nullable = true)

+--------------------+-----------------+-------------+-------------+-----+-----------+--------------------+----------------------+--------------------+-----+-----+------+-------------------+--------------+---------------+----------------+
|           Indicator|            Group|        State|    

In [5]:
# Count rows and columns
print("Total Records:", df.count())
print("Total Columns:", len(df.columns))

# Display column names
print("Columns:", df.columns)





Total Records: 18639
Total Columns: 16
Columns: ['Indicator', 'Group', 'State', 'Subgroup', 'Phase', 'Time Period', 'Time Period Label', 'Time Period Start Date', 'Time Period End Date', 'Value', 'LowCI', 'HighCI', 'Confidence Interval', 'Quartile range', 'Quartile number', 'Suppression Flag']


In [7]:
# Select relevant columns for analysis
selected_df = df.select("Indicator", "Group", "State", "Value", "LowCI", "HighCI")

selected_df.show(5)



+--------------------+-----------------+-------------+-----+-----+------+
|           Indicator|            Group|        State|Value|LowCI|HighCI|
+--------------------+-----------------+-------------+-----+-----+------+
|Ever experienced ...|National Estimate|United States| 14.0| 13.5|  14.5|
|Ever experienced ...|           By Age|United States| 17.8| 15.9|  19.8|
|Ever experienced ...|           By Age|United States| 15.2| 14.1|  16.2|
|Ever experienced ...|           By Age|United States| 16.9| 15.7|  18.3|
|Ever experienced ...|           By Age|United States| 15.3| 14.1|  16.7|
+--------------------+-----------------+-------------+-----+-----+------+
only showing top 5 rows


In [8]:
# Filter data for a specific indicator (replace with actual one from your dataset)
indicator_name = "Post-COVID Conditions"
filtered_df = selected_df.filter(col("Indicator") == indicator_name)

filtered_df.show(5)


+---------+-----+-----+-----+-----+------+
|Indicator|Group|State|Value|LowCI|HighCI|
+---------+-----+-----+-----+-----+------+
+---------+-----+-----+-----+-----+------+



In [9]:
# Group by 'State' and compute average Value
state_avg = df.groupBy("State").agg(avg("Value").alias("Average_Value"))

state_avg.show(10)


+------------+------------------+
|       State|     Average_Value|
+------------+------------------+
|        Utah|25.250892857142855|
|      Hawaii|16.386627906976745|
|   Minnesota| 22.23190476190475|
|        Ohio| 21.47549999999999|
|    Arkansas| 21.96530612244898|
|      Oregon|23.112328767123298|
|       Texas| 25.08026315789474|
|North Dakota|19.995505617977525|
|Pennsylvania|21.701415094339623|
| Connecticut|22.010552763819103|
+------------+------------------+
only showing top 10 rows


In [10]:
state_pd = state_avg.toPandas()
