In [17]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
from pyspark.sql.window import Window

In [3]:
# Local Spark session for this notebook: master "local[*]" with adaptive
# query execution enabled. getOrCreate() hands back an already-running
# session when there is one instead of starting a second.
session_builder = SparkSession.builder.master("local[*]").appName("War Survival Data Analysis")
session_builder = session_builder.config('spark.sql.adaptive.enabled', 'true')
spark = session_builder.getOrCreate()

24/05/05 15:48:06 WARN Utils: Your hostname, codespaces-0d4183 resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
24/05/05 15:48:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/05 15:48:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/05 15:48:20 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [6]:
# Load the survival dataset, treating the first CSV row as column names.
# No schema is supplied or inferred here, so every column is read as a string.
csv_reader = spark.read.option('header', 'true')
war_survival_df = csv_reader.csv("../input_data/war_survival_data.csv")

# Preview the first five rows.
war_survival_df.show(5)

+-------+---+------------------+----------------------+--------------+-----------+-----------+-----------------+--------------------+--------------+----------------+------------------------------+------------------------+-----------------------+
|   Name|Age|Food Supply (Days)|Water per Day (Liters)|First Aid Kits|Antibiotics|Painkillers|Weapons Available|Defensive Structures|Training Level|Radios Available|Access to Reliable Information|Support Groups Available|Entertainment Available|
+-------+---+------------------+----------------------+--------------+-----------+-----------+-----------------+--------------------+--------------+----------------+------------------------------+------------------------+-----------------------+
| Sophia| 43|                13|     4.150040328218717|             4|         10|         21|               21|                   5|             2|              15|                            No|                     Yes|                  Books|
|   Emma| 35|   

In [8]:
# Inspect the column types Spark assigned when the CSV was read.
war_survival_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Food Supply (Days): string (nullable = true)
 |-- Water per Day (Liters): string (nullable = true)
 |-- First Aid Kits: string (nullable = true)
 |-- Antibiotics: string (nullable = true)
 |-- Painkillers: string (nullable = true)
 |-- Weapons Available: string (nullable = true)
 |-- Defensive Structures: string (nullable = true)
 |-- Training Level: string (nullable = true)
 |-- Radios Available: string (nullable = true)
 |-- Access to Reliable Information: string (nullable = true)
 |-- Support Groups Available: string (nullable = true)
 |-- Entertainment Available: string (nullable = true)



## Medium Level

### Data Cleaning and Transformation:

In [9]:
# Cast the numeric columns (all read from the CSV as strings) to proper
# numeric types. Columns that are not listed pass through unchanged, and the
# original column order is kept.
# NOTE(review): 'Defensive Structures', 'Training Level' and 'Radios Available'
# look numeric in the preview but are left as strings here - confirm whether
# they should be cast as well.
numeric_casts = {
    'Age': t.IntegerType(),
    'Food Supply (Days)': t.IntegerType(),
    'Water per Day (Liters)': t.FloatType(),
    'First Aid Kits': t.IntegerType(),
    'Antibiotics': t.IntegerType(),
    'Painkillers': t.IntegerType(),
    'Weapons Available': t.IntegerType(),
}

cleaned_df = war_survival_df.select([
    f.col(name).cast(numeric_casts[name]).alias(name) if name in numeric_casts else f.col(name)
    for name in war_survival_df.columns
])
cleaned_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Food Supply (Days): integer (nullable = true)
 |-- Water per Day (Liters): float (nullable = true)
 |-- First Aid Kits: integer (nullable = true)
 |-- Antibiotics: integer (nullable = true)
 |-- Painkillers: integer (nullable = true)
 |-- Weapons Available: integer (nullable = true)
 |-- Defensive Structures: string (nullable = true)
 |-- Training Level: string (nullable = true)
 |-- Radios Available: string (nullable = true)
 |-- Access to Reliable Information: string (nullable = true)
 |-- Support Groups Available: string (nullable = true)
 |-- Entertainment Available: string (nullable = true)



### Aggregation and Analysis:

In [15]:
# Dataset-wide summary computed in a single aggregation:
#   * average age of all individuals
#   * total first aid kits, antibiotics and painkillers across everyone
#   * average number of weapons available per person
summary_exprs = [
    f.avg('Age'),  # left un-aliased, so the output column is named "avg(Age)"
    f.sum('First Aid Kits').alias('total_first_aid_kits'),
    f.sum('Antibiotics').alias('total_Antibiotics'),
    f.sum('Painkillers').alias('total_Painkillers'),
    f.avg('Weapons Available').alias('avg_Weapons_Available'),
]

summary_df = cleaned_df.agg(*summary_exprs)
summary_df.show()

+--------+--------------------+-----------------+-----------------+---------------------+
|avg(Age)|total_first_aid_kits|total_Antibiotics|total_Painkillers|avg_Weapons_Available|
+--------+--------------------+-----------------+-----------------+---------------------+
|  38.489|                5154|             9612|            25105|               47.564|
+--------+--------------------+-----------------+-----------------+---------------------+



## Hard Level

### Advanced Aggregation and Window Functions:

In [23]:
# Running total of 'Food Supply (Days)' across all individuals in ascending
# age order. The frame runs from the first row up to and including the
# current row. No partition column is used, so Spark moves all rows into a
# single partition (this is what the WindowExec warning refers to).
window_spec = Window.orderBy('Age').rowsBetween(Window.unboundedPreceding, Window.currentRow)

running_food_total = f.sum('Food Supply (Days)').over(window_spec)
cumulative_sum_food_supply_df = cleaned_df.withColumn('cumulative_sum_food_supply', running_food_total)

cumulative_sum_food_supply_df.show()

24/05/05 16:26:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/05 16:26:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/05 16:26:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/05 16:26:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/05 16:26:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------+---+------------------+----------------------+--------------+-----------+-----------+-----------------+--------------------+--------------+----------------+------------------------------+------------------------+-----------------------+--------------------------+
|   Name|Age|Food Supply (Days)|Water per Day (Liters)|First Aid Kits|Antibiotics|Painkillers|Weapons Available|Defensive Structures|Training Level|Radios Available|Access to Reliable Information|Support Groups Available|Entertainment Available|cumulative_sum_food_supply|
+-------+---+------------------+----------------------+--------------+-----------+-----------+-----------------+--------------------+--------------+----------------+------------------------------+------------------------+-----------------------+--------------------------+
|Michael| 18|                13|             4.3504906|             0|         13|         13|               83|                   7|             3|               2|                

In [24]:
# Compute the rolling average of 'Water per Day (Liters)' for a window of 5 individuals, ordered by age.
#
# rowsBetween() takes inclusive offsets relative to the current row, so a
# trailing window of 5 rows is the 4 preceding rows plus the current one.
# (The previous bounds of (-5, 0) averaged over 6 rows.) The first rows of the
# ordering simply average over however many rows exist so far.
# The ordering column is spelled 'Age' to match the schema exactly, so the
# lookup does not depend on Spark's case-insensitive column resolution.
window_spec = Window.partitionBy().orderBy('Age').rowsBetween(-4, Window.currentRow)

rolling_avg_water_per_day_df = (
    cleaned_df
    .withColumn('rolling_avg_water_per_day', f.avg('Water per Day (Liters)').over(window_spec))
)

rolling_avg_water_per_day_df.show()

24/05/05 16:32:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/05 16:32:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/05 16:32:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/05 16:32:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/05 16:32:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------+---+------------------+----------------------+--------------+-----------+-----------+-----------------+--------------------+--------------+----------------+------------------------------+------------------------+-----------------------+-------------------------+
|   Name|Age|Food Supply (Days)|Water per Day (Liters)|First Aid Kits|Antibiotics|Painkillers|Weapons Available|Defensive Structures|Training Level|Radios Available|Access to Reliable Information|Support Groups Available|Entertainment Available|rolling_avg_water_per_day|
+-------+---+------------------+----------------------+--------------+-----------+-----------+-----------------+--------------------+--------------+----------------+------------------------------+------------------------+-----------------------+-------------------------+
|Michael| 18|                13|             4.3504906|             0|         13|         13|               83|                   7|             3|               2|                   

In [28]:
# Count the NULL values in every column of cleaned_df.
columns = cleaned_df.columns

# Build one conditional count per column and evaluate them all in a single
# aggregation, instead of launching a separate filter+count job per column.
# count() only counts non-null inputs, and when() without otherwise() yields
# NULL for rows where the condition is false, so each expression counts
# exactly the rows in which that column is NULL.
null_count_exprs = [
    f.count(f.when(f.col(c).isNull(), f.lit(1))).alias(c)
    for c in columns
]
null_counts_row = cleaned_df.agg(*null_count_exprs).first()

# Same shape as before: a list of (column name, null count) pairs.
null_columns_list = list(zip(columns, null_counts_row))

for column, count in null_columns_list:
    print(f'{column} \t null--> {count}')

Name 	 null--> 0
Age 	 null--> 0
Food Supply (Days) 	 null--> 0
Water per Day (Liters) 	 null--> 0
First Aid Kits 	 null--> 0
Antibiotics 	 null--> 0
Painkillers 	 null--> 0
Weapons Available 	 null--> 0
Defensive Structures 	 null--> 0
Training Level 	 null--> 0
Radios Available 	 null--> 0
Access to Reliable Information 	 null--> 0
Support Groups Available 	 null--> 0
Entertainment Available 	 null--> 0


In [30]:
# Handle missing values in the dataset. For example, impute missing 'Water per Day (Liters)' values with the median value.
# Since the real dataset has no missing/null values, let's exercise the technique on the sample data below.
# Sample data with missing values
data = [
    ("John", 30, 2),   # Name, Age, Water per Day (Liters)
    ("Alice", 25, None),
    ("Bob", 40, 4),
    ("Jane", 35, 5),
    ("Emma", None, 6),
    ("Michael", 55, 7),
    ("Sophia", 50, None)
]

# Create DataFrame (kept under its own name so the raw sample stays available)
sample_df = spark.createDataFrame(data, ["Name", "Age", "Water per Day (Liters)"])

# Calculate median value for 'Water per Day (Liters)'.
# relativeError=0.0 requests the exact quantile; a non-zero error such as 0.25
# lets Spark return any value whose rank is within 25% of the target rank,
# which is not guaranteed to be the median. Exact computation is affordable
# for this tiny sample.
median_water_per_day = sample_df.approxQuantile("Water per Day (Liters)", [0.5], 0.0)[0]
print(f'median_water_per_day: {median_water_per_day}')

# Impute missing values with median value (only the named column is filled;
# the NULL in 'Age' is intentionally left untouched)
df = sample_df.na.fill({"Water per Day (Liters)": median_water_per_day})

# Show the result
df.show()

median_water_per_day: 5.0
+-------+----+----------------------+
|   Name| Age|Water per Day (Liters)|
+-------+----+----------------------+
|   John|  30|                     2|
|  Alice|  25|                     5|
|    Bob|  40|                     4|
|   Jane|  35|                     5|
|   Emma|NULL|                     6|
|Michael|  55|                     7|
| Sophia|  50|                     5|
+-------+----+----------------------+

