In [43]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, count, col, desc, to_timestamp, avg , month, to_date, when
from pyspark.sql.types import StringType
from datetime import datetime

In [85]:
spark = SparkSession.builder.appName("SpaceMissions").config("spark.sql.legacy.timeParserPolicy", "LEGACY").getOrCreate()

In [86]:
df=spark.read.option('header','true').csv('mission_launches.csv',inferSchema=True)

In [87]:
df = df.withColumn("ParsedDate", to_timestamp(df["Date"], "EEE MMM dd, yyyy HH:mm z")) \
       .withColumn("Year", year("ParsedDate")) \
       .drop("ParsedDate")


In [88]:
df = df.filter((col("Year").isNotNull()) & (col("Year") != 0))

In [95]:
failure_df = df.filter((df["Mission_Status"] == "Failure") | (df["Mission_Status"] == "Partial Failure"))
failure_counts_per_year = failure_df.groupBy("Year").count().orderBy("Year")
failure_counts_per_year.show(truncate=False)

+----+-----+
|Year|count|
+----+-----+
|1957|1    |
|1958|16   |
|1959|12   |
|1960|19   |
|1961|20   |
|1962|16   |
|1963|9    |
|1964|9    |
|1965|12   |
|1966|16   |
|1967|16   |
|1968|9    |
|1969|16   |
|1970|14   |
|1971|14   |
|1972|10   |
|1973|5    |
|1974|8    |
|1975|6    |
|1976|5    |
+----+-----+
only showing top 20 rows



In [102]:

# Filter success counts per year into two groups: before 2000 and after
before_2000_df = failure_counts_per_year.filter(col("Year") <= 2000)
after_2000_df = failure_counts_per_year.filter(col("Year") > 2000)

# Calculate the average success rate for each group
before_2000_avg_failure_rate = before_2000_df.select(avg("count")).collect()[0][0]
after_2000_avg_failure_rate = after_2000_df.select(avg("count")).collect()[0][0]

# Display the results
print("Average failure rate for years before 2000:", before_2000_avg_failure_rate)
print("Average failure rate for years after 2000:", after_2000_avg_failure_rate)

Average failure rate for years before 2000: 7.681818181818182
Average failure rate for years after 2000: 3.2


In [98]:
success_df = df.filter(df["Mission_Status"] == "Success")
success_counts_per_year = success_df.groupBy("Year").count().orderBy("Year")
success_counts_per_year.show(truncate=False)


+----+-----+
|Year|count|
+----+-----+
|1957|2    |
|1958|6    |
|1959|8    |
|1960|19   |
|1961|32   |
|1962|65   |
|1963|29   |
|1964|47   |
|1965|74   |
|1966|81   |
|1967|86   |
|1968|91   |
|1969|85   |
|1970|92   |
|1971|102  |
|1972|86   |
|1973|94   |
|1974|89   |
|1975|106  |
|1976|104  |
+----+-----+
only showing top 20 rows



In [101]:

# Filter success counts per year into two groups: before 2000 and after
before_2000_df = success_counts_per_year.filter(col("Year") <= 2000)
after_2000_df = success_counts_per_year.filter(col("Year") > 2000)

# Calculate the average success rate for each group
before_2000_avg_success_rate = before_2000_df.select(avg("count")).collect()[0][0]
after_2000_avg_success_rate = after_2000_df.select(avg("count")).collect()[0][0]

# Display the results
print("Average success rate for years before 2000:", before_2000_avg_success_rate)
print("Average success rate for years after 2000:", after_2000_avg_success_rate)

Average success rate for years before 2000: 62.09090909090909
Average success rate for years after 2000: 53.1
