<a href="https://colab.research.google.com/github/ahammedtechi/BIG-DATA-ANALYSIS/blob/main/BIG_DATA_ANALYSIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Install Java and PySpark only
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!pip install -q pyspark

In [4]:
import os
from pyspark.sql import SparkSession

# Set JAVA_HOME (required for PySpark)
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("CODTECH-BigDataAnalysis") \
    .getOrCreate()

# Confirm Spark is running
spark

In [5]:
# Download the CSV locally
!wget -O flights.csv "https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv"


--2025-07-13 16:17:47--  https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv
Resolving people.sc.fsu.edu (people.sc.fsu.edu)... 144.174.0.22
Connecting to people.sc.fsu.edu (people.sc.fsu.edu)|144.174.0.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 321 [text/csv]
Saving to: ‘flights.csv’


2025-07-13 16:17:47 (159 MB/s) - ‘flights.csv’ saved [321/321]



In [6]:
# Load the downloaded file using Spark
df = spark.read.csv("flights.csv", header=True, inferSchema=True)

# Show data
df.show(5)
df.printSchema()

+-----+-------+-------+-------+
|Month| "1958"| "1959"| "1960"|
+-----+-------+-------+-------+
|  JAN|  340.0|  360.0|  417.0|
|  FEB|  318.0|  342.0|  391.0|
|  MAR|  362.0|  406.0|  419.0|
|  APR|  348.0|  396.0|  461.0|
|  MAY|  363.0|  420.0|  472.0|
+-----+-------+-------+-------+
only showing top 5 rows

root
 |-- Month: string (nullable = true)
 |--  "1958": double (nullable = true)
 |--  "1959": double (nullable = true)
 |--  "1960": double (nullable = true)



In [7]:
# Show column names
df.columns

# Print schema (data types)
df.printSchema()

# Basic row count
print("✅ Total Records:", df.count())

root
 |-- Month: string (nullable = true)
 |--  "1958": double (nullable = true)
 |--  "1959": double (nullable = true)
 |--  "1960": double (nullable = true)

✅ Total Records: 12


In [8]:
# Rename columns to remove quotes and make them Spark-safe
df = df.withColumnRenamed(" \"1958\"", "1958") \
       .withColumnRenamed(" \"1959\"", "1959") \
       .withColumnRenamed(" \"1960\"", "1960")

In [9]:
from pyspark.sql.functions import col, lit

# Create individual year DataFrames and union them
df_1958 = df.select(col("Month"), col("1958").alias("Passengers")).withColumn("Year", lit(1958))
df_1959 = df.select(col("Month"), col("1959").alias("Passengers")).withColumn("Year", lit(1959))
df_1960 = df.select(col("Month"), col("1960").alias("Passengers")).withColumn("Year", lit(1960))

# Union all
df_long = df_1958.union(df_1959).union(df_1960)

# Show sample
df_long.show(5)

+-----+----------+----+
|Month|Passengers|Year|
+-----+----------+----+
|  JAN|     340.0|1958|
|  FEB|     318.0|1958|
|  MAR|     362.0|1958|
|  APR|     348.0|1958|
|  MAY|     363.0|1958|
+-----+----------+----+
only showing top 5 rows



In [10]:
df_long.groupBy("Year").sum("Passengers").withColumnRenamed("sum(Passengers)", "Total_Passengers").show()

+----+----------------+
|Year|Total_Passengers|
+----+----------------+
|1958|          4572.0|
|1959|          5140.0|
|1960|          5714.0|
+----+----------------+



In [11]:
df_long.groupBy("Month").avg("Passengers").withColumnRenamed("avg(Passengers)", "Average_Passengers").show()

+-----+------------------+
|Month|Average_Passengers|
+-----+------------------+
|  APR| 401.6666666666667|
|  OCT|             409.0|
|  NOV|             354.0|
|  FEB| 350.3333333333333|
|  SEP| 458.3333333333333|
|  JAN| 372.3333333333333|
|  AUG| 556.6666666666666|
|  MAR| 395.6666666666667|
|  DEC| 391.3333333333333|
|  JUN| 480.6666666666667|
|  JUL| 553.6666666666666|
|  MAY| 418.3333333333333|
+-----+------------------+

