In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [4]:
# Create the SparkSession used by every later cell, or reuse one that is
# already running in this kernel (getOrCreate makes re-running this cell safe).
spark = (
    SparkSession.builder
    .appName("Sample CSV Data")
    .getOrCreate()
)

In [6]:
# Read the CSV into a DataFrame: the first line is treated as the header row and
# column types are inferred from the data rather than declared up front.
# NOTE(review): the schema printed further down shows g_12 inferred as string even
# though the previewed values look numeric — presumably some rows hold non-numeric
# text; confirm, and consider an explicit schema.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("../data/data.csv")

In [7]:
# Preview the data: at most 20 rows, with long string cells cut off and
# suffixed by "..." (these are show()'s defaults, spelled out here).
df.show(n=20, truncate=True)

+---------+------+-----+--------------------+-----------+--------------+-----+----+--------------------+
|   stu_id|gender|not_d|                dept|     region|marital_status|  age|g_12|             college|
+---------+------+-----+--------------------+-----------+--------------+-----+----+--------------------+
|R/2791/06|  Male|    6|   Political Science|       Afar|        Single| 30.0| 335|Social Science an...|
|R/2253/06|  Male|    4|          Anesthesia|       Afar|        Single| 30.0| 343|            Medicine|
|R/1737/06|  Male|    1|Public Administra...|       Afar|        Single| 29.0| 435|Business and Econ...|
|R/0268/06|  Male|    2|Construction Engi...|       Afar|        Single| 28.0| 385|Institute of Tech...|
|R/0400/06|  Male|    2|Construction Engi...|       Afar|        Single| 28.0| 371|Institute of Tech...|
|R/3300/08|  Male|    2|Mechanical Engine...|     Tigray|             1|153.0| 472|Institute of Tech...|
|R/0293/06|  Male|    2|Chemical Engineering|       SNN

In [8]:
# Print each column's name, inferred type and nullability as a tree, to check
# what inferSchema decided for the loaded CSV.
df.printSchema()

root
 |-- stu_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- not_d: integer (nullable = true)
 |-- dept: string (nullable = true)
 |-- region: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- age: double (nullable = true)
 |-- g_12: string (nullable = true)
 |-- college: string (nullable = true)



In [11]:
# Keep only the student identifier and the categorical descriptors, then preview.
student_columns = ["stu_id", "gender", "dept", "college"]
df.select(*student_columns).show()

+---------+------+--------------------+--------------------+
|   stu_id|gender|                dept|             college|
+---------+------+--------------------+--------------------+
|R/2791/06|  Male|   Political Science|Social Science an...|
|R/2253/06|  Male|          Anesthesia|            Medicine|
|R/1737/06|  Male|Public Administra...|Business and Econ...|
|R/0268/06|  Male|Construction Engi...|Institute of Tech...|
|R/0400/06|  Male|Construction Engi...|Institute of Tech...|
|R/3300/08|  Male|Mechanical Engine...|Institute of Tech...|
|R/0293/06|  Male|Chemical Engineering|Institute of Tech...|
|R/0337/06|  Male| Textile Engineering|Institute of Tech...|
|R/0797/06|  Male|Construction Engi...|Institute of Tech...|
|R/0862/06|Female|   Civil Engineering|Institute of Tech...|
|R/0885/06|  Male|Construction Engi...|Institute of Tech...|
|R/0920/06|  Male|Industrial Engine...|Institute of Tech...|
|R/1469/06|  Male| Textile Engineering|Institute of Tech...|
|R/0991/06|  Male|Indust

In [12]:
# Rows whose age is strictly greater than 30 (where() is an alias of filter()).
# NOTE(review): the result contains ages such as 153.0 and 148.0, which look like
# data-entry errors — confirm before using `age` in any analysis.
df.where(df["age"] > 30).show()

+---------+------+-----+--------------------+--------+--------------+-----+----+--------------------+
|   stu_id|gender|not_d|                dept|  region|marital_status|  age|g_12|             college|
+---------+------+-----+--------------------+--------+--------------+-----+----+--------------------+
|R/3300/08|  Male|    2|Mechanical Engine...|  Tigray|             1|153.0| 472|Institute of Tech...|
|R/0293/06|  Male|    2|Chemical Engineering|    SNNP|        Single| 93.0| 380|Institute of Tech...|
|R/3274/08|Female|    2|     Pre Engineering|  Oromia|             1|148.0| 353|Institute of Tech...|
|R/3263/08|Female|    6|          Psychology|Diredawa|             1|144.0| 311|Social Science an...|
|R/0802/06|  Male|    2|Mechanical Engine...|  Tigray|             1| 89.0| 379|Institute of Tech...|
|R/1126/06|  Male|    4|          Anesthesia|  Tigray|             1| 89.0| 374|            Medicine|
|R/1438/06|  Male|    4|          Anesthesia|  Amhara|             1| 89.0| 379|  

In [9]:
# Rows whose salary is strictly greater than 80000 (where() is an alias of filter()).
# NOTE(review): the schema printed earlier in this notebook (stu_id, gender, ...,
# college) has no `salary` column, while the output below has id/name/age/city/salary.
# This cell's execution count is also out of sequence, so presumably `df` was loaded
# from a different CSV when it ran — confirm which file is intended; as written, a
# fresh Restart & Run All would fail here.
df.where(df["salary"] > 80000).show()

+---+---------------+---+-----------+------+
| id|           name|age|       city|salary|
+---+---------------+---+-----------+------+
|  4|      Bob Brown| 45|    Houston| 90000|
|  7| Michael Miller| 40|San Antonio| 85000|
| 10|Olivia Anderson| 31|   San Jose| 82000|
+---+---------------+---+-----------+------+



In [10]:
from pyspark.sql.functions import col

In [11]:
 # Add a new column "bonus" as 10% of salary
# 0.1 is the bonus rate; multiplying by a float makes `bonus` a floating-point
# column (e.g. 7000.0) even though `salary` is shown as whole numbers.
bonus_amount = df["salary"] * 0.1
df_with_bonus = df.withColumn("bonus", bonus_amount)
df_with_bonus.show()

+---+---------------+---+------------+------+------+
| id|           name|age|        city|salary| bonus|
+---+---------------+---+------------+------+------+
|  1|       John Doe| 28|    New York| 70000|7000.0|
|  2|     Jane Smith| 34| Los Angeles| 80000|8000.0|
|  3|  Alice Johnson| 22|     Chicago| 60000|6000.0|
|  4|      Bob Brown| 45|     Houston| 90000|9000.0|
|  5|  Charlie Davis| 30|     Phoenix| 75000|7500.0|
|  6|     Eva Wilson| 29|Philadelphia| 72000|7200.0|
|  7| Michael Miller| 40| San Antonio| 85000|8500.0|
|  8|   Sophia Moore| 26|   San Diego| 68000|6800.0|
|  9| William Taylor| 33|      Dallas| 77000|7700.0|
| 10|Olivia Anderson| 31|    San Jose| 82000|8200.0|
+---+---------------+---+------------+------+------+



In [12]:
# Expose the "city" column under the name "location". The result is a new
# DataFrame; `df` keeps its original column name for the cells that follow.
df_renamed = df.withColumnRenamed(existing="city", new="location")
df_renamed.show()

+---+---------------+---+------------+------+
| id|           name|age|    location|salary|
+---+---------------+---+------------+------+
|  1|       John Doe| 28|    New York| 70000|
|  2|     Jane Smith| 34| Los Angeles| 80000|
|  3|  Alice Johnson| 22|     Chicago| 60000|
|  4|      Bob Brown| 45|     Houston| 90000|
|  5|  Charlie Davis| 30|     Phoenix| 75000|
|  6|     Eva Wilson| 29|Philadelphia| 72000|
|  7| Michael Miller| 40| San Antonio| 85000|
|  8|   Sophia Moore| 26|   San Diego| 68000|
|  9| William Taylor| 33|      Dallas| 77000|
| 10|Olivia Anderson| 31|    San Jose| 82000|
+---+---------------+---+------------+------+



In [13]:
from pyspark.sql.functions import sum, avg

In [14]:
# Average salary by city.
# The aggregate is reached through the `F` namespace (pyspark.sql.functions) so
# this cell does not depend on bare names like `avg`/`sum` having been imported
# into the notebook's global scope by an earlier cell.
(
    df.groupBy("city")
    .agg(F.avg("salary").alias("avg_salary"))
    .show()
)


+------------+----------+
|        city|avg_salary|
+------------+----------+
|     Phoenix|   75000.0|
|      Dallas|   77000.0|
| San Antonio|   85000.0|
|Philadelphia|   72000.0|
| Los Angeles|   80000.0|
|   San Diego|   68000.0|
|     Chicago|   60000.0|
|    San Jose|   82000.0|
|     Houston|   90000.0|
|    New York|   70000.0|
+------------+----------+



In [15]:
# Total salary by city.
# Use the namespaced F.sum instead of a bare `sum`: importing `sum` from
# pyspark.sql.functions rebinds the name of Python's builtin sum() in the notebook
# namespace, so a later plain sum([...]) would no longer perform ordinary Python
# addition. Going through `F` keeps the Spark aggregate unambiguous.
(
    df.groupBy("city")
    .agg(F.sum("salary").alias("total_salary"))
    .show()
)

+------------+------------+
|        city|total_salary|
+------------+------------+
|     Phoenix|       75000|
|      Dallas|       77000|
| San Antonio|       85000|
|Philadelphia|       72000|
| Los Angeles|       80000|
|   San Diego|       68000|
|     Chicago|       60000|
|    San Jose|       82000|
|     Houston|       90000|
|    New York|       70000|
+------------+------------+



In [16]:
# Number of rows in df. count() is an action, so it runs a Spark job rather than
# just extending the query plan.
df.count()  # Total number of rows

10

In [17]:
# Youngest first: order rows by age, ascending (orderBy is an alias of sort).
df.orderBy("age", ascending=True).show()

+---+---------------+---+------------+------+
| id|           name|age|        city|salary|
+---+---------------+---+------------+------+
|  3|  Alice Johnson| 22|     Chicago| 60000|
|  8|   Sophia Moore| 26|   San Diego| 68000|
|  1|       John Doe| 28|    New York| 70000|
|  6|     Eva Wilson| 29|Philadelphia| 72000|
|  5|  Charlie Davis| 30|     Phoenix| 75000|
| 10|Olivia Anderson| 31|    San Jose| 82000|
|  9| William Taylor| 33|      Dallas| 77000|
|  2|     Jane Smith| 34| Los Angeles| 80000|
|  7| Michael Miller| 40| San Antonio| 85000|
|  4|      Bob Brown| 45|     Houston| 90000|
+---+---------------+---+------------+------+



In [19]:
# Highest paid first: order rows by salary, descending (orderBy is an alias of sort).
df.orderBy("salary", ascending=False).show()


+---+---------------+---+------------+------+
| id|           name|age|        city|salary|
+---+---------------+---+------------+------+
|  4|      Bob Brown| 45|     Houston| 90000|
|  7| Michael Miller| 40| San Antonio| 85000|
| 10|Olivia Anderson| 31|    San Jose| 82000|
|  2|     Jane Smith| 34| Los Angeles| 80000|
|  9| William Taylor| 33|      Dallas| 77000|
|  5|  Charlie Davis| 30|     Phoenix| 75000|
|  6|     Eva Wilson| 29|Philadelphia| 72000|
|  1|       John Doe| 28|    New York| 70000|
|  8|   Sophia Moore| 26|   San Diego| 68000|
|  3|  Alice Johnson| 22|     Chicago| 60000|
+---+---------------+---+------------+------+



In [20]:
# Department lookup rows as (id, department) tuples, in id order.
# NOTE(review): ids 1-5 presumably correspond to the `id` column of the salary
# data shown above (which has ids 1-10) — confirm the intended join key.
department_by_id = {
    1: "Engineering",
    2: "HR",
    3: "Finance",
    4: "Engineering",
    5: "Finance",
}
data = list(department_by_id.items())

In [21]:
# Column names for the department rows, in the same order as the fields of each
# (id, department) tuple in `data`.
columns = ["id", "department"]

In [22]:
# Build the department DataFrame from the in-memory rows; passing a list of
# names as the schema lets Spark infer each column's type from the data.
dept_df = spark.createDataFrame(data=data, schema=columns)