- Import necessary modules (`pyspark.sql.SparkSession`)
- Create a Spark session
- Load a CSV file into a Spark DataFrame (`df`)
- Display the first few rows of the DataFrame using `show()`
- Check the schema of the DataFrame using `printSchema()`
- Retrieve the type of the DataFrame
- List the columns of the DataFrame
- Select specific columns from the DataFrame
- Add a new column to the DataFrame (`bonus_salary`)
- Rename a column in the DataFrame (`name` to `full_name`)
- Drop a column from the DataFrame (`bonus_salary`)
- Describe the DataFrame using `describe().show()`

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start a Spark session (or reuse an already running one) under the app name "DataFrame".
spark = (
    SparkSession.builder
    .appName("DataFrame")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/20 01:39:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/20 01:39:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Evaluate the session object as the cell's last expression so the notebook displays it.
spark

In [58]:
# Read the employee CSV: treat the first line as column names and let Spark
# infer each column's data type instead of loading everything as strings.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("csvFiles/empexp.csv")

In [18]:
# Preview the first 5 rows as a formatted text table.
df.show(5)

+-------------+---+----------+------+
|         name|age|experience|salary|
+-------------+---+----------+------+
|     John Doe| 28|         5| 50000|
|   Jane Smith| 34|        10| 70000|
|Alice Johnson| 25|         3| 45000|
|    Bob Brown| 45|        20| 90000|
|Charlie Davis| 30|         7| 60000|
+-------------+---+----------+------+
only showing top 5 rows



In [21]:
# head(n) returns the first n rows as a list of Row objects rather than printing a table.
df.head(3)

[Row(name='John Doe', age=28, experience=5, salary=50000),
 Row(name='Jane Smith', age=34, experience=10, salary=70000),
 Row(name='Alice Johnson', age=25, experience=3, salary=45000)]

In [22]:
# Print the first 3 rows as a formatted table (compare with the Row list from head(3)).
df.show(3)

+-------------+---+----------+------+
|         name|age|experience|salary|
+-------------+---+----------+------+
|     John Doe| 28|         5| 50000|
|   Jane Smith| 34|        10| 70000|
|Alice Johnson| 25|         3| 45000|
+-------------+---+----------+------+
only showing top 3 rows



In [24]:
# Print the schema: each column's name, inferred data type, and nullability.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [25]:
# Confirm that the loaded object is a PySpark DataFrame.
type(df)

pyspark.sql.dataframe.DataFrame

In [26]:
# List the column names as a plain Python list.
df.columns

['name', 'age', 'experience', 'salary']

In [30]:
# Project a single column by name and preview its first 5 rows.
df.select("name").show(5)

+-------------+
|         name|
+-------------+
|     John Doe|
|   Jane Smith|
|Alice Johnson|
|    Bob Brown|
|Charlie Davis|
+-------------+
only showing top 5 rows



In [31]:
# Project several columns at once by passing each column name as its own argument.
name_and_salary = df.select("name", "salary")
name_and_salary.show(5)

+-------------+------+
|         name|salary|
+-------------+------+
|     John Doe| 50000|
|   Jane Smith| 70000|
|Alice Johnson| 45000|
|    Bob Brown| 90000|
|Charlie Davis| 60000|
+-------------+------+
only showing top 5 rows



In [34]:
# Indexing by column name returns a Column object (an expression), not the column's values.
df['name']

Column<'name'>

In [35]:
# dtypes gives (column name, type string) pairs for every column.
df.dtypes

[('name', 'string'), ('age', 'int'), ('experience', 'int'), ('salary', 'int')]

In [37]:
# Re-check the first 5 rows before computing summary statistics.
df.show(5)

+-------------+---+----------+------+
|         name|age|experience|salary|
+-------------+---+----------+------+
|     John Doe| 28|         5| 50000|
|   Jane Smith| 34|        10| 70000|
|Alice Johnson| 25|         3| 45000|
|    Bob Brown| 45|        20| 90000|
|Charlie Davis| 30|         7| 60000|
+-------------+---+----------+------+
only showing top 5 rows



In [41]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# For the string column `name`, mean and stddev come back as NULL.
df.describe().show()

+-------+-------------+-----------------+-----------------+------------------+
|summary|         name|              age|       experience|            salary|
+-------+-------------+-----------------+-----------------+------------------+
|  count|           20|               20|               20|                20|
|   mean|         NULL|             32.4|             8.95|           63850.0|
| stddev|         NULL|5.761944116355173|4.936171540139606|14187.744897028488|
|    min|Alice Johnson|               24|                2|             42000|
|    max|Rachel Carter|               45|               20|             90000|
+-------+-------------+-----------------+-----------------+------------------+



Adding, renaming, and dropping columns in the DataFrame

In [60]:
# Add a derived column: bonus_salary = salary + 500 (a flat bonus per row).
# The result is assigned back to df so later cells see the extra column.
df = df.withColumn("bonus_salary", df['salary']+500)

In [61]:
# Verify that the bonus_salary column now appears alongside the original columns.
df.show(5)

+-------------+---+----------+------+------------+
|         name|age|experience|salary|bonus_salary|
+-------------+---+----------+------+------------+
|     John Doe| 28|         5| 50000|       50500|
|   Jane Smith| 34|        10| 70000|       70500|
|Alice Johnson| 25|         3| 45000|       45500|
|    Bob Brown| 45|        20| 90000|       90500|
|Charlie Davis| 30|         7| 60000|       60500|
+-------------+---+----------+------+------------+
only showing top 5 rows



In [62]:
# Rename the `name` column to `full_name`; the data in the column is unchanged.
df = df.withColumnRenamed("name", "full_name")

In [63]:
# Verify that the first column header now reads full_name.
df.show(5)

+-------------+---+----------+------+------------+
|    full_name|age|experience|salary|bonus_salary|
+-------------+---+----------+------+------------+
|     John Doe| 28|         5| 50000|       50500|
|   Jane Smith| 34|        10| 70000|       70500|
|Alice Johnson| 25|         3| 45000|       45500|
|    Bob Brown| 45|        20| 90000|       90500|
|Charlie Davis| 30|         7| 60000|       60500|
+-------------+---+----------+------+------------+
only showing top 5 rows



In [67]:
# Remove the bonus_salary column again, leaving the other columns in place.
df = df.drop("bonus_salary")

In [68]:
# Verify that bonus_salary is gone and the remaining columns are intact.
df.show(5)

+-------------+---+----------+------+
|    full_name|age|experience|salary|
+-------------+---+----------+------+
|     John Doe| 28|         5| 50000|
|   Jane Smith| 34|        10| 70000|
|Alice Johnson| 25|         3| 45000|
|    Bob Brown| 45|        20| 90000|
|Charlie Davis| 30|         7| 60000|
+-------------+---+----------+------+
only showing top 5 rows

