In [1]:
import os
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Point PySpark at the local JDK.
# NOTE: this path is machine-specific; adjust it to the JDK installed on your system.
os.environ["JAVA_HOME"] = "C:/Program Files/Java/jdk-22"

# Prepend the JDK's bin directory to PATH using the platform separator
# (os.pathsep is ";" on Windows and ":" on POSIX). A hard-coded ":" would fuse
# the JDK entry with the following PATH entry on Windows. The membership check
# keeps the cell idempotent, so re-running it does not keep growing PATH.
java_bin = os.path.join(os.environ["JAVA_HOME"], "bin")
current_path = os.environ.get("PATH", "")
if java_bin not in current_path.split(os.pathsep):
    os.environ["PATH"] = (
        java_bin + os.pathsep + current_path if current_path else java_bin
    )

In [3]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Dataframe")
    .getOrCreate()
)

In [4]:
# Bare expression: the notebook renders the SparkSession object as this cell's output.
spark

In [5]:
# Quick preview of the CSV, treating its first line as the header row.
(
    spark.read
    .option("header", "true")
    .csv("Datasets/testfile1.csv")
    .show()
)

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|     Yash| 23|        10|
|    Mohan| 25|         4|
|Sudhanshu| 30|         8|
+---------+---+----------+



In [6]:
'''
I was originally using df_pyspark = spark.read.option("header", "true").csv("Datasets/testfile1.csv").show(),
so printSchema() did not work: .show() returns None, which left df_pyspark set to None. Removing .show() fixed it.
'''

'\nI was originally using df_pyspark = spark.read.option("header", "true").csv("Datasets/testfile1.csv").show(),\nso printSchema() did not work: .show() returns None, which left df_pyspark set to None. Removing .show() fixed it.\n'

In [7]:
# Keep the DataFrame itself (no trailing .show()) so it can be inspected further.
df_pyspark = (
    spark.read
    .option("header", "true")
    .csv("Datasets/testfile1.csv")
)

In [9]:
# Without inferSchema, every CSV column is read as a string, so Age and Experience
# show up as string here; the next cell fixes this by passing inferSchema=True.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [10]:
# Reload with schema inference so that numeric columns get numeric types,
# then print the schema to confirm.
df_pyspark = (
    spark.read
    .option("header", "true")
    .csv("Datasets/testfile1.csv", inferSchema=True)
)
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [11]:
# Same load as before, with both options passed as keyword arguments to csv().
df_pyspark = spark.read.csv(
    "Datasets/testfile1.csv",
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|     Yash| 23|        10|
|    Mohan| 25|         4|
|Sudhanshu| 30|         8|
+---------+---+----------+



In [12]:
# Re-check the schema after loading with header=True and inferSchema=True:
# Age and Experience are now integers.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [13]:
# The reader returns a pyspark.sql.dataframe.DataFrame.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [14]:
# Column names as a plain Python list of strings.
df_pyspark.columns

['Name', 'Age', 'Experience']

In [17]:
# With no argument, head() returns only the first row, as a single Row object.
df_pyspark.head()

Row(Name='Yash', Age=23, Experience=10)

In [18]:
# head(n) returns the first n rows as a Python list of Row objects.
df_pyspark.head(3)

[Row(Name='Yash', Age=23, Experience=10),
 Row(Name='Mohan', Age=25, Experience=4),
 Row(Name='Sudhanshu', Age=30, Experience=8)]

In [19]:
# select() returns a new DataFrame; without .show() the cell displays only its
# schema summary, not the rows.
df_pyspark.select("Name")

DataFrame[Name: string]

In [21]:
# Selecting a single column still yields a DataFrame
# (contrast with df_pyspark['Name'], which is a Column).
type(df_pyspark.select("Name"))

pyspark.sql.dataframe.DataFrame

In [20]:
# .show() prints the rows of the selected column.
df_pyspark.select("Name").show()

+---------+
|     Name|
+---------+
|     Yash|
|    Mohan|
|Sudhanshu|
+---------+



In [22]:
# Pass a list of names to select several columns at once; the result is again a DataFrame.
df_pyspark.select(["Name", "Experience"])

DataFrame[Name: string, Experience: int]

In [23]:
# Print the rows of the two-column selection.
df_pyspark.select(["Name", "Experience"]).show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|     Yash|        10|
|    Mohan|         4|
|Sudhanshu|         8|
+---------+----------+



In [24]:
# Bracket indexing yields a Column object, not a DataFrame.
df_pyspark["Name"]

Column<'Name'>

In [27]:
# (column name, type name) pairs; note the short type name 'int' here versus
# 'integer' in the printSchema() output.
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [28]:
# describe() returns a DataFrame of summary statistics; every column in it,
# including the numeric summaries, is typed as string.
df_pyspark.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string]

In [29]:
# Show count, mean, stddev, min and max per column; mean and stddev are NULL
# for the string column Name.
df_pyspark.describe().show()

+-------+-----+-----------------+------------------+
|summary| Name|              Age|        Experience|
+-------+-----+-----------------+------------------+
|  count|    3|                3|                 3|
|   mean| NULL|             26.0| 7.333333333333333|
| stddev| NULL|3.605551275463989|3.0550504633038935|
|    min|Mohan|               23|                 4|
|    max| Yash|               30|                10|
+-------+-----+-----------------+------------------+



In [33]:
# ADDING COLUMNS
# Build the derived column expression first, then preview the DataFrame with it
# attached. The result is only shown, not assigned, so df_pyspark is unchanged.
experience_in_two_years = df_pyspark["Experience"] + 2
df_pyspark.withColumn("Experience After 2 Years", experience_in_two_years).show()

+---------+---+----------+------------------------+
|     Name|Age|Experience|Experience After 2 Years|
+---------+---+----------+------------------------+
|     Yash| 23|        10|                      12|
|    Mohan| 25|         4|                       6|
|Sudhanshu| 30|         8|                      10|
+---------+---+----------+------------------------+



In [34]:
# df_pyspark is unchanged: the withColumn() result in the previous cell was
# shown but never assigned.
df_pyspark.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|     Yash| 23|        10|
|    Mohan| 25|         4|
|Sudhanshu| 30|         8|
+---------+---+----------+



In [35]:
# Assign the result back so the new column is kept on df_pyspark.
df_pyspark = df_pyspark.withColumn(
    "Experience After 2 Years",
    df_pyspark["Experience"] + 2,
)

In [36]:
# The reassigned DataFrame now includes the "Experience After 2 Years" column.
df_pyspark.show()

+---------+---+----------+------------------------+
|     Name|Age|Experience|Experience After 2 Years|
+---------+---+----------+------------------------+
|     Yash| 23|        10|                      12|
|    Mohan| 25|         4|                       6|
|Sudhanshu| 30|         8|                      10|
+---------+---+----------+------------------------+



In [37]:
# DROP COLUMNS
# drop() returns a new DataFrame without the column; the result is not assigned
# here, so df_pyspark itself still has the column (see the next cell).
df_pyspark.drop("Experience After 2 Years")

DataFrame[Name: string, Age: int, Experience: int]

In [38]:
# The extra column is still present because the drop() result above was not assigned.
df_pyspark.show()

+---------+---+----------+------------------------+
|     Name|Age|Experience|Experience After 2 Years|
+---------+---+----------+------------------------+
|     Yash| 23|        10|                      12|
|    Mohan| 25|         4|                       6|
|Sudhanshu| 30|         8|                      10|
+---------+---+----------+------------------------+



In [39]:
# Assign the result back so the column is actually removed from df_pyspark.
df_pyspark = df_pyspark.drop("Experience After 2 Years")

In [40]:
# The column is gone now that the drop() result was assigned back.
df_pyspark.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|     Yash| 23|        10|
|    Mohan| 25|         4|
|Sudhanshu| 30|         8|
+---------+---+----------+



In [44]:
# RENAME COLUMNS
# withColumnRenamed(existing, new) returns a new DataFrame; assign it back to
# keep the rename. NOTE(review): after this cell the column is "Employee Name",
# so earlier cells that reference "Name" presumably need the DataFrame reloaded
# before they are re-run.
df_pyspark = df_pyspark.withColumnRenamed("Name", "Employee Name")

In [45]:
# Verify the rename: the first column header is now "Employee Name".
df_pyspark.show()

+-------------+---+----------+
|Employee Name|Age|Experience|
+-------------+---+----------+
|         Yash| 23|        10|
|        Mohan| 25|         4|
|    Sudhanshu| 30|         8|
+-------------+---+----------+

