In [2]:
from pyspark.sql import SparkSession
import os
# Tell PySpark which JDK to launch; this must run before the SparkSession is built.
# Raw string: "\P", "\J" and "\j" are invalid escape sequences in a normal
# literal (SyntaxWarning on Python 3.12+); r"..." keeps the backslashes literal.
# NOTE(review): machine-specific path — adjust to the local JDK install.
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-22"

In [3]:
# Start a Spark session named 'Dataframe', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [4]:
# Display the SparkSession object as the cell output.
spark

In [5]:
# Read the dataset, treating the first line of the CSV as column names.
# Without schema inference every column is loaded as a string.
(
    spark.read
    .option('header', 'true')
    .csv('age.csv')
)

DataFrame[Name: string, Age: string, Experience: string]

In [6]:
# Read the CSV with a header row and print its rows as a text table.
(
    spark.read
    .option('header', 'true')
    .csv('age.csv')
    .show()
)

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|    John| 25|         7|
|   Emily| 30|         3|
| Michael| 28|         9|
|   Sarah| 35|         2|
|   David| 40|         8|
| Jessica| 22|         5|
|  Daniel| 33|         1|
|  Rachel| 29|         6|
| Matthew| 26|         4|
|Jennifer| 31|        10|
+--------+---+----------+



In [7]:
# Keep the header-aware read in a variable so the following cells can inspect it.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('age.csv')
)

In [8]:
# Check the schema: without inferSchema, every column is read as a string.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [9]:
# Re-read the file with schema inference so numeric columns get numeric types.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('age.csv', inferSchema=True)
)

In [10]:
# Check the schema again: with inferSchema=True, Age and Experience are now integers.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [11]:
# Same read, with the header and schema options passed as keyword arguments to csv().
df_pyspark = spark.read.csv(
    'age.csv',
    header=True,
    inferSchema=True,
)
# Print the loaded rows as a text table.
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|    John| 25|         7|
|   Emily| 30|         3|
| Michael| 28|         9|
|   Sarah| 35|         2|
|   David| 40|         8|
| Jessica| 22|         5|
|  Daniel| 33|         1|
|  Rachel| 29|         6|
| Matthew| 26|         4|
|Jennifer| 31|        10|
+--------+---+----------+



In [12]:
# Check the schema once more: the keyword-argument read infers the same column types.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [13]:
# Check the Python type of the loaded object: a PySpark DataFrame.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [14]:
# List the column names as a plain Python list of strings.
df_pyspark.columns

['Name', 'Age', 'Experience']

In [15]:
# Fetch the first 3 rows; the result is a list of Row objects, not a DataFrame.
df_pyspark.head(3)

[Row(Name='John', Age=25, Experience=7),
 Row(Name='Emily', Age=30, Experience=3),
 Row(Name='Michael', Age=28, Experience=9)]

In [16]:
# Display the rows of the DataFrame as a text table.
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|    John| 25|         7|
|   Emily| 30|         3|
| Michael| 28|         9|
|   Sarah| 35|         2|
|   David| 40|         8|
| Jessica| 22|         5|
|  Daniel| 33|         1|
|  Rachel| 29|         6|
| Matthew| 26|         4|
|Jennifer| 31|        10|
+--------+---+----------+



In [17]:
# Select a single column; the result is still a DataFrame, and no rows are
# printed until show() is called on it.
df_pyspark.select('Name')

DataFrame[Name: string]

In [18]:
# Print the contents of the selected 'Name' column.
df_pyspark.select('Name').show()

+--------+
|    Name|
+--------+
|    John|
|   Emily|
| Michael|
|   Sarah|
|   David|
| Jessica|
|  Daniel|
|  Rachel|
| Matthew|
|Jennifer|
+--------+



In [19]:
# Confirm that select() returns a DataFrame.
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

In [20]:
# Select several columns at once; the result is a two-column DataFrame.
df_pyspark.select('Name', 'Experience')

DataFrame[Name: string, Experience: int]

In [21]:
# Print the rows of the two selected columns.
(
    df_pyspark
    .select('Name', 'Experience')
    .show()
)

+--------+----------+
|    Name|Experience|
+--------+----------+
|    John|         7|
|   Emily|         3|
| Michael|         9|
|   Sarah|         2|
|   David|         8|
| Jessica|         5|
|  Daniel|         1|
|  Rachel|         6|
| Matthew|         4|
|Jennifer|        10|
+--------+----------+



In [22]:
# Indexing with a column name yields a Column object rather than a DataFrame.
df_pyspark['Name']

Column<'Name'>

In [24]:
# Check data types: a list of (column name, type name) pairs.
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [26]:
# describe() returns a summary DataFrame in which every column is a string;
# nothing is printed until show() is called on it.
df_pyspark.describe()


DataFrame[summary: string, Name: string, Age: string, Experience: string]

In [27]:
# Print the summary statistics: count, mean, stddev, min and max for each column.
df_pyspark.describe().show()

+-------+------+-----------------+------------------+
|summary|  Name|              Age|        Experience|
+-------+------+-----------------+------------------+
|  count|    10|               10|                10|
|   mean|  NULL|             29.9|               5.5|
| stddev|  NULL|5.216427044549852|3.0276503540974917|
|    min|Daniel|               22|                 1|
|    max| Sarah|               40|                10|
+-------+------+-----------------+------------------+



In [28]:
# Adding a column: withColumn returns a new DataFrame with the extra column.
# The result is not assigned here, so df_pyspark keeps its three columns.
df_pyspark.withColumn(
    'Experience After 2 years',
    df_pyspark['Experience'] + 2,
)

DataFrame[Name: string, Age: int, Experience: int, Experience After 2 years: int]

In [30]:
# Preview the derived column (Experience + 2) without modifying df_pyspark.
(
    df_pyspark
    .withColumn(
        'Experience After 2 years',
        df_pyspark['Experience'] + 2,
    )
    .show()
)

+--------+---+----------+------------------------+
|    Name|Age|Experience|Experience After 2 years|
+--------+---+----------+------------------------+
|    John| 25|         7|                       9|
|   Emily| 30|         3|                       5|
| Michael| 28|         9|                      11|
|   Sarah| 35|         2|                       4|
|   David| 40|         8|                      10|
| Jessica| 22|         5|                       7|
|  Daniel| 33|         1|                       3|
|  Rachel| 29|         6|                       8|
| Matthew| 26|         4|                       6|
|Jennifer| 31|        10|                      12|
+--------+---+----------+------------------------+



In [31]:
# Keep the derived column by rebinding df_pyspark to the new DataFrame.
df_pyspark = df_pyspark.withColumn(
    'Experience After 2 years',
    df_pyspark['Experience'] + 2,
)

In [33]:
# Confirm that the reassigned DataFrame now contains the new column.
df_pyspark.show()

+--------+---+----------+------------------------+
|    Name|Age|Experience|Experience After 2 years|
+--------+---+----------+------------------------+
|    John| 25|         7|                       9|
|   Emily| 30|         3|                       5|
| Michael| 28|         9|                      11|
|   Sarah| 35|         2|                       4|
|   David| 40|         8|                      10|
| Jessica| 22|         5|                       7|
|  Daniel| 33|         1|                       3|
|  Rachel| 29|         6|                       8|
| Matthew| 26|         4|                       6|
|Jennifer| 31|        10|                      12|
+--------+---+----------+------------------------+



In [34]:
# Drop a column: drop() returns a new DataFrame without it.
# The result is not assigned here, so df_pyspark still has the column.
df_pyspark.drop('Experience After 2 years')

DataFrame[Name: string, Age: int, Experience: int]

In [35]:
# Rebind df_pyspark to the result so the drop is kept.
df_pyspark = df_pyspark.drop('Experience After 2 years')
# Display the rows to confirm the column is gone.
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|    John| 25|         7|
|   Emily| 30|         3|
| Michael| 28|         9|
|   Sarah| 35|         2|
|   David| 40|         8|
| Jessica| 22|         5|
|  Daniel| 33|         1|
|  Rachel| 29|         6|
| Matthew| 26|         4|
|Jennifer| 31|        10|
+--------+---+----------+



In [36]:
# Rename a column: 'Name' is shown as 'New Name' in the printed result.
# The renamed DataFrame is not assigned, so df_pyspark keeps the original name.
(
    df_pyspark
    .withColumnRenamed('Name', 'New Name')
    .show()
)

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|    John| 25|         7|
|   Emily| 30|         3|
| Michael| 28|         9|
|   Sarah| 35|         2|
|   David| 40|         8|
| Jessica| 22|         5|
|  Daniel| 33|         1|
|  Rachel| 29|         6|
| Matthew| 26|         4|
|Jennifer| 31|        10|
+--------+---+----------+

