# PySpark DataFrame

In [1]:
import os

from pyspark.sql import SparkSession

In [2]:
# Create the SparkSession used by every cell below, or reuse one that is already
# running (getOrCreate). The application name is the label shown for this
# notebook in the Spark UI and logs.
spark = SparkSession.builder.appName('DataFrame').getOrCreate()

In [3]:
# Location of the sample CSV read by the cells below. Defaults to the original
# local Windows path; set the PYSPARK_TUTORIAL_CSV environment variable to point
# the notebook at a copy of the file stored elsewhere.
filePath = os.environ.get('PYSPARK_TUTORIAL_CSV', 'H:\\PySpark\\Tutorial02\\test1.csv')

In [4]:
# Evaluate the session object as the cell's last expression so the notebook displays it.
spark

In [7]:
# Load the CSV into a DataFrame. The 'header' option makes Spark take column
# names from the first line; inferSchema=True lets it infer column types
# instead of reading every column as a string.
df_spark = (
    spark.read
    .option('header', 'true')
    .csv(filePath, inferSchema=True)
)

In [8]:
# Print the contents of df_spark as a formatted text table.
df_spark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [9]:
# Print each column's name, inferred type and nullability for df_spark.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [10]:
# Read the CSV by passing the reader settings directly to csv() as keyword
# arguments: the first line is the header and column types are inferred.
df_pyspark = spark.read.csv(
    filePath,
    header=True,
    inferSchema=True,
)
# Print the loaded rows to check the result.
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [11]:
# Print each column's name, inferred type and nullability for df_pyspark.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [12]:
# Check the Python type of the object returned by spark.read.csv.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [13]:
# List the column names as a plain Python list of strings.
df_pyspark.columns

['Name', 'age', 'Experience', 'Salary']

In [15]:
# head(3) returns the first three rows as a Python list of Row objects rather than a DataFrame.
df_pyspark.head(3)

[Row(Name='Krish', age=31, Experience=10, Salary=30000),
 Row(Name='Sudhanshu', age=30, Experience=8, Salary=25000),
 Row(Name='Sunny', age=29, Experience=4, Salary=20000)]

In [16]:
# select() returns a new DataFrame holding only the requested column.
# Without .show() the cell displays just the DataFrame's repr, not its rows.
df_pyspark.select('Name')

DataFrame[Name: string]

In [17]:
# Confirm that the result of select() is itself a DataFrame.
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

In [18]:
# Print the values of the single selected column.
df_pyspark.select('Name').show()

+---------+
|     Name|
+---------+
|    Krish|
|Sudhanshu|
|    Sunny|
|     Paul|
|   Harsha|
|  Shubham|
+---------+



In [19]:
# Pass a list of column names to select several columns at once.
# As with a single column, only the DataFrame repr is displayed without .show().
df_pyspark.select(['Name', 'Experience'])

DataFrame[Name: string, Experience: int]

In [20]:
# Print the rows of the two selected columns.
df_pyspark.select(['Name', 'Experience']).show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|    Krish|        10|
|Sudhanshu|         8|
|    Sunny|         4|
|     Paul|         3|
|   Harsha|         1|
|  Shubham|         2|
+---------+----------+



In [21]:
# Selecting multiple columns also yields a DataFrame.
type(df_pyspark.select(['Name', 'Experience']))

pyspark.sql.dataframe.DataFrame

In [22]:
# Indexing with a column name returns a Column object, not the column's data;
# use select(...).show() to see the values.
df_pyspark['Name']

Column<'Name'>

In [23]:
# dtypes lists (column name, type name) pairs as a list of tuples.
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [24]:
# describe() returns a new DataFrame with a 'summary' column plus one string-typed
# column per original column. As written, the cell displays only that DataFrame's
# repr; call .show() on the result to print the statistics themselves.
df_pyspark.describe()

DataFrame[summary: string, Name: string, age: string, Experience: string, Salary: string]

# Adding a column to the existing DataFrame

In [30]:
# Preview a derived column holding Experience + 2 for every row.
# The result is only printed, not assigned, so df_pyspark keeps its original columns.
(
    df_pyspark
    .withColumn('Eperience after 2 year', df_pyspark['Experience'] + 2)
    .show()
)

+---------+---+----------+------+----------------------+
|     Name|age|Experience|Salary|Eperience after 2 year|
+---------+---+----------+------+----------------------+
|    Krish| 31|        10| 30000|                    12|
|Sudhanshu| 30|         8| 25000|                    10|
|    Sunny| 29|         4| 20000|                     6|
|     Paul| 24|         3| 20000|                     5|
|   Harsha| 21|         1| 15000|                     3|
|  Shubham| 23|         2| 18000|                     4|
+---------+---+----------+------+----------------------+



In [31]:
# Rebind df_pyspark to the DataFrame that includes the derived column so that
# the following cells see it.
df_pyspark = df_pyspark.withColumn(
    'Eperience after 2 year',
    df_pyspark['Experience'] + 2,
)

In [33]:
# Print df_pyspark to confirm the added column is now part of it.
df_pyspark.show()

+---------+---+----------+------+----------------------+
|     Name|age|Experience|Salary|Eperience after 2 year|
+---------+---+----------+------+----------------------+
|    Krish| 31|        10| 30000|                    12|
|Sudhanshu| 30|         8| 25000|                    10|
|    Sunny| 29|         4| 20000|                     6|
|     Paul| 24|         3| 20000|                     5|
|   Harsha| 21|         1| 15000|                     3|
|  Shubham| 23|         2| 18000|                     4|
+---------+---+----------+------+----------------------+



# Dropping the column

In [34]:
# Remove the derived column again and rebind df_pyspark to the result.
# NOTE: the name must match the one passed to withColumn character for character
# (including its spelling); drop() silently does nothing for a name that is not
# in the schema.
df_pyspark = df_pyspark.drop('Eperience after 2 year')

In [35]:
# Print df_pyspark to confirm the column has been removed.
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [36]:
# Rename the 'Name' column to 'New Name' and rebind df_pyspark to the result;
# from here on the column must be referenced by its new name.
df_pyspark = df_pyspark.withColumnRenamed('Name', 'New Name')

In [37]:
# Print df_pyspark to confirm the column header has changed.
df_pyspark.show()

+---------+---+----------+------+
| New Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+

