## In the section below we will cover the following topics.

* PySpark DataFrame
* Reading the dataset
* Checking the datatypes of the columns (schema)
* Selecting columns and indexing
* Using the describe option, similar to pandas
* Adding and dropping columns
* Renaming the columns

In [9]:
import pyspark

In [7]:
from pyspark.sql import SparkSession

In [10]:
# Create (or reuse) the SparkSession, named "Dataframe", that every read below goes through.
spark = SparkSession.builder.appName("Dataframe").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/11 11:51:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/07/11 11:51:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [11]:
# Display the session object to confirm that it was created.
spark

In [14]:
# Read the CSV with no options: the columns get the default names _c0, _c1, ...
# and every column is typed as string.
spark.read.csv("test1.csv")

DataFrame[_c0: string, _c1: string, _c2: string]

In [13]:
# Without the header option, the header line of the file shows up as an ordinary data row.
spark.read.csv("test1.csv").show()

+---------+---+----------+
|      _c0|_c1|       _c2|
+---------+---+----------+
|     Name|Age|Experience|
|sudhanshu| 30|        10|
|      adi| 33|         8|
|    swati| 34|         6|
|     asdi| 56|         9|
+---------+---+----------+



In [16]:
# Use the first row of the file as the column names instead of treating it as data

spark.read.option('header', 'True').csv('test1.csv').show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|sudhanshu| 30|        10|
|      adi| 33|         8|
|    swati| 34|         6|
|     asdi| 56|         9|
+---------+---+----------+



In [17]:
# Keep the header-aware DataFrame in a variable so it can be inspected in the next cells.
df_pyspark1 = spark.read.option('header', 'True').csv('test1.csv')

In [18]:
# Confirm that the reader returned a PySpark DataFrame.
type(df_pyspark1)

pyspark.sql.dataframe.DataFrame

In [20]:
## Check the schema
df_pyspark1.printSchema()

## By default every column is read as string unless inferSchema=True is passed when reading the CSV file.

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [21]:
# Same read as before, but additionally ask Spark to infer each column's type from the data.
df_pyspark2 = (
    spark.read
    .option('header', 'True')
    .csv('test1.csv', inferSchema=True)
)

In [22]:
# With inferSchema enabled, Age and Experience are reported as integer instead of string.
df_pyspark2.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [23]:
## Another way: pass the options directly as keyword arguments to csv()

df_pyspark3 = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)

In [25]:
# The schema matches df_pyspark2: both ways of passing the options give the same column types.
df_pyspark3.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [26]:
# Check the type of the object returned by the reader.
type(df_pyspark3)
# A DataFrame is the data structure that holds the loaded data

pyspark.sql.dataframe.DataFrame

In [27]:
# List the column names.
df_pyspark3.columns

['Name', 'Age', 'Experience']

In [28]:
df_pyspark3.head(2)

# head(2) returns the first two rows as a Python list of Row objects, not as a DataFrame

[Row(Name='sudhanshu', Age=30, Experience=10),
 Row(Name='adi', Age=33, Experience=8)]

In [29]:
## How do I select columns? First, look at the full DataFrame.

df_pyspark3.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|sudhanshu| 30|        10|
|      adi| 33|         8|
|    swati| 34|         6|
|     asdi| 56|         9|
+---------+---+----------+



In [30]:
# How to select only the Name column: select() returns a new DataFrame,
# so only its schema is echoed here; call .show() to print the rows.

df_pyspark3.select('Name')

DataFrame[Name: string]

In [31]:
# Print the rows of the selected column.
df_pyspark3.select('Name').show()

+---------+
|     Name|
+---------+
|sudhanshu|
|      adi|
|    swati|
|     asdi|
+---------+



In [32]:
# Selecting a single column with select() still gives a DataFrame.
type(df_pyspark3.select('Name'))

pyspark.sql.dataframe.DataFrame

In [33]:
# Select multiple columns by passing several column names to select()
df_pyspark3.select('Name', 'Experience')



DataFrame[Name: string, Experience: int]

In [34]:
# Print the rows of the two selected columns.
df_pyspark3.select('Name', 'Experience').show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|sudhanshu|        10|
|      adi|         8|
|    swati|         6|
|     asdi|         9|
+---------+----------+



In [35]:
# Another way, like we do in pandas. Note that indexing with a column name
# returns a Column object rather than a DataFrame.

df_pyspark3['Name']

Column<'Name'>

In [36]:
# List each column name together with its data type.
df_pyspark3.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [37]:
# Most of the functionality is similar to pandas

In [39]:
# Describe option: describe() returns a DataFrame of summary statistics,
# so only its schema is echoed here.

df_pyspark3.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string]

In [40]:
# Print count, mean, stddev, min and max for every column.
df_pyspark3.describe().show()

+-------+-----+------------------+-----------------+
|summary| Name|               Age|       Experience|
+-------+-----+------------------+-----------------+
|  count|    4|                 4|                4|
|   mean| null|             38.25|             8.25|
| stddev| null|11.954775893619532|1.707825127659933|
|    min|  adi|                30|                6|
|    max|swati|                56|               10|
+-------+-----+------------------+-----------------+



In [41]:
## Adding columns to a PySpark DataFrame: withColumn() returns a new DataFrame
## that contains the extra column.

df_pyspark3.withColumn('Experience after 2 Year', df_pyspark3['Experience']+2)



DataFrame[Name: string, Age: int, Experience: int, Experience after 2 Year: int]

In [42]:
# Print the DataFrame with the added column.
df_pyspark3.withColumn('Experience after 2 Year', df_pyspark3['Experience']+2).show()

+---------+---+----------+-----------------------+
|     Name|Age|Experience|Experience after 2 Year|
+---------+---+----------+-----------------------+
|sudhanshu| 30|        10|                     12|
|      adi| 33|         8|                     10|
|    swati| 34|         6|                      8|
|     asdi| 56|         9|                     11|
+---------+---+----------+-----------------------+



In [43]:
# Assign the result back so that df_pyspark3 keeps the new column.
df_pyspark3 = df_pyspark3.withColumn(
    'Experience after 2 Year',
    df_pyspark3['Experience'] + 2,
)

In [44]:
# Confirm that df_pyspark3 now contains the new column.
df_pyspark3.show()

+---------+---+----------+-----------------------+
|     Name|Age|Experience|Experience after 2 Year|
+---------+---+----------+-----------------------+
|sudhanshu| 30|        10|                     12|
|      adi| 33|         8|                     10|
|    swati| 34|         6|                      8|
|     asdi| 56|         9|                     11|
+---------+---+----------+-----------------------+



In [45]:
## Dropping columns: drop() returns a new DataFrame without the named column

df_pyspark3.drop('Experience after 2 Year')

DataFrame[Name: string, Age: int, Experience: int]

In [47]:
# Print the DataFrame returned by drop().
df_pyspark3.drop('Experience after 2 Year').show()


+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|sudhanshu| 30|        10|
|      adi| 33|         8|
|    swati| 34|         6|
|     asdi| 56|         9|
+---------+---+----------+



In [48]:
# Assign the result back so that the column is removed from df_pyspark3.
df_pyspark3 = df_pyspark3.drop('Experience after 2 Year')

In [49]:
# Confirm that df_pyspark3 is back to the original three columns.
df_pyspark3.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|sudhanshu| 30|        10|
|      adi| 33|         8|
|    swati| 34|         6|
|     asdi| 56|         9|
+---------+---+----------+



In [50]:
## Rename a column: withColumnRenamed(existing_name, new_name) returns a new DataFrame

df_pyspark3.withColumnRenamed('Name', 'New Name')

DataFrame[New Name: string, Age: int, Experience: int]

In [51]:
# Print the DataFrame with the renamed column.
df_pyspark3.withColumnRenamed('Name', 'New Name').show()

+---------+---+----------+
| New Name|Age|Experience|
+---------+---+----------+
|sudhanshu| 30|        10|
|      adi| 33|         8|
|    swati| 34|         6|
|     asdi| 56|         9|
+---------+---+----------+

