#### In this notebook we will go through the following topics:
- PySpark Dataframe
- Reading The Dataset
- Checking the Datatypes of the Column(Schema)
- Selecting Columns And Indexing
- Using the describe() option, similar to pandas
- Adding Columns
- Dropping columns
- Renaming Columns

In [5]:
# Mount Google Drive into the Colab runtime at /content/drive
# (the dataset read later in this notebook lives under this mount point)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Import the pyspark library
import pyspark

In [2]:
# Initializing the SparkSession
from pyspark.sql import SparkSession

In [3]:
# Build the SparkSession object named `spark`, using 'Dataframe' as the app name
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [4]:
# Display the details of the SparkSession object that was just created
spark

In [6]:
# Read the 'PySpark_test1.csv' dataset as df_pyspark, using the first row as the header.
# inferSchema=True makes Spark infer each column's type from the data;
# otherwise, all the columns are read as strings.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('/content/drive/MyDrive/Datasets/PySpark_test1.csv', inferSchema=True)
)

In [7]:
# Check the schema: column names, data types and nullability
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [8]:
# Simpler form: pass header and inferSchema directly as keyword arguments of csv()
df_pyspark = spark.read.csv(
    '/content/drive/MyDrive/Datasets/PySpark_test1.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [9]:
# Check the schema of df_pyspark after re-reading it with the combined csv() call
df_pyspark.printSchema() # Same schema as the one obtained with the option() form

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [10]:
# Check the Python type of the df_pyspark data frame
print(type(df_pyspark))

<class 'pyspark.sql.dataframe.DataFrame'>


In [11]:
# Get the first three records of the df_pyspark data frame (returned as a list of Row objects)
df_pyspark.head(3)

[Row(Name='Krish', age=31, Experience=10, Salary=30000),
 Row(Name='Sudhanshu', age=30, Experience=8, Salary=25000),
 Row(Name='Sunny', age=29, Experience=4, Salary=20000)]

In [13]:
# Get the list of column names of the df_pyspark data frame
df_pyspark.columns

['Name', 'age', 'Experience', 'Salary']

In [17]:
# How to access columns and their entries in the df_pyspark data frame:
# use the select() method, then call the show() action to display the result.
# select() itself returns a data frame.

# Single-column examples:
# df_pyspark.select('Name').show()
# df_pyspark.select('age').show()
df_pyspark.select(['Name','Experience']).show() # To access more than one column, pass a list of column names

+---------+----------+
|     Name|Experience|
+---------+----------+
|    Krish|        10|
|Sudhanshu|         8|
|    Sunny|         4|
|     Paul|         3|
|   Harsha|         1|
|  Shubham|         2|
+---------+----------+



In [18]:
# Accessing the 'Name' column with pandas-style indexing
df_pyspark['Name'] # This returns a Column object, not a data frame, so .show() will not work on it

Column<'Name'>

In [20]:
# Check the data types of the columns in df_pyspark
# This is similar to the dtypes attribute of a pandas data frame
# It returns a list of (column name, type) tuples
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [22]:
# Get the summary statistics of the df_pyspark data frame
# We use .describe().show() to obtain the results
# Unlike a pandas data frame, the describe() method of pyspark also returns results for string columns like 'Name'
# df_pyspark.describe() on its own only returns a data frame (displayed as column names and types),
# not the computed statistics
# To see the statistics, .show() is called on the result
df_pyspark.describe().show()

+-------+------+------------------+-----------------+------------------+
|summary|  Name|               age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  NULL|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  NULL| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|Harsha|                21|                1|             15000|
|    max| Sunny|                31|               10|             30000|
+-------+------+------------------+-----------------+------------------+



In [23]:
# Adding a column to the pyspark data frame df_pyspark
# The new column 'Experience2' holds the experience two years from now.
# New columns are created with withColumn(), which takes two arguments.
# withColumn() is not an in-place operation: it returns a new data frame,
# so the result has to be assigned to a variable (here, back to df_pyspark).
df_pyspark = df_pyspark.withColumn(
    'Experience2',                 # first argument: name of the new column
    df_pyspark['Experience'] + 2,  # second argument: expression defining the new column
)

In [24]:
# Show the complete data frame, including the newly created column
df_pyspark.show()

+---------+---+----------+------+-----------+
|     Name|age|Experience|Salary|Experience2|
+---------+---+----------+------+-----------+
|    Krish| 31|        10| 30000|         12|
|Sudhanshu| 30|         8| 25000|         10|
|    Sunny| 29|         4| 20000|          6|
|     Paul| 24|         3| 20000|          5|
|   Harsha| 21|         1| 15000|          3|
|  Shubham| 23|         2| 18000|          4|
+---------+---+----------+------+-----------+



In [25]:
# Drop columns: specifically, drop the column 'Experience2'
# The drop() method is used, much like with a pandas data frame
# It takes column names as arguments: one name, or several names passed as
# separate arguments, e.g. drop('a', 'b') or drop(*list_of_names).
# NOTE: passing a Python list directly, e.g. drop(['a', 'b']), is not accepted by PySpark.
df_pyspark = df_pyspark.drop('Experience2')

# This is also not an in-place operation
# The result needs to be assigned to a variable

In [26]:
# Check the data frame after dropping the 'Experience2' column
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [27]:
# Rename a column: the old name is 'Name' and the new name is 'New Name'
# This uses the withColumnRenamed() method
# It takes two inputs: the first is the old name, the second is the new name
df_pyspark = df_pyspark.withColumnRenamed('Name','New Name')

# This is also not an in-place operation
# The result needs to be assigned to a variable

In [28]:
# Check the pyspark data frame after renaming the column
df_pyspark.show()

+---------+---+----------+------+
| New Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+

