<a href="https://colab.research.google.com/github/banno-0720/big-data/blob/main/PySpark_DataFrames.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook covers
- PySpark DataFrame
- Reading the Dataset
- Checking the data types of the columns (schema)
- Selecting Columns and Indexing
- Using `describe()` for summary statistics, similar to pandas
- Adding Columns
- Dropping Columns
- Renaming Columns

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the SparkSession used by the rest of the notebook, or reuse one that
# already exists; 'DataFrame' is the application name given to it.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

In [3]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [12]:
# Read the dataset (method 1): set the 'header' option on the reader, then
# load the CSV. No schema inference is requested here.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('test1.csv')
)
df_pyspark.show()

+----+---+----------+
|Name|Age|Experience|
+----+---+----------+
| abc| 21|         3|
| def| 18|         0|
| xyz| 24|         4|
+----+---+----------+



In [13]:
# Print the column names and types of the DataFrame loaded with method 1.
# The output below reports every column, including Age and Experience, as string.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [14]:
# Read the dataset (method 2): pass the header and schema-inference flags
# directly to csv() as keyword arguments.
df_pyspark = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+----+---+----------+
|Name|Age|Experience|
+----+---+----------+
| abc| 21|         3|
| def| 18|         0|
| xyz| 24|         4|
+----+---+----------+



In [15]:
# Check the schema again after reading with inferSchema=True:
# the output below now reports Age and Experience as integer.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [16]:
# Inspect the Python type of the object returned by spark.read.csv().
type(df_pyspark)

In [17]:
# head(3) returns the first three rows as a Python list of Row objects
# (see the output below) rather than printing a table.
df_pyspark.head(3)

[Row(Name='abc', Age=21, Experience=3),
 Row(Name='def', Age=18, Experience=0),
 Row(Name='xyz', Age=24, Experience=4)]

In [19]:
# select() on its own gives back a DataFrame; the cell displays only its
# repr (column name and type), not the rows.
df_pyspark.select('Name')

DataFrame[Name: string]

In [20]:
# Call show() on the selection to print the rows of the Name column.
df_pyspark.select('Name').show()

+----+
|Name|
+----+
| abc|
| def|
| xyz|
+----+



In [21]:
# Inspect the Python type of the object that select() returns.
type(df_pyspark.select('Name'))

In [22]:
# Select several columns at once by passing a list of column names.
selected_columns = ['Name', 'Experience']
df_pyspark.select(selected_columns).show()

+----+----------+
|Name|Experience|
+----+----------+
| abc|         3|
| def|         0|
| xyz|         4|
+----+----------+



In [23]:
# Indexing with a column name yields a Column object (see the repr below),
# not the column's values.
df_pyspark['Name']

Column<'Name'>

In [24]:
# dtypes lists a (column name, type name) pair for every column.
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [25]:
# describe() returns another DataFrame holding the summary statistics;
# without show() only its repr is displayed.
df_pyspark.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string]

In [26]:
# Print the summary statistics (count, mean, stddev, min, max) per column.
# In the output below, mean and stddev are NULL for the string column Name.
df_pyspark.describe().show()

+-------+----+----+------------------+
|summary|Name| Age|        Experience|
+-------+----+----+------------------+
|  count|   3|   3|                 3|
|   mean|NULL|21.0|2.3333333333333335|
| stddev|NULL| 3.0|2.0816659994661326|
|    min| abc|  18|                 0|
|    max| xyz|  24|                 4|
+-------+----+----+------------------+



In [27]:
# Add a column to the DataFrame: withColumn returns a new DataFrame that holds
# Experience + 2 under the name 'Experience After 2 year'.
experience_plus_two = df_pyspark['Experience'] + 2
df_pyspark = df_pyspark.withColumn('Experience After 2 year', experience_plus_two)

In [28]:
# Show the DataFrame, which now includes the 'Experience After 2 year' column.
df_pyspark.show()

+----+---+----------+-----------------------+
|Name|Age|Experience|Experience After 2 year|
+----+---+----------+-----------------------+
| abc| 21|         3|                      5|
| def| 18|         0|                      2|
| xyz| 24|         4|                      6|
+----+---+----------+-----------------------+



In [29]:
# Drop the 'Experience After 2 year' column. The result of drop() is assigned
# back to df_pyspark so later cells see the DataFrame without that column.
df_pyspark = df_pyspark.drop("Experience After 2 year")

In [30]:
# Show the DataFrame after the drop; only Name, Age and Experience remain.
df_pyspark.show()

+----+---+----------+
|Name|Age|Experience|
+----+---+----------+
| abc| 21|         3|
| def| 18|         0|
| xyz| 24|         4|
+----+---+----------+



In [31]:
# Rename the Name column to 'New Name'. The renamed result is only displayed,
# not assigned back, so df_pyspark itself keeps its original column name.
renamed_df = df_pyspark.withColumnRenamed('Name', 'New Name')
renamed_df.show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|     abc| 21|         3|
|     def| 18|         0|
|     xyz| 24|         4|
+--------+---+----------+

