# PySpark Introduction

In [2]:
# nstall and import Dependencies
%pip install pyspark
%pip install pyspark[sql]
%pip install pyspark[pandas_on_spark] plotly
%pip install pyspark[connect]
%pip install pandas

import pyspark
import pandas as pd




# Tutorial 1: Initiating Spark Session

In [4]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Practice').getOrCreate()
spark

In [23]:
# Load the data from csv file
df_pyspark=spark.read.csv('sample.csv')

# Load and show data from csv file including header
print("Show pySpark data including header")
df_pyspark=spark.read.option('header','true').csv('sample.csv')
print(f"\nType of df_pyspark: {type(df_pyspark)}")

# Print schema
print("\nPrint Schema:")
df_pyspark.printSchema()

Show pySpark data including header

Type of df_pyspark: <class 'pyspark.sql.dataframe.DataFrame'>

Print Schema:
root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)



# Tutorial 2: Basics of pySpark
- PySpark Dataframe
- Reading dataset
- Check the datatypes of column (Schema)
- Selecting columns and indexing
- Check describe option similar to pandas
- Adding columns
- Dropping columns

In [48]:
##$ Getting attributes

# Read the data from csv file with header
df_pyspark=spark.read.option('header','true').csv('sample.csv', inferSchema=True)
df_pyspark.show()

# Check the schema
df_pyspark.printSchema()

# Get column names
print("\nGet column names:")
df_pyspark.columns

df_pyspark.describe().show()


+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
|Faisal| 27|         5|
| Ifaaf| 27|         4|
|Yunkai| 26|         5|
+------+---+----------+

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)


Get column names:
+-------+------+------------------+------------------+
|summary|  Name|               Age|        Experience|
+-------+------+------------------+------------------+
|  count|     3|                 3|                 3|
|   mean|  NULL|26.666666666666668| 4.666666666666667|
| stddev|  NULL|0.5773502691896258|0.5773502691896258|
|    min|Faisal|                26|                 4|
|    max|Yunkai|                27|                 5|
+-------+------+------------------+------------------+



In [46]:
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [47]:
# Get first 3 rows
print("\nGet first 3 rows:")
df_pyspark.head(3)



Get first 3 rows:


[Row(Name='Faisal', Age=27, Experience=5),
 Row(Name='Ifaaf', Age=27, Experience=4),
 Row(Name='Yunkai', Age=26, Experience=5)]

In [44]:
### Selecting Columns

# Select specific columns
df_pyspark.select('Name').show()

# Select multiple columns
df_pyspark.select(['Name','Age']).show()


+------+
|  Name|
+------+
|Faisal|
| Ifaaf|
|Yunkai|
+------+

+------+---+
|  Name|Age|
+------+---+
|Faisal| 27|
| Ifaaf| 27|
|Yunkai| 26|
+------+---+



In [53]:
### Adding columns in DataFrame
df_pyspark_full = df_pyspark.withColumn('Experience + 2 year', df_pyspark['Experience']+2)
df_pyspark_full.show()

+------+---+----------+-------------------+
|  Name|Age|Experience|Experience + 2 year|
+------+---+----------+-------------------+
|Faisal| 27|         5|                  7|
| Ifaaf| 27|         4|                  6|
|Yunkai| 26|         5|                  7|
+------+---+----------+-------------------+



In [54]:
### Drop columns in DataFrame
df_pyspark_full = df_pyspark.drop('Experience + 2 year')
df_pyspark_full.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
|Faisal| 27|         5|
| Ifaaf| 27|         4|
|Yunkai| 26|         5|
+------+---+----------+



In [56]:
### Rename columns in DataFrame
df_pyspark_full = df_pyspark.withColumnRenamed('Experience','Experience (yrs)')
df_pyspark_full.show()

+------+---+----------------+
|  Name|Age|Experience (yrs)|
+------+---+----------------+
|Faisal| 27|               5|
| Ifaaf| 27|               4|
|Yunkai| 26|               5|
+------+---+----------------+

