# PySpark

In [1]:
!pip install pyspark



## Import libraries and packages

In [2]:
import pyspark
import pandas as pd

## Uploading data

In [3]:
# Open the XLSX file
df = pd.read_excel('data/names_and_ages.xlsx')
df.head()

Unnamed: 0,Name,Age,Experience
0,Alice,25,2
1,Bob,30,4
2,Charlie,22,7
3,David,35,12
4,Emma,28,9


## Reading data with SparkSession

In [4]:
from pyspark.sql import SparkSession

In [5]:
# creating the session with its respective name
spark = SparkSession.builder.appName('practice').getOrCreate()
spark

When we execute in our local, we anly work with one cluster. On the other and, working on cloud, we can create multiple cluster and instances.

To see the data in our file, we use the previous session and use the methods according to the PySpark documentation

In [6]:
csv_path = 'data/names_and_ages.csv'
spark.read.csv(csv_path, header = True, sep = ';')

DataFrame[Name: string, Age: string, Experience: string]

In [7]:
csv_path = 'data/names_and_ages.csv'
df_pyspark = spark.read.csv(csv_path, header = True, inferSchema = True ,sep = ';')

# To read the information in the dataframe
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  Alice| 25|         2|
|    Bob| 30|         4|
|Charlie| 22|         7|
|  David| 35|        12|
|   Emma| 28|         9|
|  Frank| 40|         1|
|  Grace| 23|         3|
|  Henry| 32|        14|
|  Irene| 27|        25|
|   Jack| 33|         2|
|  Karen| 26|         3|
|    Leo| 29|         1|
|  Maria| 31|         0|
| Nathan| 37|         0|
| Olivia| 24|         3|
|   Paul| 38|         1|
|  Quinn| 21|        14|
| Rachel| 34|         3|
|    Sam| 39|         6|
| Taylor| 36|        15|
+-------+---+----------+
only showing top 20 rows



Another way to upload the files, could be using the following

In [8]:
# Other form to read the information in the dataframe
df_pyspark2 = spark.read.option('header', 'true').csv(csv_path, inferSchema = True, sep = ';')

# To show the data in the dataframe
df_pyspark2.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  Alice| 25|         2|
|    Bob| 30|         4|
|Charlie| 22|         7|
|  David| 35|        12|
|   Emma| 28|         9|
|  Frank| 40|         1|
|  Grace| 23|         3|
|  Henry| 32|        14|
|  Irene| 27|        25|
|   Jack| 33|         2|
|  Karen| 26|         3|
|    Leo| 29|         1|
|  Maria| 31|         0|
| Nathan| 37|         0|
| Olivia| 24|         3|
|   Paul| 38|         1|
|  Quinn| 21|        14|
| Rachel| 34|         3|
|    Sam| 39|         6|
| Taylor| 36|        15|
+-------+---+----------+
only showing top 20 rows



In [9]:
# To see te type
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [10]:
# To extract the 4 first rows in the dataframe
df_pyspark.head(4)

[Row(Name='Alice', Age=25, Experience=2),
 Row(Name='Bob', Age=30, Experience=4),
 Row(Name='Charlie', Age=22, Experience=7),
 Row(Name='David', Age=35, Experience=12)]

In [11]:
# To see what is the schema of our dataframe, checking the data types for each column
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



### Selecting columns and indexing

In [12]:
# Reading the dataframe columns 
df_pyspark.columns

['Name', 'Age', 'Experience']

In [13]:
# To select a column
df_pyspark.select('Name').show(5)

+-------+
|   Name|
+-------+
|  Alice|
|    Bob|
|Charlie|
|  David|
|   Emma|
+-------+
only showing top 5 rows



In [14]:
# To select several columns
df_pyspark.select(['Name','Experience']).show(5)

+-------+----------+
|   Name|Experience|
+-------+----------+
|  Alice|         2|
|    Bob|         4|
|Charlie|         7|
|  David|        12|
|   Emma|         9|
+-------+----------+
only showing top 5 rows



In [15]:
# To check the datatypes
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [16]:
# Checkout the describe options
df_pyspark.describe().show()

+-------+-------+------------------+-----------------+
|summary|   Name|               Age|       Experience|
+-------+-------+------------------+-----------------+
|  count|     49|                49|               49|
|   mean|   NULL|30.897959183673468|7.020408163265306|
| stddev|   NULL|6.0904464052289295|6.060424201043904|
|    min|  Alice|                20|                0|
|    max|Zachary|                45|               25|
+-------+-------+------------------+-----------------+



In [17]:
# Adding columns
df_pyspark = df_pyspark.withColumn('Experience After 3 Years', df_pyspark['Experience']+2)

In [18]:
# To check if the changes were made
df_pyspark.show()

+-------+---+----------+------------------------+
|   Name|Age|Experience|Experience After 3 Years|
+-------+---+----------+------------------------+
|  Alice| 25|         2|                       4|
|    Bob| 30|         4|                       6|
|Charlie| 22|         7|                       9|
|  David| 35|        12|                      14|
|   Emma| 28|         9|                      11|
|  Frank| 40|         1|                       3|
|  Grace| 23|         3|                       5|
|  Henry| 32|        14|                      16|
|  Irene| 27|        25|                      27|
|   Jack| 33|         2|                       4|
|  Karen| 26|         3|                       5|
|    Leo| 29|         1|                       3|
|  Maria| 31|         0|                       2|
| Nathan| 37|         0|                       2|
| Olivia| 24|         3|                       5|
|   Paul| 38|         1|                       3|
|  Quinn| 21|        14|                      16|


In [19]:
# Dropping columns
df_pyspark = df_pyspark.drop('Experience After 3 Years')

In [20]:
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  Alice| 25|         2|
|    Bob| 30|         4|
|Charlie| 22|         7|
|  David| 35|        12|
|   Emma| 28|         9|
|  Frank| 40|         1|
|  Grace| 23|         3|
|  Henry| 32|        14|
|  Irene| 27|        25|
|   Jack| 33|         2|
|  Karen| 26|         3|
|    Leo| 29|         1|
|  Maria| 31|         0|
| Nathan| 37|         0|
| Olivia| 24|         3|
|   Paul| 38|         1|
|  Quinn| 21|        14|
| Rachel| 34|         3|
|    Sam| 39|         6|
| Taylor| 36|        15|
+-------+---+----------+
only showing top 20 rows



In [26]:
# To rename one column use
#df_pyspark = df_pyspark.withColumnRenamed('Name', 'New name')

#To rename all the columns use
list_columns = df_pyspark.columns
list_columns_spanish = ['Nombre' , 'Edad', 'Experiencia']

for name, new_name in zip(list_columns , list_columns_spanish):
    df_pyspark = df_pyspark.withColumnRenamed(name, new_name)

In [27]:
df_pyspark.show()

+-------+----+-----------+
| Nombre|Edad|Experiencia|
+-------+----+-----------+
|  Alice|  25|          2|
|    Bob|  30|          4|
|Charlie|  22|          7|
|  David|  35|         12|
|   Emma|  28|          9|
|  Frank|  40|          1|
|  Grace|  23|          3|
|  Henry|  32|         14|
|  Irene|  27|         25|
|   Jack|  33|          2|
|  Karen|  26|          3|
|    Leo|  29|          1|
|  Maria|  31|          0|
| Nathan|  37|          0|
| Olivia|  24|          3|
|   Paul|  38|          1|
|  Quinn|  21|         14|
| Rachel|  34|          3|
|    Sam|  39|          6|
| Taylor|  36|         15|
+-------+----+-----------+
only showing top 20 rows

