## Welcome to our PySpark tutorial

### 1. Installing PySpark and reading the dataset

In [27]:
# Import pyspark, plus pandas, which is used here for a quick preview of the raw file.
# If pyspark is not installed yet, install it first with "pip install pyspark".
import pyspark
import pandas as pd

# Preview the raw CSV with pandas before loading it into Spark.
pd.read_csv("name.csv")

Unnamed: 0,Name,Age,Profession,Income
0,Alex,25,Data Scientist,120000.0
1,Iannis,22,Pro Tennis,9345000.0
2,William,26,Entrepreneur,1400000.0
3,Donald,30,IT,85000.0
4,Cedric,31,Marketing Manager,90000.0
5,Mathew,27,Accountant,78000.0
6,Aboubakar,34,,65000.0
7,,19,Designer,


In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build the Spark session for this tutorial (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [5]:
# Display the session object as the cell's output.
spark

In [7]:
# Read the CSV with Spark's default options (no header option is set here).
df_pyspark = spark.read.csv('name.csv')

In [39]:
# Load the CSV again, this time telling Spark that the first row is the header.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('name.csv')
)

In [29]:
# Tip: df_pyspark.head(4) returns the first four rows.
# Check the data types: read with only the header option, all columns come back as strings.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Profession: string (nullable = true)
 |-- Income: string (nullable = true)



### 2. PySpark DataFrames

In [45]:
# Re-read the dataset, letting Spark infer an appropriate data type for each column.
df_pyspark = spark.read.csv(
    'name.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Profession: string (nullable = true)
 |-- Income: integer (nullable = true)



In [11]:
# List the column names of the DataFrame.
df_pyspark.columns

['Name', 'Age', 'Profession', 'Income']

In [24]:
# "select" picks a single column or several; the names can be passed as separate arguments.
df_pyspark.select('Name', 'Age').show()

+---------+---+
|     Name|Age|
+---------+---+
|     Alex| 25|
|   Iannis| 22|
|  William| 26|
|   Donald| 30|
|   Cedric| 31|
|   Mathew| 27|
|Aboubakar| 34|
|     NULL| 19|
+---------+---+



In [30]:
# Check the data types, then show the summary statistics.
# dtypes is printed explicitly: a notebook only displays a cell's last expression
# automatically, so a bare `df_pyspark.dtypes` line in the middle of the cell
# would show nothing.
print(df_pyspark.dtypes)
df_pyspark.describe().show()

+-------+---------+-----------------+----------+------------------+
|summary|     Name|              Age|Profession|            Income|
+-------+---------+-----------------+----------+------------------+
|  count|        7|                8|         7|                 7|
|   mean|     NULL|            26.75|      NULL|1597571.4285714286|
| stddev|     NULL|4.891683905218266|      NULL| 3451169.311848901|
|    min|Aboubakar|               19|Accountant|            120000|
|    max|  William|               34|Pro Tennis|           9345000|
+-------+---------+-----------------+----------+------------------+



In [38]:
# Add a new "Experience" column with withColumn. Here the value is chosen per
# name with a chain of when(...) conditions; a new column can also be computed
# from existing columns.
from pyspark.sql.functions import when, lit

# Rows whose Name matches none of the conditions get NULL, because the chain
# has no otherwise(...) clause.
new_df = df_pyspark.withColumn(
    "Experience",
    when((df_pyspark.Name == "Alex") | (df_pyspark.Name == "Cedric"), lit(4))
    .when((df_pyspark.Name == "Iannis"), lit(5))
    .when((df_pyspark.Name == "William"), lit(2))
    .when((df_pyspark.Name == "Donald") | (df_pyspark.Name == "Mathew"), lit(3)),
)
new_df.show()

+-------+---+-----------------+----------+----------+
|   Name|Age|       Profession|    Income|Experience|
+-------+---+-----------------+----------+----------+
|   Alex| 25|   Data Scientist|  $120,000|         4|
| Iannis| 22|       Pro Tennis|$9,345,000|         5|
|William| 26|     Entrepreneur|$1,400,000|         2|
| Donald| 30|               IT|   $85,000|         3|
| Cedric| 31|Marketing Manager|   $90,000|         4|
| Mathew| 27|       Accountant|   $78,000|         3|
+-------+---+-----------------+----------+----------+



### 3. Handling Missing values

In [13]:
# Drop a column: the result without 'Name' is only displayed, not assigned,
# so df_pyspark itself keeps the column (later cells still show it).
df_pyspark.drop('Name').show()

+---+-----------------+-----------+
|Age|       Profession|     Income|
+---+-----------------+-----------+
| 25|   Data Scientist|  $120,000 |
| 22|       Pro Tennis|$9,345,000 |
| 26|     Entrepreneur|$1,400,000 |
| 30|               IT|   $85,000 |
| 31|Marketing Manager|   $90,000 |
| 27|       Accountant|   $78,000 |
| 34|             NULL|   $65,000 |
| 19|         Designer|       NULL|
+---+-----------------+-----------+



In [40]:
# Now let's drop the rows that contain null values, using the DataFrame's `na` functions.
df_pyspark.na.drop().show()

+-------+---+-----------------+-------+
|   Name|Age|       Profession| Income|
+-------+---+-----------------+-------+
|   Alex| 25|   Data Scientist| 120000|
| Iannis| 22|       Pro Tennis|9345000|
|William| 26|     Entrepreneur|1400000|
| Donald| 30|               IT|  85000|
| Cedric| 31|Marketing Manager|  90000|
| Mathew| 27|       Accountant|  78000|
+-------+---+-----------------+-------+



In [22]:
# how="any" is the default, so it behaves like the previous cell; now let's try how="all".
# Nothing is dropped here: how="all" only removes rows in which every value is null,
# and the dataset has no such row.
df_pyspark.na.drop(how="all").show()

+-------+---+-----------------+-----------+
|   Name|Age|       Profession|     Income|
+-------+---+-----------------+-----------+
|   Alex| 25|   Data Scientist|  $120,000 |
| Iannis| 22|       Pro Tennis|$9,345,000 |
|William| 26|     Entrepreneur|$1,400,000 |
| Donald| 30|               IT|   $85,000 |
| Cedric| 31|Marketing Manager|   $90,000 |
| Mathew| 27|       Accountant|   $78,000 |
|   NULL| 34|             NULL|   $65,000 |
|   NULL| 19|         Designer|       NULL|
+-------+---+-----------------+-----------+



In [25]:
# thresh is the minimum number of non-null values a row needs in order to be kept:
# rows with fewer than `thresh` non-null values are dropped.
# When thresh is given it overrides `how`, so `how` is not passed here.
df_pyspark.na.drop(thresh=2).show()

+---------+---+-----------------+-----------+
|     Name|Age|       Profession|     Income|
+---------+---+-----------------+-----------+
|     Alex| 25|   Data Scientist|  $120,000 |
|   Iannis| 22|       Pro Tennis|$9,345,000 |
|  William| 26|     Entrepreneur|$1,400,000 |
|   Donald| 30|               IT|   $85,000 |
|   Cedric| 31|Marketing Manager|   $90,000 |
|   Mathew| 27|       Accountant|   $78,000 |
|Aboubakar| 34|             NULL|   $65,000 |
|     NULL| 19|         Designer|       NULL|
+---------+---+-----------------+-----------+



In [50]:
# Restrict the null check to a subset of columns: only rows whose Income is null are dropped.
df_pyspark.na.drop(
    how="any",
    subset=['Income'],
).show()

+-------+----+-----------------+-------+
|   Name| Age|       Profession| Income|
+-------+----+-----------------+-------+
|   Alex|  25|   Data Scientist| 120000|
| Iannis|  22|       Pro Tennis|9345000|
|William|  26|     Entrepreneur|1400000|
| Donald|  30|               IT|  85000|
| Cedric|  31|Marketing Manager|  90000|
| Mathew|  27|       Accountant|  78000|
|   NULL|NULL|         Designer|  65000|
+-------+----+-----------------+-------+



In [48]:
# Fill the missing values of the numeric columns with an Imputer.
# The strategy used here is the median; "mean" (the default) is another option.
from pyspark.ml.feature import Imputer

# Columns to impute; each one gets a matching "<name>_imputed" output column.
impute_cols = ['Age', 'Income']
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=["{}_imputed".format(c) for c in impute_cols],
).setStrategy("median")

In [49]:
# Fit the imputer on the data, then apply it to the same DataFrame and show the result.
(
    imputer
    .fit(df_pyspark)
    .transform(df_pyspark)
    .show()
)

+---------+----+-----------------+-------+-----------+--------------+
|     Name| Age|       Profession| Income|Age_imputed|Income_imputed|
+---------+----+-----------------+-------+-----------+--------------+
|     Alex|  25|   Data Scientist| 120000|         25|        120000|
|   Iannis|  22|       Pro Tennis|9345000|         22|       9345000|
|  William|  26|     Entrepreneur|1400000|         26|       1400000|
|   Donald|  30|               IT|  85000|         30|         85000|
|   Cedric|  31|Marketing Manager|  90000|         31|         90000|
|   Mathew|  27|       Accountant|  78000|         27|         78000|
|Aboubakar|  34|             NULL|   NULL|         34|         90000|
|     NULL|NULL|         Designer|  65000|         27|         65000|
+---------+----+-----------------+-------+-----------+--------------+



### 4. Filter Operations

&copy; This notebook was inspired by the [freeCodeCamp.org PySpark tutorial on YouTube](https://www.youtube.com/watch?v=_C8kWso4ne4). The original dataset was dropped, the one used was created by the owner of the notebook and some lines of code were intentionally changed for a better hands-on experience of the tool.