## Welcome to our PySpark tutorial

### 1. Installing PySpark and reading the dataset

In [2]:
# Prerequisite: PySpark must be installed; if it is not, run "pip install pyspark" first.
# Import PySpark, plus pandas for a quick first look at the raw file.
import pyspark
import pandas as pd

# Preview the CSV with pandas before loading it into Spark.
pd.read_csv("name.csv")

Unnamed: 0,Name,Age,Profession,Income
0,Alex,25,Data Scientist,120000
1,Iannis,22,Pro Tennis,934500
2,William,26,Entrepreneur,1400000
3,Donald,30,IT,85000
4,Cedric,31,Marketing Manager,90000
5,Mathew,27,Accountant,78000


In [3]:
from pyspark.sql import SparkSession

In [4]:
# Get the existing Spark session, or create one, registered under the app name 'Practise'.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [5]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [6]:
# First attempt: read the CSV with the reader's default options.
# The following cell re-reads the file with the first row treated as the header.
df_pyspark = spark.read.csv('name.csv')

In [11]:
# Re-read the file, this time telling the reader that the first row holds the column names.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('name.csv')
)

In [14]:
# Inspect the column data types of the DataFrame loaded above.
# (`df_pyspark.head(4)` can be used instead to peek at the first 4 rows.)
# No schema inference was requested when reading, so every column is reported as a string.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Profession: string (nullable = true)
 |-- Income: string (nullable = true)



### 2. PySpark DataFrames

In [29]:
# Re-read the CSV with the first row as header and let Spark infer each column's type.
# Age comes back as an integer; Income remains a string, presumably because its
# values are formatted like "$120,000" (see the outputs further down).
df_pyspark = spark.read.csv(
    'name.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Profession: string (nullable = true)
 |-- Income: string (nullable = true)



In [22]:
# List the DataFrame's column names.
df_pyspark.columns

['Name', 'Age', 'Profession', 'Income']

In [26]:
# The "select" operation keeps only the requested column(s); here a list of two
# column names is passed and the resulting rows are printed.
(
    df_pyspark
    .select(['Name','Age'])
    .show()
)

+-------+---+
|   Name|Age|
+-------+---+
|   Alex| 25|
| Iannis| 22|
|William| 26|
| Donald| 30|
| Cedric| 31|
| Mathew| 27|
+-------+---+



In [30]:
# Check the data types and the summary statistics.
# `dtypes` is printed explicitly: a bare expression that is not the last statement
# of a cell is evaluated but never displayed, so on its own it would show nothing.
print(df_pyspark.dtypes)
# Income is still a string column at this point, so its min/max are compared as
# text (hence "$1,400,000" as min and "$90,000" as max in the summary table).
df_pyspark.describe().show()

+-------+-------+------------------+----------+----------+
|summary|   Name|               Age|Profession|    Income|
+-------+-------+------------------+----------+----------+
|  count|      6|                 6|         6|         6|
|   mean|   NULL|26.833333333333332|      NULL|      NULL|
| stddev|   NULL|3.3115957885386114|      NULL|      NULL|
|    min|   Alex|                22|Accountant|$1,400,000|
|    max|William|                31|Pro Tennis|   $90,000|
+-------+-------+------------------+----------+----------+



In [38]:
# Add a new "Experience" column whose value is chosen from each row's Name.
# (A new column can also be derived from an existing column instead.)
from pyspark.sql.functions import when, lit

# The when(...) branches are checked in order and the first match supplies the value;
# a row matching none of them gets NULL because no otherwise(...) is given.
# The whole chain is wrapped in the call's parentheses so no line relies on a
# trailing "." to continue the expression.
new_df = df_pyspark.withColumn(
    "Experience",
    when((df_pyspark.Name == "Alex") | (df_pyspark.Name == "Cedric"), lit(4))
    .when(df_pyspark.Name == "Iannis", lit(5))
    .when(df_pyspark.Name == "William", lit(2))
    .when((df_pyspark.Name == "Donald") | (df_pyspark.Name == "Mathew"), lit(3)),
)
new_df.show()

+-------+---+-----------------+----------+----------+
|   Name|Age|       Profession|    Income|Experience|
+-------+---+-----------------+----------+----------+
|   Alex| 25|   Data Scientist|  $120,000|         4|
| Iannis| 22|       Pro Tennis|$9,345,000|         5|
|William| 26|     Entrepreneur|$1,400,000|         2|
| Donald| 30|               IT|   $85,000|         3|
| Cedric| 31|Marketing Manager|   $90,000|         4|
| Mathew| 27|       Accountant|   $78,000|         3|
+-------+---+-----------------+----------+----------+



&copy; Please note that this notebook was built following the instructions of the [freeCodeCamp.org PySpark tutorial on YouTube](https://www.youtube.com/watch?v=_C8kWso4ne4). The dataset belongs to the owner of the notebook, and some lines of code were changed on purpose for a better hands-on experience of PySpark.