# What we will learn in this notebook 
* how to deal with PySpark DataFrames
* reading the dataset
* checking the data types of the columns, and the schema
* selecting columns and indexing
* using the describe option, similar to pandas
* adding columns
* dropping columns
* dropping rows
* the various parameters of the drop functionality
* handling missing values

In [12]:
# import pyspark and pandas 
import pandas as pd 
from pyspark.sql import SparkSession

### How to deal with pyspark Dataframes & Reading the dataset 

In [13]:
# Start (or reuse) the Spark session named 'DataFrame'; the reads below go through it.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

In [14]:
# Leave the session as the cell's last expression so the notebook renders it.
spark

In [16]:
# Load data.csv; the 'header' option makes the file's first line supply the column names.
df_pySpark = (
    spark.read
    .option('header', 'true')
    .csv('data.csv')
)

In [17]:
# show() prints the DataFrame's rows as a text table
df_pySpark.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|    Ahmed| 27|        10|
|Abdelaziz| 23|        20|
|     Eman| 30|         5|
+---------+---+----------+



### 3. Checking the datatypes


In [18]:
# printSchema() prints each column's name, type and nullability
df_pySpark.printSchema()
# Notice in the output below that every column was read as string, even though
# Age and Experience hold integers. The next cell shows how to have them read as int.

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [19]:
# Passing inferSchema=True makes Spark work out each column's type from the data
# instead of reading every column as string.
df_pySpark = (
    spark.read
    .option('header', 'true')
    .csv('data.csv', inferSchema=True)
)
df_pySpark.show()
df_pySpark.printSchema()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|    Ahmed| 27|        10|
|Abdelaziz| 23|        20|
|     Eman| 30|         5|
+---------+---+----------+

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [20]:
 # we can read it also in another way, in one line
# Both options are passed as keyword arguments of csv() here.
df_pySpark = spark.read.csv(
    'data.csv',
    header=True,
    inferSchema=True,
)
df_pySpark.show()
df_pySpark.printSchema()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|    Ahmed| 27|        10|
|Abdelaziz| 23|        20|
|     Eman| 30|         5|
+---------+---+----------+

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



### 4. Check describe option similar to pandas

In [21]:
# .columns holds the column names as a plain Python list -> similar to pandas
df_pySpark.columns

['Name', 'Age', 'Experience']

In [22]:
# head(3) returns the first 3 rows as a list of Row objects (not as a DataFrame)
df_pySpark.head(3)

[Row(Name='Ahmed', Age=27, Experience=10),
 Row(Name='Abdelaziz', Age=23, Experience=20),
 Row(Name='Eman', Age=30, Experience=5)]

In [23]:
# To select a column, pass its name to select(); show() then prints the result
df_pySpark.select('Name').show()

+---------+
|     Name|
+---------+
|    Ahmed|
|Abdelaziz|
|     Eman|
+---------+



In [25]:
# To select several columns, pass select() a list of column names
df_pySpark.select(['Name', 'Age']).show()

+---------+---+
|     Name|Age|
+---------+---+
|    Ahmed| 27|
|Abdelaziz| 23|
|     Eman| 30|
+---------+---+



> Note that, unlike in pandas, we cannot get a column's values by indexing with square brackets [...] -> this returns a Column object, not the rows of that column.

In [27]:
# Indexing with a column name yields a Column object, not the column's values
df_pySpark['Name']

Column<'Name'>

In [28]:
# .dtypes lists a (column name, type name) pair for every column
df_pySpark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [30]:
# describe() reports count, mean, stddev, min and max per column, like in pandas;
# for the string column Name, mean and stddev come out as NULL.
df_pySpark.describe().show()

+-------+---------+------------------+------------------+
|summary|     Name|               Age|        Experience|
+-------+---------+------------------+------------------+
|  count|        3|                 3|                 3|
|   mean|     NULL|26.666666666666668|11.666666666666666|
| stddev|     NULL| 3.511884584284246| 7.637626158259733|
|    min|Abdelaziz|                23|                 5|
|    max|     Eman|                30|                20|
+-------+---------+------------------+------------------+



### 5. Adding and Dropping columns

In [32]:
# withColumn() returns a new DataFrame with the column added (or replaced, when the
# name already exists); df_pySpark itself is left as it was.
newDf = df_pySpark.withColumn(
    'Experience After 2 Years',
    df_pySpark['Experience'] + 2,
)
newDf.show()

+---------+---+----------+------------------------+
|     Name|Age|Experience|Experience After 2 Years|
+---------+---+----------+------------------------+
|    Ahmed| 27|        10|                      12|
|Abdelaziz| 23|        20|                      22|
|     Eman| 30|         5|                       7|
+---------+---+----------+------------------------+



In [33]:
# drop() returns a new DataFrame without the named column
droped = df_pySpark.drop('Experience')
droped.show()

+---------+---+
|     Name|Age|
+---------+---+
|    Ahmed| 27|
|Abdelaziz| 23|
|     Eman| 30|
+---------+---+



In [34]:
 # renaming columns 
# The renamed result is only displayed, not assigned to a variable.
(
    df_pySpark
    .withColumnRenamed('Name', 'New Name')
    .show()
)

+---------+---+----------+
| New Name|Age|Experience|
+---------+---+----------+
|    Ahmed| 27|        10|
|Abdelaziz| 23|        20|
|     Eman| 30|         5|
+---------+---+----------+



### 6. Dropping rows 

In [50]:
# Read data.csv again with the header row and inferred column types.
# NOTE(review): the output saved below has a Salary column, extra rows and NULLs that
# the earlier reads of 'data.csv' do not show, and this cell's execution count (50) is
# out of order with the following cell (47) — presumably the file was replaced between
# runs. Confirm which file this section needs so Restart & Run All reproduces it.
new_data = spark.read.csv('data.csv', header= True, inferSchema=True)
new_data.show()

+---------+-----+----------+-------+
|     Name|  Age|Experience| Salary|
+---------+-----+----------+-------+
|    Ahmed| 27.0|      10.0|30000.0|
|Abdelaziz| 23.0|      20.0|25000.0|
|     Eman| 32.0|      50.0|10000.0|
|     Kmal| NULL|      NULL|13000.0|
|     NULL| NULL|      NULL|   NULL|
|     Omar|350.0|       1.0|   NULL|
|  hamdoly| 20.0|      15.0|   NULL|
+---------+-----+----------+-------+



In [47]:
# Null handling lives under the DataFrame's .na property. Called with no arguments,
# na.drop() removes every row that contains at least one null (see the output below).
new_data.na.drop().show()

+---------+----+----------+-------+
|     Name| Age|Experience| Salary|
+---------+----+----------+-------+
|    Ahmed|27.0|      10.0|30000.0|
|Abdelaziz|23.0|      20.0|25000.0|
|     Eman|32.0|      50.0|10000.0|
+---------+----+----------+-------+



In [52]:
# The `how` parameter of na.drop():
#   1. how = 'any' -> drop a row if any of its values is null
#   2. how = 'all' -> drop a row only if all of its values are null
# With how='all', only the row that is entirely NULL disappears from the output.
new_data.na.drop(how='all').show()

+---------+-----+----------+-------+
|     Name|  Age|Experience| Salary|
+---------+-----+----------+-------+
|    Ahmed| 27.0|      10.0|30000.0|
|Abdelaziz| 23.0|      20.0|25000.0|
|     Eman| 32.0|      50.0|10000.0|
|     Kmal| NULL|      NULL|13000.0|
|     Omar|350.0|       1.0|   NULL|
|  hamdoly| 20.0|      15.0|   NULL|
+---------+-----+----------+-------+



In [54]:
# thresh: keep a row only if it has at least `thresh` non-null values; otherwise drop it.
# Example: with thresh=3 and 4 columns, a row with 2 or more nulls is dropped.
new_data.na.drop(thresh=3).show()

+---------+-----+----------+-------+
|     Name|  Age|Experience| Salary|
+---------+-----+----------+-------+
|    Ahmed| 27.0|      10.0|30000.0|
|Abdelaziz| 23.0|      20.0|25000.0|
|     Eman| 32.0|      50.0|10000.0|
|     Omar|350.0|       1.0|   NULL|
|  hamdoly| 20.0|      15.0|   NULL|
+---------+-----+----------+-------+



In [56]:
# subset: only the listed columns are checked for nulls. A row is dropped when it has
# a null in one of those columns; nulls in any other column are ignored.
new_data.na.drop(subset=['Name']).show()

+---------+-----+----------+-------+
|     Name|  Age|Experience| Salary|
+---------+-----+----------+-------+
|    Ahmed| 27.0|      10.0|30000.0|
|Abdelaziz| 23.0|      20.0|25000.0|
|     Eman| 32.0|      50.0|10000.0|
|     Kmal| NULL|      NULL|13000.0|
|     Omar|350.0|       1.0|   NULL|
|  hamdoly| 20.0|      15.0|   NULL|
+---------+-----+----------+-------+



### 8. Filling the missing values

In [67]:
# na.fill() replaces nulls. The first argument is the replacement value; the second
# (subset) names the column or columns in which nulls should be replaced.
new_data.na.fill('Missing Value', subset=['Name']).show()

+-------------+-----+----------+-------+
|         Name|  Age|Experience| Salary|
+-------------+-----+----------+-------+
|        Ahmed| 27.0|      10.0|30000.0|
|    Abdelaziz| 23.0|      20.0|25000.0|
|         Eman| 32.0|      50.0|10000.0|
|         Kmal| NULL|      NULL|13000.0|
|Missing Value| NULL|      NULL|   NULL|
|         Omar|350.0|       1.0|   NULL|
|      hamdoly| 20.0|      15.0|   NULL|
+-------------+-----+----------+-------+



In [74]:
# Dict form of fill(): each key is a column name and its value replaces that column's nulls.
# NOTE(review): the saved output also contains a second table in which the NULL name is
# intact — presumably from a `new_data.show()` line that is no longer in this cell.
# Confirm and re-run so the output matches the code.
new_data.na.fill({'Name': 'miss'}).show()


+---------+-----+----------+-------+
|     Name|  Age|Experience| Salary|
+---------+-----+----------+-------+
|    Ahmed| 27.0|      10.0|30000.0|
|Abdelaziz| 23.0|      20.0|25000.0|
|     Eman| 32.0|      50.0|10000.0|
|     Kmal| NULL|      NULL|13000.0|
|     miss| NULL|      NULL|   NULL|
|     Omar|350.0|       1.0|   NULL|
|  hamdoly| 20.0|      15.0|   NULL|
+---------+-----+----------+-------+



+---------+-----+----------+-------+
|     Name|  Age|Experience| Salary|
+---------+-----+----------+-------+
|    Ahmed| 27.0|      10.0|30000.0|
|Abdelaziz| 23.0|      20.0|25000.0|
|     Eman| 32.0|      50.0|10000.0|
|     Kmal| NULL|      NULL|13000.0|
|     NULL| NULL|      NULL|   NULL|
|     Omar|350.0|       1.0|   NULL|
|  hamdoly| 20.0|      15.0|   NULL|
+---------+-----+----------+-------+

