In [1]:
import os

from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Dataframe_Use')
    .getOrCreate()
)

In [3]:
# Bare expression as the last line of the cell, so the notebook displays the session object.
spark

In [4]:
## Reading the dataset

In [5]:
# Location of movie_scores.csv. Set the MOVIE_SCORES_CSV environment variable to point at
# your own copy of the file; if it is not set, the original author's local path is used.
path = os.environ.get(
    "MOVIE_SCORES_CSV",
    r"C:\Users\Azam\Desktop\Extra Work\Udemy ML\UNZIP_FOR_NOTEBOOKS_FINAL\03-Pandas\movie_scores.csv",
)

In [6]:
# Read the dataset, taking the column names from the first line of the CSV.
# inferSchema=True asks Spark to detect column types; if it is not set,
# PySpark treats every column as a string.
df_pyspark = spark.read.csv(path, header=True, inferSchema=True)

In [7]:
# Print the loaded rows as a text table to check the read worked.
df_pyspark.show()

+----------+---------+----+----+---------------+----------------+
|first_name|last_name| age| sex|pre_movie_score|post_movie_score|
+----------+---------+----+----+---------------+----------------+
|       Tom|    Hanks|63.0|   m|            8.0|            10.0|
|      null|     null|null|null|           null|            null|
|      Hugh|  Jackman|51.0|   m|           null|            null|
|     Oprah|  Winfrey|66.0|   f|            6.0|             8.0|
|      Emma|    Stone|31.0|   f|            7.0|             9.0|
+----------+---------+----+----+---------------+----------------+



In [8]:
# Drop a column. drop() hands back a new DataFrame, so it is given its own name
# here and df_pyspark itself is not reassigned.
df_without_last_name = df_pyspark.drop('last_name')
df_without_last_name.show()

+----------+----+----+---------------+----------------+
|first_name| age| sex|pre_movie_score|post_movie_score|
+----------+----+----+---------------+----------------+
|       Tom|63.0|   m|            8.0|            10.0|
|      null|null|null|           null|            null|
|      Hugh|51.0|   m|           null|            null|
|     Oprah|66.0|   f|            6.0|             8.0|
|      Emma|31.0|   f|            7.0|             9.0|
+----------+----+----+---------------+----------------+



In [9]:
# Show df_pyspark again: the previous drop() was not assigned back, so last_name is still here.
df_pyspark.show()

+----------+---------+----+----+---------------+----------------+
|first_name|last_name| age| sex|pre_movie_score|post_movie_score|
+----------+---------+----+----+---------------+----------------+
|       Tom|    Hanks|63.0|   m|            8.0|            10.0|
|      null|     null|null|null|           null|            null|
|      Hugh|  Jackman|51.0|   m|           null|            null|
|     Oprah|  Winfrey|66.0|   f|            6.0|             8.0|
|      Emma|    Stone|31.0|   f|            7.0|             9.0|
+----------+---------+----+----+---------------+----------------+



In [None]:
# Drop rows based on null values

# By default, every row that contains a null value is deleted.
# df_pyspark.na.drop().show()
#   how='any'  -> drop a row if any of its values is null
#   how='all'  -> drop a row only if all of its values are null
#   thresh     -> keep only rows that have at least this many non-null values
#   subset     -> only look at these columns when checking for nulls

In [11]:
# Filling missing values
#
# na.fill() only touches columns whose type matches the fill value:
#   * a string such as 'Missing Values' fills nulls in string columns only, so the
#     numeric columns (age, pre_movie_score, post_movie_score) keep their nulls here;
#   * a number such as 0 fills nulls in numeric columns only and skips string columns.
# If the CSV is read without inferSchema=True, every column is a string, and a string
# fill value then applies to all of them.
df_pyspark.na.fill('Missing Values').show()

+--------------+--------------+----+--------------+---------------+----------------+
|    first_name|     last_name| age|           sex|pre_movie_score|post_movie_score|
+--------------+--------------+----+--------------+---------------+----------------+
|           Tom|         Hanks|63.0|             m|            8.0|            10.0|
|Missing Values|Missing Values|null|Missing Values|           null|            null|
|          Hugh|       Jackman|51.0|             m|           null|            null|
|         Oprah|       Winfrey|66.0|             f|            6.0|             8.0|
|          Emma|         Stone|31.0|             f|            7.0|             9.0|
+--------------+--------------+----+--------------+---------------+----------------+



In [12]:
# Imputer from pyspark.ml: replaces missing values in numeric columns

from pyspark.ml.feature import Imputer

# Columns to impute; each one gets a companion "<name>_imputed" output column.
columns_to_impute = ['age', 'pre_movie_score', 'post_movie_score']

imputer = Imputer(
    strategy="median",
    inputCols=columns_to_impute,
    outputCols=[column + "_imputed" for column in columns_to_impute],
)

In [13]:
# Fit the imputer on the data, then add the *_imputed columns to the DataFrame and show it.
imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

+----------+---------+----+----+---------------+----------------+-----------+-----------------------+------------------------+
|first_name|last_name| age| sex|pre_movie_score|post_movie_score|age_imputed|pre_movie_score_imputed|post_movie_score_imputed|
+----------+---------+----+----+---------------+----------------+-----------+-----------------------+------------------------+
|       Tom|    Hanks|63.0|   m|            8.0|            10.0|       63.0|                    8.0|                    10.0|
|      null|     null|null|null|           null|            null|       51.0|                    7.0|                     9.0|
|      Hugh|  Jackman|51.0|   m|           null|            null|       51.0|                    7.0|                     9.0|
|     Oprah|  Winfrey|66.0|   f|            6.0|             8.0|       66.0|                    6.0|                     8.0|
|      Emma|    Stone|31.0|   f|            7.0|             9.0|       31.0|                    7.0|          