# Apache Spark
* Apache Spark is an analytics engine and framework for cluster computing. Spark is built around parallel processing, which lets it complete large tasks quickly.
* Spark is **polyglot**, which means it provides APIs for several programming languages, including Java, Scala, Python, and R.
* Spark runs everywhere. Spark runs on Hadoop, Apache Mesos, Kubernetes, standalone, or in the cloud. It can access diverse data sources.
* Spark uses a DAG (Directed Acyclic Graph) to schedule tasks.

In [1]:
#install Pyspark~~~

!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.0.1.tar.gz (204.2 MB)
Collecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py): started
  Building wheel for pyspark (setup.py): finished with status 'done'
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612248 sha256=446dcf0f16b8f56fadbc6658698b60159de8c6231f19b3128d79b242593361a7
  Stored in directory: c:\users\rohitasoliya\appdata\local\pip\cache\wheels\ea\21\84\970b03913d0d6a96ef51c34c878add0de9e4ecbb7c764ea21f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


In [2]:
## install other dependencies

!pip install numpy



In [4]:
## install other dependencies

!pip install pandas



In [5]:
## install other dependencies

!pip install pandas



In [6]:
import math
import numpy as np 
import pandas as pd  
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col, isnull, asc, desc, mean

In [7]:
'''Create a spark session'''
# Reuse an existing session if one is already running, otherwise start a new
# one with the given master and application name.
spark = (
    SparkSession.builder
    .master("local")
    .appName("DataWrangling")
    .getOrCreate()
)
'''Set this configuration to get output similar to pandas'''
# With eager evaluation enabled, a DataFrame left as the last expression of a
# cell is displayed as a table instead of only its schema summary.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [9]:
# Load the CSV, treating its first line as the header row.
# inferSchema=True makes Spark inspect the data and assign numeric types to
# columns such as Age and Fare; without it every column is read as a string.
df = spark.read.csv("test.csv", header=True, inferSchema=True)
# Preview the first five rows.
df.limit(5)

PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. Jame...",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas...",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Al...",female,22.0,1,1,3101298,12.2875,,S


In [10]:
'''DF count'''
# Total number of rows in the DataFrame; the bare name on the last line is
# what the cell displays.
n_rows = df.count()
n_rows

418

In [11]:
'''Count by a column'''
# Number of rows per distinct value of 'Sex'. The column is referenced with
# its exact casing from the header, so the result column is labelled 'Sex'
# (consistent with the other cells) and the lookup does not rely on Spark's
# case-insensitive column resolution.
df.groupBy('Sex').count()

sex,count
female,152
male,266


In [12]:
'''Distinct values of a particular column'''
# Project the single column first, then de-duplicate its values.
(
    df
    .select(col('Embarked'))
    .distinct()
)

Embarked
Q
C
S


In [14]:
'''Selecting particular columns'''
# Keep only the listed columns and preview the first five rows.
preview_columns = ['Age', 'Sex', 'Fare']
df.select(*preview_columns).limit(5)

Age,Sex,Fare
34.5,male,7.8292
47.0,female,7.0
62.0,male,9.6875
27.0,male,8.6625
22.0,female,12.2875


In [15]:
'''Find the count of missing values'''
# One aggregate per column. when(...) yields a non-null marker only for rows
# where the column is null (and NULL otherwise), and count() counts only
# non-null values, so each result is that column's number of nulls.
# The second argument of when() is the column-name *string*, i.e. a literal
# value, not a reference to the (null) column itself.
null_counters = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(null_counters)


PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,0,0,86,0,0,0,1,327,0


In [16]:
'''Filtering for non null'''
# Keep the rows whose Age is present, then preview five of them.
age_present = col('Age').isNotNull()
df.where(age_present).limit(5)

PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. Jame...",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas...",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Al...",female,22.0,1,1,3101298,12.2875,,S


In [17]:
'''Find the null values of 'Age' '''
# Keep only the rows where Age is missing, then preview five of them.
age_missing = col('Age').isNull()
df.where(age_missing).limit(5)

PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
902,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S
914,1,"Flegenheim, Mrs. ...",female,,0,0,PC 17598,31.6833,,S
921,3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C
925,3,"""Johnston, Mrs. A...",female,,1,2,W./C. 6607,23.45,,S
928,3,"Roth, Miss. Sarah A",female,,0,0,342712,8.05,,S


In [19]:
'''Find the mean of the column "Age" '''
# Aggregate Age to a single row, pull out its only value, and round it UP to
# a whole number.
# NOTE(review): math.ceil always rounds up rather than to the nearest integer
# - confirm this is the intended value for imputing missing ages.
mean_ = math.ceil(df.select(mean('Age')).first()[0])
print(mean_)

31


In [20]:
'''Find the value counts of Cabin and select the mode'''
# Count rows per Cabin value, order from most to least frequent, and show the
# top five. Missing cabins form their own group, which is the largest one in
# this data.
cabin_counts = df.groupBy('Cabin').count()
cabin_counts.orderBy(col('count').desc()).limit(5)

Cabin,count
,327
B57 B59 B63 B66,3
C23 C25 C27,2
C6,2
C31,2


In [21]:
'''Find the mode of 'Embarked' '''
# Most frequent non-null value of Embarked, used later as the fill value.
# Nulls are excluded before counting: otherwise a column whose most common
# "value" is null (as is the case for Cabin) would give a mode of None, which
# cannot be used as a replacement. Ties on the count are broken by the value
# itself so the result is deterministic across runs.
embarked_mode = (
    df.filter(col('Embarked').isNotNull())
    .groupBy(col('Embarked'))
    .count()
    .sort(desc("count"), asc("Embarked"))
    .take(1)[0][0]
)

In [22]:
'''Fill the missing values'''
# Per-column replacements for nulls: Age gets the rounded-up mean computed
# earlier and Embarked gets its most frequent value.
# NOTE(review): 'C23' is a hard-coded Cabin placeholder, not a value derived
# from the data - confirm this is the intended replacement.
fill_values = {'Age': mean_, 'Cabin': 'C23', 'Embarked': embarked_mode}
# Rebinding df means every cell run after this one sees the filled data.
df = df.na.fill(fill_values)

In [None]:
'''Drop multiple columns'''
# NOTE(review): the drop call below is commented out, so this cell currently
# only evaluates the label string above. Un-comment it to preview the frame
# without these columns, or remove the cell if it is no longer needed.
#df.drop('Age', 'Parch','Ticket').limit(5)

In [23]:
'''Finding the mean age of male and female'''
# Average Age within each Sex group, computed on df as it stands when this
# cell runs (i.e. after the missing ages were filled if the cells are run in
# order).
(
    df
    .groupBy('Sex')
    .agg(mean(col('Age')))
)

Sex,avg(Age)
female,30.39203947368421
male,30.43951127819549


In [24]:
'''Finding the mean Fare of male and female'''
# Average Fare within each Sex group.
(
    df
    .groupBy('Sex')
    .agg(mean(col('Fare')))
)

Sex,avg(Fare)
female,49.74769868421052
male,27.52787698113209
