# Predicting US Census Income with Apache Spark

## CLASSIFICATION

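This notebook uses the Adult census data set (`data/adult-training.csv` and `data/adult-test.csv`) to prepare a binary classification problem in Apache Spark: predicting whether a person's income is `<=50K` or `>50K` from demographic and employment attributes.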

In [1]:
import os
import pandas as pd
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
from pyspark.sql.functions import round, col
import pyspark.sql.functions as F

from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.mllib.evaluation import RegressionMetrics

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import Bucketizer, StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

from pyspark.mllib.evaluation import MulticlassMetrics

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for 'retina' (high-resolution) figures.
%config InlineBackend.figure_format = 'retina'

# Apply seaborn's 'notebook' context and 'whitegrid' style to subsequent plots.
sns.set(context='notebook', style='whitegrid')

In [4]:
from pylab import rcParams
# Default size for every figure in this notebook: (width, height) = (20, 4).
rcParams['figure.figsize'] = 20, 4

In [5]:
# Fix the random seed so the notebook is reproducible.
# rnd_seed is passed explicitly to the Spark sampling calls; NumPy's global
# generator is seeded with the same value.
rnd_seed = 42
# np.random.seed must be *called*: assigning to it (np.random.seed = 42)
# replaces the function with an integer and leaves the generator unseeded.
np.random.seed(rnd_seed)

## 1. Understanding the Data Set

## 2. Creating the Spark Session

In [6]:
# Show where Spark is installed. .get() returns None instead of raising
# KeyError when SPARK_HOME is not set in the environment.
os.environ.get('SPARK_HOME')

'D:\\Work\\spark-2.3.0-bin-hadoop2.7'

In [7]:
# Create (or reuse) the Spark session for this notebook, running against the
# "local[2]" master under the application name "predict-us-census-income".
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("predict-us-census-income")
    .getOrCreate()
)
# Display the session summary as the cell output.
spark

In [8]:
# Keep a handle on the SparkContext that backs the session, and display it.
sc = spark.sparkContext
sc

In [9]:
# Build an SQLContext on top of the session's SparkContext, and display it.
# NOTE(review): newer Spark releases favour using the SparkSession directly;
# confirm whether this object is still needed further down the notebook.
sqlContext = SQLContext(spark.sparkContext)
sqlContext

<pyspark.sql.context.SQLContext at 0x17fed5664e0>

## 3. Load The Data From a File Into a Dataframe

In [10]:
# Relative paths of the CSV files holding the training and test splits.
ADULT_TRAIN_DATA = 'data/adult-training.csv'
ADULT_TEST_DATA = 'data/adult-test.csv'

In [11]:
# Schema of one line of the CSV data files: (column name, Spark type) in file
# order. Every column is declared nullable.
_ADULT_COLUMN_TYPES = [
    ("Age", IntegerType()),
    ("Workclass", StringType()),
    ("fnlgwt", DoubleType()),
    ("Education", StringType()),
    ("Education Num", DoubleType()),
    ("Marital Status", StringType()),
    ("Occupation", StringType()),
    ("Relationship", StringType()),
    ("Race", StringType()),
    ("Sex", StringType()),
    ("Capital Gain", DoubleType()),
    ("Capital Loss", DoubleType()),
    ("Hours/Week", DoubleType()),
    ("Native Country", StringType()),
    ("Income", StringType()),
]

schema = StructType([
    StructField(column_name, column_type, nullable=True)
    for column_name, column_type in _ADULT_COLUMN_TYPES
])

In [12]:
# Read the training CSV with the explicit schema, ignoring the whitespace
# around each field, then tag every row with data_set='train' and cache it.
adult_train_df = (
    spark.read
    .csv(
        path=ADULT_TRAIN_DATA,
        schema=schema,
        ignoreLeadingWhiteSpace=True,
        ignoreTrailingWhiteSpace=True,
    )
    .withColumn('data_set', F.lit('train'))
    .cache()
)

In [13]:
# Print the column names, types and nullability of the loaded training data.
adult_train_df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Workclass: string (nullable = true)
 |-- fnlgwt: double (nullable = true)
 |-- Education: string (nullable = true)
 |-- Education Num: double (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Relationship: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Capital Gain: double (nullable = true)
 |-- Capital Loss: double (nullable = true)
 |-- Hours/Week: double (nullable = true)
 |-- Native Country: string (nullable = true)
 |-- Income: string (nullable = true)
 |-- data_set: string (nullable = false)



In [14]:
# Read the test CSV with the same schema and whitespace handling as the
# training data, then tag every row with data_set='test' and cache it.
adult_test_df = (
    spark.read
    .csv(
        path=ADULT_TEST_DATA,
        schema=schema,
        ignoreLeadingWhiteSpace=True,
        ignoreTrailingWhiteSpace=True,
    )
    .withColumn('data_set', F.lit('test'))
    .cache()
)

In [15]:
# Draw a seeded 1% sample (without replacement) of the training data, bring it
# to pandas, and preview its first rows.
train_sample_pdf = adult_train_df.sample(
    withReplacement=False, fraction=0.01, seed=rnd_seed
).toPandas()
train_sample_pdf.head()

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Native Country,Income,data_set
0,49,Private,160187.0,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,0.0,16.0,Jamaica,<=50K,train
1,44,Private,198282.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024.0,0.0,60.0,United-States,>50K,train
2,53,Private,95647.0,9th,5.0,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K,train
3,24,Private,388093.0,Bachelors,13.0,Never-married,Exec-managerial,Not-in-family,Black,Male,0.0,0.0,40.0,United-States,<=50K,train
4,20,?,214635.0,Some-college,10.0,Never-married,?,Own-child,White,Male,0.0,0.0,24.0,United-States,<=50K,train


In [16]:
# Draw a seeded 1% sample (without replacement) of the test data, bring it to
# pandas, and preview its first rows.
test_sample_pdf = adult_test_df.sample(
    withReplacement=False, fraction=0.01, seed=rnd_seed
).toPandas()
test_sample_pdf.head()

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Native Country,Income,data_set
0,29,?,227026.0,HS-grad,9.0,Never-married,?,Unmarried,Black,Male,0.0,0.0,40.0,United-States,<=50K.,test
1,33,Private,202191.0,Some-college,10.0,Never-married,Adm-clerical,Unmarried,Black,Female,0.0,0.0,35.0,United-States,<=50K.,test
2,26,Private,206721.0,HS-grad,9.0,Never-married,Handlers-cleaners,Unmarried,White,Male,0.0,0.0,40.0,United-States,<=50K.,test
3,38,?,48976.0,HS-grad,9.0,Married-civ-spouse,?,Wife,White,Female,0.0,1887.0,10.0,United-States,>50K.,test
4,31,Private,213339.0,HS-grad,9.0,Separated,Tech-support,Not-in-family,White,Female,0.0,0.0,40.0,United-States,<=50K.,test


In [17]:
# Number of rows in the training set.
adult_train_df.count()

32561

In [18]:
# Number of rows in the test set.
adult_test_df.count()

16281

In [19]:
# Stack the training and test rows into one DataFrame so both splits are
# cleaned together; the 'data_set' column added at load time records which
# split each row came from.
adult_union_df = adult_train_df.union(adult_test_df)

In [20]:
# Number of rows in the combined (train + test) data.
adult_union_df.count()

48842

In [21]:
# Preview a small stratified sample of the combined data: keep a 0.3 fraction
# of the 'train' rows and a 0.7 fraction of the 'test' rows, then a 0.0006
# fraction of what remains. Both sampling steps are seeded with rnd_seed so
# the preview is reproducible across runs.
(adult_union_df
 .stat.sampleBy('data_set', fractions={'test': 0.7, 'train': 0.3}, seed=rnd_seed)
 .sample(withReplacement=False, fraction=0.0006, seed=rnd_seed)
 .toPandas())

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Native Country,Income,data_set
0,30,Local-gov,352542.0,Bachelors,13.0,Divorced,Prof-specialty,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K,train
1,42,Local-gov,227065.0,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Male,0.0,0.0,22.0,United-States,<=50K,train
2,35,Private,334999.0,HS-grad,9.0,Married-civ-spouse,Sales,Husband,White,Male,0.0,0.0,45.0,United-States,<=50K,train
3,63,Self-emp-not-inc,28612.0,HS-grad,9.0,Widowed,Sales,Not-in-family,White,Male,0.0,0.0,70.0,United-States,<=50K,train
4,26,Private,181666.0,Assoc-acdm,12.0,Married-civ-spouse,Adm-clerical,Own-child,White,Female,0.0,0.0,40.0,?,<=50K,train
5,28,Private,204734.0,Some-college,10.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,40.0,United-States,<=50K,train
6,43,Private,248186.0,Assoc-acdm,12.0,Never-married,Adm-clerical,Not-in-family,White,Female,0.0,0.0,38.0,United-States,<=50K,train
7,58,Self-emp-inc,204021.0,Bachelors,13.0,Married-civ-spouse,Craft-repair,Husband,White,Male,15024.0,0.0,50.0,United-States,>50K.,test
8,23,Private,164901.0,Some-college,10.0,Never-married,Exec-managerial,Own-child,White,Male,0.0,0.0,20.0,United-States,<=50K.,test
9,53,Private,151411.0,HS-grad,9.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.,test


In [22]:
# The raw files mark unknown values with '?'; replace those with nulls.
adult_union_df = adult_union_df.replace(to_replace='?', value=None)

In [23]:
# The test file writes the income labels with a trailing period ('<=50K.' and
# '>50K.') instead of '<=50K' and '>50K'; normalise them to the training-set
# spelling. The replacement is restricted to the Income column so no other
# string column can be altered, and the dict form applies both substitutions
# in a single call.
adult_union_df = adult_union_df.replace(
    {'<=50K.': '<=50K', '>50K.': '>50K'},
    subset=['Income'],
)

In [24]:
# Preview a small stratified sample of the cleaned, combined data: keep a 0.3
# fraction of the 'train' rows and a 0.7 fraction of the 'test' rows, then a
# 0.0006 fraction of what remains. Both sampling steps are seeded with
# rnd_seed so the preview is reproducible across runs.
(adult_union_df
 .stat.sampleBy('data_set', fractions={'test': 0.7, 'train': 0.3}, seed=rnd_seed)
 .sample(withReplacement=False, fraction=0.0006, seed=rnd_seed)
 .toPandas())

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Native Country,Income,data_set
0,20,Private,179423.0,Some-college,10.0,Never-married,Transport-moving,Own-child,White,Female,0.0,0.0,40.0,United-States,<=50K,train
1,20,Private,209955.0,Some-college,10.0,Never-married,Other-service,Own-child,White,Male,0.0,0.0,25.0,United-States,<=50K,train
2,34,Private,169605.0,10th,6.0,Separated,Other-service,Unmarried,White,Female,0.0,0.0,36.0,United-States,<=50K,train
3,34,Private,107793.0,HS-grad,9.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,40.0,Germany,>50K,train
4,20,,169184.0,Some-college,10.0,Never-married,,Other-relative,Black,Female,0.0,0.0,40.0,United-States,<=50K,train
5,49,Private,200198.0,HS-grad,9.0,Married-civ-spouse,Other-service,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K,train
6,36,Private,218689.0,Masters,14.0,Married-civ-spouse,Sales,Husband,White,Male,0.0,1977.0,50.0,United-States,>50K,train
7,32,Private,174201.0,HS-grad,9.0,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0.0,0.0,38.0,United-States,>50K,test
8,29,Private,27436.0,HS-grad,9.0,Divorced,Other-service,Not-in-family,White,Female,0.0,0.0,35.0,United-States,<=50K,test
9,50,Self-emp-inc,181498.0,HS-grad,9.0,Married-civ-spouse,Sales,Husband,White,Male,0.0,0.0,40.0,United-States,<=50K,test


In [25]:
# Fit a StringIndexer on the combined data that encodes the 'Income' strings
# as numeric indices in a new 'Label' column.
income_indexer = StringIndexer(inputCol='Income', outputCol='Label').fit(adult_union_df)

In [26]:
# Income strings learned by the indexer, listed in index order.
income_indexer.labels

['<=50K', '>50K']

In [27]:
# Add the numeric 'Label' column produced by the fitted indexer.
# NOTE(review): this rebinds adult_union_df, so re-running only this cell will
# presumably fail because the 'Label' column already exists - confirm.
adult_union_df = income_indexer.transform(adult_union_df)

In [28]:
# Verify which numeric Label each Income string was mapped to.
adult_union_df.select("Income", "Label").distinct().show()

+------+-----+
|Income|Label|
+------+-----+
| <=50K|  0.0|
|  >50K|  1.0|
+------+-----+



In [29]:
# List every distinct (Education, Education Num) pair to see how they relate.
adult_union_df.select("Education", "Education Num").distinct().show()

+------------+-------------+
|   Education|Education Num|
+------------+-------------+
|   Preschool|          1.0|
|         9th|          5.0|
|   Assoc-voc|         11.0|
|   Bachelors|         13.0|
|     1st-4th|          2.0|
|     7th-8th|          4.0|
|        12th|          8.0|
|     5th-6th|          3.0|
|   Doctorate|         16.0|
| Prof-school|         15.0|
|  Assoc-acdm|         12.0|
|     Masters|         14.0|
|        11th|          7.0|
|     HS-grad|          9.0|
|Some-college|         10.0|
|        10th|          6.0|
+------------+-------------+



There is a one-to-one mapping between Education and Education Num. We will drop Education.

In [30]:
# Distinct values of the Relationship column.
adult_union_df.select("Relationship").distinct().show()

+--------------+
|  Relationship|
+--------------+
|     Own-child|
| Not-in-family|
|     Unmarried|
|          Wife|
|Other-relative|
|       Husband|
+--------------+



In [31]:
# Distinct values of the Marital Status column.
adult_union_df.select("Marital Status").distinct().show()

+--------------------+
|      Marital Status|
+--------------------+
|           Separated|
|       Never-married|
|Married-spouse-ab...|
|            Divorced|
|             Widowed|
|   Married-AF-spouse|
|  Married-civ-spouse|
+--------------------+



In [32]:
# Distinct values of the Race column.
adult_union_df.select("Race").distinct().show()

+------------------+
|              Race|
+------------------+
|             Other|
|Amer-Indian-Eskimo|
|             White|
|Asian-Pac-Islander|
|             Black|
+------------------+



In [33]:
# Distinct values of the Sex column.
adult_union_df.select("Sex").distinct().show()

+------+
|   Sex|
+------+
|Female|
|  Male|
+------+



In [34]:
# Distinct values of the Native Country column (show() prints the first 20 rows).
adult_union_df.select("Native Country").distinct().show()

+------------------+
|    Native Country|
+------------------+
|       Philippines|
|           Germany|
|          Cambodia|
|            France|
|            Greece|
|            Taiwan|
|              null|
|           Ecuador|
|         Nicaragua|
|              Hong|
|              Peru|
|             India|
|             China|
|             Italy|
|Holand-Netherlands|
|              Cuba|
|             South|
|              Iran|
|           Ireland|
|          Thailand|
+------------------+
only showing top 20 rows



In [35]:
# Preview the first five rows of the combined data. limit(5) is applied in
# Spark before converting to pandas, so only those rows are collected to the
# driver rather than the entire DataFrame.
adult_union_df.limit(5).toPandas()

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Native Country,Income,data_set,Label
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K,train,0.0
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K,train,0.0
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K,train,0.0
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K,train,0.0
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K,train,0.0


In [36]:
# Count the missing values in every column. A value is missing when it is
# null or, for floating-point columns only, NaN. isnan() is not applied to the
# remaining columns: on string columns it would depend on an implicit
# string-to-double cast.
float_columns = {name for name, dtype in adult_union_df.dtypes if dtype in ('double', 'float')}


def missing_value_count(column_name):
    """Return an aggregate column counting the null/NaN entries of `column_name`."""
    is_missing = col(column_name).isNull()
    if column_name in float_columns:
        is_missing = is_missing | F.isnan(col(column_name))
    # when() without otherwise() yields null for non-matching rows, and
    # count() skips nulls, so only the missing entries are counted.
    return F.count(F.when(is_missing, column_name)).alias(column_name)


adult_union_df.select([missing_value_count(c) for c in adult_union_df.columns]).toPandas()

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Native Country,Income,data_set,Label
0,0,2799,0,0,0,0,2809,0,0,0,0,0,0,857,0,0,0


In [39]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()