<a href="https://colab.research.google.com/github/cagBRT/PySpark/blob/master/PySparkDelayedFlights.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference article on running Spark in Google Colab: https://medium.com/@rmache/big-data-with-spark-in-google-colab-7c046e24b3


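Check for the latest version of Spark at https://pypi.org/project/pyspark/#history. The setup cell below is pinned to Spark 2.4.5; to use a different release, change the version in the download URL, the archive name, and `SPARK_HOME`.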




In [0]:
#Setting up Apache Spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

print("If no error - everything is working")

from pyspark import SparkContext
sc = SparkContext.getOrCreate()

In [0]:
# Mount your Google Drive inside the Colab filesystem so later cells can
# read and write files below /content/gdrive.
from google.colab import drive

drive.mount(mountpoint='/content/gdrive')

Get the data: download the 2007 and 2008 flight data files (bz2-compressed CSV) into your Google Drive.

In [0]:
# Download the 2007 dataset straight into the root of the mounted Google
# Drive ("My Drive").

import requests

# 2007 data

file_url = "http://stat-computing.org/dataexpo/2009/2007.csv.bz2"

# stream=True keeps the archive out of memory; the context manager releases
# the connection when done, and the timeout prevents an indefinite hang.
with requests.get(file_url, stream=True, timeout=60) as r:
    # Fail loudly on an HTTP error instead of saving an error page as .bz2.
    r.raise_for_status()
    with open("/content/gdrive/My Drive/2007.csv.bz2", "wb") as file:
        # 1 MiB chunks: far fewer write calls than 1 KiB chunks.
        for block in r.iter_content(chunk_size=1024 * 1024):
            if block:
                file.write(block)


In [0]:
# 2008 data: downloaded into the root of the mounted Google Drive
# ("My Drive"), next to the 2007 file.

file_url = "http://stat-computing.org/dataexpo/2009/2008.csv.bz2"

# stream=True keeps the archive out of memory; the context manager releases
# the connection when done, and the timeout prevents an indefinite hang.
with requests.get(file_url, stream=True, timeout=60) as r:
    # Fail loudly on an HTTP error instead of saving an error page as .bz2.
    r.raise_for_status()
    with open("/content/gdrive/My Drive/2008.csv.bz2", "wb") as file:
        # 1 MiB chunks: far fewer write calls than 1 KiB chunks.
        for block in r.iter_content(chunk_size=1024 * 1024):
            if block:
                file.write(block)

In [0]:
# Tools we need to connect to the Spark server, load our data,
# clean it and prepare it
from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.sql.functions import isnan, when, count, col
# Set up constants
# Locations of the two compressed CSV files on the mounted Google Drive;
# these match the paths the download cells write to.
CSV_2007= "/content/gdrive/My Drive/2007.csv.bz2" 
CSV_2008= "/content/gdrive/My Drive/2008.csv.bz2"
# Application name and master URL handed to the SparkSession builder.
APP_NAME = "Flight Delays"
SPARK_URL = "local[*]"
# Seed value, presumably meant to make random steps reproducible.
RANDOM_SEED = 141109
# Fraction of the rows assigned to the training split; the rest is test data.
TRAINING_DATA_RATIO = 0.7
# Random-forest settings: number of trees, plus (judging by the names)
# the maximum tree depth and the number of bins.
RF_NUM_TREES = 8
RF_MAX_DEPTH = 4
RF_NUM_BINS = 32

In [0]:
# Connect to the Spark server (getOrCreate reuses a session that is already
# running in this kernel).

spark = SparkSession.builder.appName(APP_NAME).master(SPARK_URL).getOrCreate()

# Load datasets: the first line of each file is a header, and column types
# are inferred from the data. Both years share one set of reader options.

csv_options = {"header": "true", "inferSchema": "true"}
df_2007 = spark.read.options(**csv_options).csv(CSV_2007)
df_2008 = spark.read.options(**csv_options).csv(CSV_2008)

# We concatenate both datasets. `union` replaces `unionAll`, which is
# deprecated since Spark 2.0; like `unionAll`, it matches columns by
# position and keeps duplicate rows.

df = df_2007.union(df_2008)

In [0]:
# Report the size of the combined DataFrame. df.count() is a Spark action,
# so this cell runs a job over the data rather than reading a stored value.
print(f"The shape is {df.count():d} rows by {len(df.columns):d} columns.")

In [0]:
# Build one aggregate per column that counts the rows whose value is NaN or
# NULL, evaluate them all in a single select, and bring the one-row result
# back as a list holding one {column: count} dict.
missing_per_column = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in df.columns
]
null_counts = (
    df.select(missing_per_column)
    .toPandas()
    .to_dict(orient='records')
)
print(f"We have {sum(null_counts[0].values()):d} null values in this dataset.")

In [0]:
# Remove the CancellationCode column, then discard every row that still
# contains a null. The column is dropped by name rather than via the
# df.CancellationCode attribute: dropping a missing column by name is a
# no-op, so re-running this cell no longer raises AttributeError.
df = df.drop("CancellationCode")
df = df.na.drop()

In [0]:
# Display the (column name, data type) pairs of the DataFrame; the types
# come from schema inference at load time.
df.dtypes

In [0]:
# List the distinct values present in the Cancelled column.
[row[0] for row in df.select('Cancelled').distinct().collect()]

In [0]:
# Input columns that are assembled into the model's feature vector.
feature_cols = [
    'Year',
    'Month',
    'DayofMonth',
    'DayOfWeek',
    'CRSDepTime',
    'CRSArrTime',
    'FlightNum',
    'Distance',
    'Diverted',
]

In [0]:
# Combine the selected feature columns into a single vector column named
# "features", appended to the DataFrame.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
)
df = assembler.transform(df)


In [0]:
# Preview the first five rows of the label column next to the assembled
# feature vector.
df.select("Cancelled", "features").show(5)


In [0]:
# Generate a labelIndexer: maps the values of "Cancelled" to label indices.
# It is fitted on the full DataFrame so every label value gets an index.
labelIndexer = StringIndexer(inputCol="Cancelled", outputCol="indexedLabel").fit(df)

# Generate the indexed feature vector: features with at most 4 distinct
# values are treated as categorical, the others as continuous.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(df)

# Split the data into training and test sets. The seed makes the split
# reproducible across runs.
(trainingData, testData) = df.randomSplit(
    [TRAINING_DATA_RATIO, 1 - TRAINING_DATA_RATIO],
    seed=RANDOM_SEED,
)

# Configure the RandomForest estimator (training happens when the pipeline
# is fitted). All RF_* constants are passed through so the declared depth
# and bin settings actually take effect, and the seed fixes the forest's
# own randomness.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    numTrees=RF_NUM_TREES,
    maxDepth=RF_MAX_DEPTH,
    maxBins=RF_NUM_BINS,
    seed=RANDOM_SEED,
)

# Chain indexers and the forest models in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

In [0]:
# Train model: fit every pipeline stage (both indexers and the random
# forest) on the training split.
model = pipeline.fit(trainingData)
# Make predictions: apply the fitted pipeline to the held-out test split.
predictions = model.transform(testData)

In [0]:
# Score the test-set predictions: accuracy is computed by comparing the
# "prediction" column against "indexedLabel"; the error is its complement.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

print(f"Test Error = {(1.0 - accuracy):g}")
print(f"Accuracy = {accuracy:g}")