<a href="https://colab.research.google.com/github/cagBRT/PySpark/blob/master/PySparkDelayedFlights.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://medium.com/@rmache/big-data-with-spark-in-google-colab-7c046e24b3


https://medium.com/grabngoinfo/install-pyspark-3-on-google-colab-the-easy-way-577ec4a2bcd8

Check PyPI for the latest released version of PySpark: https://pypi.org/project/pyspark/#history

https://www.transtats.bts.gov/OT_Delay/OT_DelayCause1.asp


https://grabngoinfo.com/install-pyspark-3-on-google-colab-the-easy-way/



In [None]:
# `!unset SPARK_HOME` only changes the short-lived subshell that `!` spawns, so
# the kernel process keeps the variable. Remove it from this process instead so
# a stale SPARK_HOME cannot point PySpark at a different Spark installation.
import os

os.environ.pop("SPARK_HOME", None);

In [None]:
# Clone the cagBRT/PySpark repository into ./PySpark (relative to the current
# working directory). Note that the AIRPORTS / AIRLINES constants further down
# point at /content/cloned-repo, which is a separate checkout made by a later cell.
!git clone https://github.com/cagBRT/PySpark.git

In [None]:
# Install PySpark together with findspark, which a later cell imports.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install -q pyspark findspark

In [None]:
# Start (or reuse) a Spark session running against the "local[*]" master.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Leave the session as the last expression so the notebook renders its summary.
spark

In [None]:
#Import a Spark function from library
from pyspark.sql.functions import col

In [None]:
import os

# findspark's job is to put a Spark installation on sys.path. A pip-installed
# PySpark is importable without it, so treat the helper as optional instead of
# failing the whole cell when the package is absent.
try:
    import findspark
except ImportError:
    print("findspark is not installed - relying on the pip-installed pyspark")
else:
    findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession

# getOrCreate() hands back the already running session/context when one exists.
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = SparkContext.getOrCreate()

# Report success only after both handles have actually been obtained.
print("If no error - everything is working")

In [None]:
# Mount Google Drive at /content/gdrive so files stored in Drive are reachable
# from the notebook. The FLIGHTS constant defined further down is built on this
# exact mount point ("/content/gdrive/My Drive/...").
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Clone the repository to the exact location the AIRPORTS / AIRLINES constants
# point at (/content/cloned-repo), independent of the current working directory.
# The -l/-s flags are omitted because they only apply when cloning from a local
# repository, and the directory test lets this cell be re-run without git
# failing on an already existing checkout.
![ -d /content/cloned-repo ] || git clone https://github.com/cagBRT/PySpark.git /content/cloned-repo

## Get the data

Mount Google Drive; the notebook expects `flights.csv` at the top level of My Drive.

In [None]:
# Mount Drive at the mount point that the FLIGHTS path
# ("/content/gdrive/My Drive/flights.csv") is built on; a '/gdrive' mount point
# does not match that path. The working directory is deliberately left alone:
# every data path in this notebook is absolute, and the repository checkout
# referenced by AIRPORTS / AIRLINES lives under /content.
from google.colab import drive

drive.mount('/content/gdrive')

In [None]:
# Tools we need to connect to the Spark server, load our data,
# clean it and prepare it
from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.sql.functions import isnan, when, count, col
# --- Input files ------------------------------------------------------------
FLIGHTS = "/content/gdrive/My Drive/flights.csv"   # read from the mounted Google Drive
AIRPORTS = "/content/cloned-repo/airports.csv"     # file inside the cloned repository
AIRLINES = "/content/cloned-repo/airlines.csv"     # file inside the cloned repository

# --- Spark session ----------------------------------------------------------
APP_NAME = "Flight Delays"
SPARK_URL = "local[*]"

# --- Train/test split -------------------------------------------------------
RANDOM_SEED = 141109
TRAINING_DATA_RATIO = 0.7

# --- Random-forest hyperparameters ------------------------------------------
RF_NUM_TREES = 8
RF_MAX_DEPTH = 4
RF_NUM_BINS = 32

In [None]:
# Connect to Spark. The master must be the Spark master URL (SPARK_URL), not
# the path of the CSV file.
spark = SparkSession.builder.appName(APP_NAME).master(SPARK_URL).getOrCreate()

# Load the flights data set: the first line supplies the column names and the
# column types are inferred from the values.
flights_df = spark.read.csv(FLIGHTS, header=True, inferSchema=True)

In [None]:
# Count rows and columns separately, then report both in one line.
row_count = flights_df.count()
column_count = len(flights_df.columns)
print(f"The shape is {row_count:d} rows by {column_count:d} columns.")

In [None]:
# Count missing values per column. isnan() is only meaningful for floating-point
# columns, so it is applied to those alone; every other column is checked for
# NULL only. This avoids relying on an implicit cast of non-numeric columns.
nan_capable = {name for name, dtype in flights_df.dtypes if dtype in ("float", "double")}


def _missing_count(name):
    """Return an aggregate column counting the NULL (and, for floats, NaN) values of `name`."""
    is_missing = col(name).isNull()
    if name in nan_capable:
        is_missing = is_missing | isnan(col(name))
    return count(when(is_missing, name)).alias(name)


# `null_counts` stays a list holding one {column name: count} dict, so indexing
# with [0] keeps working; the detour through pandas is not needed for that.
null_counts = [
    row.asDict()
    for row in flights_df.select([_missing_count(c) for c in flights_df.columns]).collect()
]
print(f"We have {sum(null_counts[0].values()):d} null values in this dataset.")

In [None]:
# Display the column names of the loaded flights DataFrame.
flights_df.columns

In [None]:
# Drop CANCELLATION_REASON by name: unlike the attribute form
# (flights_df.CANCELLATION_REASON), a string name is simply ignored when the
# column is already gone, so this cell can be re-run safely.
flights_df = flights_df.drop("CANCELLATION_REASON")

# Remove every row that still contains a NULL in any remaining column.
# NOTE(review): confirm this does not wipe out all rows of one CANCELLED class;
# the distinct-value check on CANCELLED further down is the place to verify.
flights_df = flights_df.na.drop()

In [None]:
# Display the (column name, data type) pairs of the current flights DataFrame.
flights_df.dtypes

In [None]:
# Collect the distinct values present in the CANCELLED column as a plain Python list.
[row[0] for row in flights_df.select('CANCELLED').distinct().collect()]

In [None]:
# Columns that are assembled into the model's feature vector, in this order.
feature_cols = [
    'YEAR',
    'MONTH',
    'DAY',
    'DAY_OF_WEEK',
    'DEPARTURE_TIME',
    'ARRIVAL_DELAY',
    'FLIGHT_NUMBER',
    'DISTANCE',
    'DIVERTED',
]

In [None]:
# Pack the feature columns into a single vector column named "features".
# A "features" column left over from an earlier run of this cell is dropped
# first (a no-op when absent); otherwise the assembler rejects the already
# existing output column and the cell cannot be re-run.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
flights_df = assembler.transform(flights_df.drop("features"))


In [None]:
# Preview the label next to the assembled feature vector. The label column is
# referenced by its exact name, CANCELLED, as in the rest of the notebook, so
# the lookup does not depend on case-insensitive column resolution.
flights_df.select("CANCELLED", "features").show(5)


In [None]:
# Fit an indexer that maps the CANCELLED label to the "indexedLabel" column.
labelIndexer = StringIndexer(inputCol="CANCELLED", outputCol="indexedLabel").fit(flights_df)

# Fit an indexer for the feature vector (maxCategories=4), writing "indexedFeatures".
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(flights_df)

# Split the data into training and test sets. Passing the seed makes the split
# reproducible across runs.
(trainingData, testData) = flights_df.randomSplit(
    [TRAINING_DATA_RATIO, 1 - TRAINING_DATA_RATIO], seed=RANDOM_SEED
)

# Configure (not yet train) the random forest. All RF_* constants and the seed
# are passed explicitly so the declared hyperparameters are the ones in effect.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    numTrees=RF_NUM_TREES,
    maxDepth=RF_MAX_DEPTH,
    maxBins=RF_NUM_BINS,
    seed=RANDOM_SEED,
)

# Chain the two fitted indexers and the forest into one Pipeline.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

In [None]:
# Fit the whole pipeline (both indexers plus the random forest) on the training split.
model = pipeline.fit(trainingData)
# Apply the fitted pipeline to the test split; the evaluation cell below reads
# the "prediction" and "indexedLabel" columns of this result.
predictions = model.transform(testData)

In [None]:
# Score the test-set predictions: the "accuracy" metric compares the
# "prediction" column against "indexedLabel".
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)

# Report the error rate first, then the accuracy, one per line.
print(f"Test Error = {(1.0 - accuracy):g}", f"Accuracy = {accuracy:g}", sep="\n")