# Exploratory Data Analysis 4
This notebook exports data suitable for input to pyHON. **Don't forget to kill the session at the end with `spark.stop()`!**

In [1]:
# findspark.init() is called before the pyspark imports so that they resolve
# against the Spark client installation at this path.
import findspark
findspark.init('/usr/hdp/current/spark2-client')

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Session configuration for the YARN cluster.
# The "yarn-client" master URL has been deprecated since Spark 2.0 and is
# rejected by Spark 3.x; the supported spelling is master "yarn" together with
# an explicit client deploy mode, which selects the same behaviour.
conf = (SparkConf().setMaster("yarn").setAppName("AirlineDataAnalysis")
        .set("spark.submit.deployMode", "client")
        .set("spark.yarn.queue", "eecs598w19")
        .set("spark.executor.memory", "2g")
        .set("spark.executor.instances", "4")
        .set("spark.driver.memory", "2g")
        # Dynamic allocation is enabled together with the external shuffle service.
        .set("spark.shuffle.service.enabled", "true")
        .set("spark.dynamicAllocation.enabled", "true")
        .set("spark.dynamicAllocation.minExecutors", "1")
        )

# getOrCreate() returns the already-running session if this cell is re-executed.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import signac
from tqdm import tqdm
from util import hdfs_fn

# Raise the pandas column display limit so wide tables are not truncated
pd.set_option("display.max_columns", 50)

# Handle to the signac project, as resolved by get_project() with its defaults
project = signac.get_project()

In [3]:
# Take the first signac job matching year 2011, quarter 1.
# next(iter(...)) replaces the Python-2-style `.next()` method call, which
# signac deprecated on its job cursor; a StopIteration here means that no job
# matched the filter.
job = next(iter(project.find_jobs({"year": 2011, "quarter": 1})))

# Read that job's Coupon.csv through Spark, using the first row as the header
# and letting Spark infer the column types.
df = spark.read.csv(hdfs_fn(job, 'Coupon.csv'), header=True, inferSchema=True)

In [4]:
# Show each column name with the type Spark inferred for it (inferSchema=True).
# NOTE(review): the trailing `_c36` string column in the output presumably comes
# from a trailing delimiter in the CSV rows — confirm before exporting.
df.dtypes

[('ItinID', 'bigint'),
 ('MktID', 'bigint'),
 ('SeqNum', 'int'),
 ('Coupons', 'int'),
 ('Year', 'int'),
 ('OriginAirportID', 'int'),
 ('OriginAirportSeqID', 'int'),
 ('OriginCityMarketID', 'int'),
 ('Quarter', 'int'),
 ('Origin', 'string'),
 ('OriginCountry', 'string'),
 ('OriginStateFips', 'int'),
 ('OriginState', 'string'),
 ('OriginStateName', 'string'),
 ('OriginWac', 'int'),
 ('DestAirportID', 'int'),
 ('DestAirportSeqID', 'int'),
 ('DestCityMarketID', 'int'),
 ('Dest', 'string'),
 ('DestCountry', 'string'),
 ('DestStateFips', 'int'),
 ('DestState', 'string'),
 ('DestStateName', 'string'),
 ('DestWac', 'int'),
 ('Break', 'string'),
 ('CouponType', 'string'),
 ('TkCarrier', 'string'),
 ('OpCarrier', 'string'),
 ('RPCarrier', 'string'),
 ('Passengers', 'double'),
 ('FareClass', 'string'),
 ('Distance', 'double'),
 ('DistanceGroup', 'int'),
 ('Gateway', 'double'),
 ('ItinGeoType', 'int'),
 ('CouponGeoType', 'int'),
 ('_c36', 'string')]

In [5]:
# Stop the Spark session so its YARN resources are released, as the notebook
# header asks. Re-run the session setup cell to start a new session afterwards.
spark.stop()