# CV & Grid Search
## Platform: Spark, colab.research.google.com

In [0]:
# install Spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
# init Spark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.1-bin-hadoop2.7"
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [0]:
import pandas as pd
import time

from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml import Pipeline as SparkPipeline
from pyspark.ml.classification import LogisticRegression as SparkLogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from sklearn.linear_model import LogisticRegression as sklearnLogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.pipeline import Pipeline as sklearnPipeline

In [12]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [13]:
flights_dfs = spark.read.csv("/content/gdrive/My Drive/Colab Notebooks/SparkAzureTutorial/data/flights.csv", header=True, inferSchema=True)
flights_dfs.describe().show(5, False)

+-------+------------------+-----------------+-------+------------------+------------------+------------------+------------------+
|summary|DayofMonth        |DayOfWeek        |Carrier|OriginAirportID   |DestAirportID     |DepDelay          |ArrDelay          |
+-------+------------------+-----------------+-------+------------------+------------------+------------------+------------------+
|count  |2702218           |2702218          |2702218|2702218           |2702218           |2702218           |2702218           |
|mean   |15.797897875004903|3.899480352806472|null   |12742.597593162358|12743.000197985506|10.510732294729737|6.6550108096386005|
|stddev |8.7988350691642   |1.985924603367557|null   |1501.8408475102513|1501.8014309297723|36.02975608466093 |38.547584236791245|
|min    |1                 |1                |9E     |10140             |10140             |-63               |-94               |
|max    |31                |7                |YV     |15376             |15376     

In [14]:
data = flights_dfs.select("DayofMonth", "DayOfWeek", "OriginAirportID", "DestAirportID", "DepDelay", ((col("ArrDelay") > 15).cast("Int").alias("label")))
splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1].withColumnRenamed("label", "trueLabel")
test.show(5, truncate=False)
print("Train len: {}, test len: {}".format(train.count(), test.count()))

+----------+---------+---------------+-------------+--------+---------+
|DayofMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|trueLabel|
+----------+---------+---------------+-------------+--------+---------+
|1         |1        |10140          |10397        |-2      |0        |
|1         |1        |10140          |10397        |0       |0        |
|1         |1        |10140          |10821        |8       |0        |
|1         |1        |10140          |11259        |-5      |0        |
|1         |1        |10140          |11259        |-5      |0        |
+----------+---------+---------------+-------------+--------+---------+
only showing top 5 rows

Train len: 1891290, test len: 810928


In [15]:
assembler = VectorAssembler(inputCols = ["DayofMonth", "DayOfWeek", "OriginAirportID", "DestAirportID", "DepDelay"], outputCol="features")
lr = SparkLogisticRegression(labelCol="label", featuresCol="features")
pipeline = SparkPipeline(stages=[assembler, lr])

print("Training model...")
start_time = time.time()
pipelineModel = pipeline.fit(train)
print("--- {} seconds ---".format(time.time() - start_time))
print("Evaluating model...")
prediction = pipelineModel.transform(test)

predicted = prediction.select("features", "trueLabel", "prediction")
predicted.show(20, truncate=False)

Training model...
--- 28.047178268432617 seconds ---
Evaluating model...
+-------------------------------+---------+----------+
|features                       |trueLabel|prediction|
+-------------------------------+---------+----------+
|[1.0,1.0,10140.0,10397.0,-2.0] |0        |0.0       |
|[1.0,1.0,10140.0,10397.0,0.0]  |0        |0.0       |
|[1.0,1.0,10140.0,10821.0,8.0]  |0        |0.0       |
|[1.0,1.0,10140.0,11259.0,-5.0] |0        |0.0       |
|[1.0,1.0,10140.0,11259.0,-5.0] |0        |0.0       |
|[1.0,1.0,10140.0,11259.0,0.0]  |0        |0.0       |
|[1.0,1.0,10140.0,11259.0,21.0] |1        |1.0       |
|[1.0,1.0,10140.0,11259.0,35.0] |1        |1.0       |
|[1.0,1.0,10140.0,11292.0,-4.0] |0        |0.0       |
|[1.0,1.0,10140.0,11292.0,-2.0] |0        |0.0       |
|[1.0,1.0,10140.0,11292.0,4.0]  |0        |0.0       |
|[1.0,1.0,10140.0,11292.0,22.0] |1        |1.0       |
|[1.0,1.0,10140.0,11292.0,41.0] |1        |1.0       |
|[1.0,1.0,10140.0,11298.0,-1.0] |0        |0.0 

In [0]:
def check_accuracy(y_test, y_pred):
    print("Post grid search\nAccuracy: {}".format(accuracy_score(y_test, y_pred)))
    print("Precision: {}".format(precision_score(y_test, y_pred)))
    print("Recall: {}".format(recall_score(y_test, y_pred)))
    print("F1: {}".format(f1_score(y_test, y_pred)))
    cm = confusion_matrix(y_test, y_pred)
    tn, fn, fp, tp = cm[0][0], cm[1][0], cm[0][1], cm[1][1]
    print("Confusion\t Predicted")
    print("\t\t True \t\t False")
    print("Actual True \t {} \t {}".format(tp, fn))
    print("Actual False \t {} \t\t {}".format(fp, tn))
    print("Confusion precision {}".format(tp/(tp + fp)))
    print("Confusion recall {}".format(tp/(tp + fn)))

In [17]:
predicted_spark = predicted.toPandas()
y_test = predicted_spark["trueLabel"]
y_pred = predicted_spark["prediction"]
check_accuracy(y_test, y_pred)

Post grid search
Accuracy: 0.9265791784227453
Precision: 0.8982061945254795
Recall: 0.7137933373205004
F1: 0.7954513441552864
Confusion	 Predicted
		 True 		 False
Actual True 	 115768 	 46419
Actual False 	 13120 		 635621
Confusion precision 0.8982061945254795
Confusion recall 0.7137933373205004


In [18]:
print("Starting grid search ...")
start_time = time.time()
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.3, 0.1, 0.01]).addGrid(lr.maxIter, [10, 5]).addGrid(lr.threshold, [0.35, 0.30]).build()
cv = CrossValidator(estimator=pipeline, evaluator=BinaryClassificationEvaluator(), estimatorParamMaps=paramGrid, numFolds=2)
model = cv.fit(train)
print("--- {} seconds ---".format(time.time() - start_time))

Starting grid search ...
--- 471.94495725631714 seconds ---


In [19]:
prediction = model.transform(test)
predicted = prediction.select("features", "trueLabel", "prediction")
predicted_spark = predicted.toPandas()
y_test = predicted_spark["trueLabel"]
y_pred = predicted_spark["prediction"]
check_accuracy(y_test, y_pred)

Post grid search
Accuracy: 0.9258257206558412
Precision: 0.8769403541954502
Recall: 0.7318280749998458
F1: 0.7978395891589589
Confusion	 Predicted
		 True 		 False
Actual True 	 118693 	 43494
Actual False 	 16656 		 632085
Confusion precision 0.8769403541954502
Confusion recall 0.7318280749998458


## Platform: Pandas, scikit-learn, colab.research.google.com

In [20]:
flights_df = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/SparkAzureTutorial/data/flights.csv", header=0)
flights_df.describe()

Unnamed: 0,DayofMonth,DayOfWeek,OriginAirportID,DestAirportID,DepDelay,ArrDelay
count,2702218.0,2702218.0,2702218.0,2702218.0,2702218.0,2702218.0
mean,15.7979,3.89948,12742.6,12743.0,10.51073,6.655011
std,8.798835,1.985925,1501.841,1501.801,36.02976,38.54758
min,1.0,1.0,10140.0,10140.0,-63.0,-94.0
25%,8.0,2.0,11292.0,11292.0,-4.0,-11.0
50%,16.0,4.0,12892.0,12892.0,-1.0,-3.0
75%,23.0,6.0,14057.0,14057.0,9.0,10.0
max,31.0,7.0,15376.0,15376.0,1863.0,1845.0


In [21]:
# create labels
flights_df["late"] = 0
flights_df.loc[flights_df["ArrDelay"] > 15,"late"] = 1
flights_df.head(5)

Unnamed: 0,DayofMonth,DayOfWeek,Carrier,OriginAirportID,DestAirportID,DepDelay,ArrDelay,late
0,19,5,DL,11433,13303,-3,1,0
1,19,5,DL,14869,12478,0,-8,0
2,19,5,DL,14057,14869,-4,-15,0
3,19,5,DL,15016,11433,28,24,1
4,19,5,DL,11193,12892,-6,-11,0


In [0]:
X = flights_df.loc[:, ["DayofMonth", "DayOfWeek", "OriginAirportID", 
                       "DestAirportID", "DepDelay"]]
y = flights_df.loc[:, "late"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [0]:
class EncodeNumericalCol():
    def __init__(self, col_name):
        self.col_name = col_name
    
    def fit(self, X, y=None):
        self.ss = StandardScaler()
        self.ss.fit(X.loc[:, self.col_name].values.reshape(-1, 1))
        return self

    def transform(self, X):
        X[self.col_name] = self.ss.transform(X[self.col_name].values.reshape(-1, 1))
        return X

In [24]:
enc_dep_delay = EncodeNumericalCol(col_name="DepDelay")
model = sklearnLogisticRegression()
model.fit(X_train, y_train)
pipeline = sklearnPipeline([("enc_dep_delay", enc_dep_delay),
                            ("model", model)])



In [25]:
start_time = time.time()
print("Training model...")
pipeline.fit(X_train, y_train)
print("--- {} seconds ---".format(time.time() - start_time))
print("Evaluating model...")
y_pred = pipeline.predict(X_test)
check_accuracy(y_test, y_pred)

Training model...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


--- 12.057843446731567 seconds ---
Evaluating model...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


Post grid search
Accuracy: 0.907768427441141
Precision: 0.9791140566589107
Recall: 0.5492193685669042
F1: 0.7037056117171989
Confusion	 Predicted
		 True 		 False
Actual True 	 88789 	 72875
Actual False 	 1894 		 647108
Confusion precision 0.9791140566589107
Confusion recall 0.5492193685669042


In [28]:
parameters = {"solver": ["liblinear", "lbfgs"], "max_iter": [25, 100], 
              "C": [0.3, 1.0]}
model = GridSearchCV(sklearnLogisticRegression(), 
                     parameters, cv=3, scoring="f1")
start_time = time.time()
print("Starting grid search ...")
model.fit(X_train, y_train)
print("--- {} seconds ---".format(time.time() - start_time))
print(model.get_params)
model.cv_results_

Starting grid search ...




--- 278.1793808937073 seconds ---
<bound method BaseEstimator.get_params of GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'solver': ['liblinear', 'lbfgs'], 'max_iter': [25, 100], 'C': [0.3, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)>




{'mean_fit_time': array([ 8.69591196,  5.89076058,  8.79040122, 15.87993503,  8.40336013,
         5.89400689,  8.49388742, 16.43181904]),
 'mean_score_time': array([0.23999047, 0.23759659, 0.23413078, 0.24489792, 0.23123884,
        0.23237046, 0.22783597, 0.26777633]),
 'mean_test_score': array([0.73090004, 0.13596726, 0.73090004, 0.79155675, 0.71924914,
        0.13596726, 0.71924914, 0.79390517]),
 'mean_train_score': array([0.73130684, 0.13543593, 0.73130684, 0.79169317, 0.7199702 ,
        0.13543593, 0.7199702 , 0.79394061]),
 'param_C': masked_array(data=[0.3, 0.3, 0.3, 0.3, 1.0, 1.0, 1.0, 1.0],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_iter': masked_array(data=[25, 25, 100, 100, 25, 25, 100, 100],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_solver': masked_array(data=['liblinear', 'lbfgs'

In [29]:
y_pred = model.predict(X_test)
check_accuracy(y_test, y_pred)

Post grid search
Accuracy: 0.9265850547574463
Precision: 0.8958250990056807
Recall: 0.715007670229612
F1: 0.795267925021586
Confusion	 Predicted
		 True 		 False
Actual True 	 115591 	 46073
Actual False 	 13442 		 635560
Confusion precision 0.8958250990056807
Confusion recall 0.715007670229612
