# Final Project - Pre-Processing

In [1]:
import re
import ast
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from pyspark.sql import types, Row, Column
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Create (or reuse) the notebook's SparkSession and keep a handle on its
# SparkContext.
from pyspark.sql import SparkSession

app_name = "finalProject"
master = "local[*]"
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)
sc = spark.sparkContext

In [None]:
spark

In [None]:
# read in raw text set and write to parquet
#train = spark.read.option('header', 'false').csv('data/train.txt', sep='\t')
#train.write.format('parquet').save('data/train.parquet')

In [None]:
# Presumably the bucket count for the modulo hashing of categorical values;
# the larger alternative is kept commented out.
# NOTE(review): none of the functions visible in this notebook reference
# MODULO_NUMBER — confirm whether it is still needed.
# MODULO_NUMBER = 100000
MODULO_NUMBER = 10000

In [None]:
# read in the parquet copy of the raw training data
train = spark.read.parquet('gs://w261_desa2/notebooks/data/train.parquet')

In [None]:
# Rename every column by position in a single projection:
#   position 0      -> 'label'
#   positions 1-13  -> 'n0' .. 'n12'  (numeric features)
#   positions 14+   -> 'c0', 'c1', ... (categorical features)
# One toDF() call replaces the three passes of withColumnRenamed (the
# underscore-stripping pass was redundant, since every name it produced was
# overwritten afterwards) and keeps the query plan shallow.
# Assumes the label is the first column, as in the headerless CSV import.
new_names = []
for position in range(len(train.columns)):
    if position == 0:
        new_names.append('label')
    elif position < 14:
        new_names.append('n' + str(position - 1))
    else:
        new_names.append('c' + str(position - 14))
train = train.toDF(*new_names)

train.columns

In [None]:
# cast the first 14 columns (label + numeric features) to float in one
# projection, leaving every other column untouched and in place
float_cols = set(train.columns[:14])
train = train.select([
    F.col(name).cast('float').alias(name) if name in float_cols else F.col(name)
    for name in train.columns
])
train.printSchema()

In [None]:
# grab a 1% sample without replacement; the sample is seeded (same seed as
# the randomSplit further down) so that re-running the notebook on the same
# input draws the same rows instead of a fresh random subset each time
train = train.sample(False, 0.01, 666)
#s = train
#s.count()

In [None]:
#s.head()

# Train/Test Split

In [None]:
# on sample
#trainSample, testSample = s.randomSplit([9.0, 1.0], 666)
#trainSample = trainSample.cache()
#testSample = testSample.cache()

In [None]:
#trainSample.count(), testSample.count()

In [None]:
# on full
# Split into train/test with relative weights 9:1; 666 is passed as the seed.
# NOTE(review): `train` has already been reduced by the sampling cell above,
# so "full" here means whatever that cell left in `train`.
train, test = train.randomSplit([9.0, 1.0], 666)

In [None]:
train = train.cache()
test = test.cache()

In [None]:
train.count(), test.count()

### Feature Engineering Section Intro

The main feature engineering challenge with this dataset is how to reduce the number of features from the number that would result if we naively one-hot encoded each categorical variable. We experimented with two distinct solutions to this problem:
- Modulo-based hashing function: We drew inspiration from the writeup of one of the Kaggle competition winners, who used a hashing function to reduce the number of categorical variables. For this implementation, we kept all 26 of the initial categorical features, but replaced the infrequent values (any that occurred 10 or fewer times in the training set) with a single shared "infreq" category, and then took the variables that still had very large numbers of categories remaining and hashed them to a smaller size (effectively randomly binning multiple values for that feature into a single dummy feature). This methodology is explained in more detail below.
- Random forest feature selection: Our desire to experiment with interaction terms in our model led us to the need to reduce our features much more significantly. We decided to try using feature importances from a Random Forest model in order to completely eliminate some of the categorical variables from the final training set. This also gave us a good excuse to learn how to use another model in the Spark ML package. 

# Random Forest Feature Selector

(I feel like maybe this code for training the RF should go in an appendix or something, and in the main notebook, we should just show the code for loading in the trained RF, getting the features, and then including them)

In [None]:
# VectorAssembler fails on null inputs, so zero-fill the numeric feature
# columns (positions 1-13) of both splits first
train = train.fillna(0, subset=train.columns[1:14])
test = test.fillna(0, subset=test.columns[1:14])

In [None]:
# String-index the label and every column whose name contains 'c'.  Each
# indexer is fitted on train only and then applied to both splits; the new
# columns are named "<source>_indexed".
categorical_cols = [name for name in train.columns if 'label' in name or 'c' in name]
indexed_cols = []
for source_col in categorical_cols:
    target_col = "{}_indexed".format(source_col)
    indexer_model = StringIndexer(
        inputCol=source_col,
        outputCol=target_col,
        handleInvalid='keep',
    ).fit(train)
    train, test = indexer_model.transform(train), indexer_model.transform(test)
    indexed_cols.append(target_col)

In [None]:
train.head()

In [None]:
# Feature columns are selected by name: anything containing 'n' except the
# indexed label.  That covers the n* numeric columns and, because "indexed"
# itself contains an 'n', the *_indexed categorical columns.
cols_for_features = [
    name for name in train.columns
    if name != "label_indexed" and 'n' in name
]
print(cols_for_features)
assembler = VectorAssembler(inputCols=cols_for_features, outputCol="features")
newTrain, newTest = (assembler.transform(df) for df in (train, test))

In [None]:
newTrain.head()

In [None]:
# Fit a 5-tree random forest on the assembled features and report how long
# the fit took (wall-clock seconds).
# maxBins is presumably sized to exceed the largest categorical cardinality
# — TODO confirm.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label_indexed",
    numTrees=5,
    maxBins=1300000,
)
start_time = time.time()
fit_rf = rf.fit(newTrain)
end_time = time.time()
print("Total time taken = {}".format(end_time - start_time))

In [None]:
predictions = fit_rf.transform(newTest)

In [None]:
### probably need to write a function to get accuracy 

In [None]:
# Score the test-set predictions from their rawPrediction column.  No
# labelCol or metricName is passed, so the evaluator's defaults apply.
# NOTE(review): the forest was trained against "label_indexed" — confirm the
# evaluator's default label column is the intended ground truth here.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

In [None]:
evaluator.getMetricName()

In [None]:
predictions.filter(predictions["prediction"]==1).head(100)

In [None]:
# Convert indexed labels back to original labels.
# IndexToString is imported here because the notebook's import cell does not
# include it.  No `labelIndexer` object exists in this notebook (the label's
# StringIndexer model is created inside a loop and not kept), so the label
# strings are read from the nominal-attribute metadata that the indexer
# attached to the "label_indexed" column.
from pyspark.ml.feature import IndexToString

label_values = train.schema["label_indexed"].metadata["ml_attr"]["vals"]
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=label_values)

In [None]:
fit_rf.save('gs://w261_desa2/notebooks/models/emBaselineRf1PercentSample')

In [None]:
fit_rf.featureImportances

# Normalize numerical data

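While normalizing the numeric data isn't strictly necessary for logistic regression, it helps gradient-based optimization. When features are on very different scales, the cost surface is much steeper along the parameters of the large-scale features than along the others, so a single step size either overshoots in the steep directions or makes very slow progress in the flat ones. Rescaling every numeric feature to a common range (here [0, 1], using the training set's minimum and maximum) makes the cost surface better conditioned, so the optimizer converges in fewer iterations. It also puts the regularization penalty on a comparable footing across features.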

In [None]:
def normalizeNumeric(trainDf, testDf):
    """Min-max scale the numeric columns (positions 1-13) of both frames.

    Each column's minimum and maximum are read from ``trainDf`` only (via
    ``describe()``) and the same rescaling ``(x - min) / (max - min)`` is
    applied to ``trainDf`` and ``testDf``.  Any nulls present in those
    columns after scaling are replaced by 0.

    Returns:
        The rescaled ``(trainDf, testDf)`` pair.
    """
    summary = trainDf[trainDf.columns[1:14]].describe()

    def statisticAsFloats(statistic):
        # describe() reports its values as strings; skip the leading
        # 'summary' label and convert the per-column values to floats.
        row = summary.filter(summary['summary'] == statistic).collect()[0]
        return [float(value) for value in row[1:]]

    maxes = statisticAsFloats('max')
    mins = statisticAsFloats('min')

    for position, name in enumerate(trainDf.columns[1:14]):
        low = mins[position]
        span = maxes[position] - low
        trainDf = trainDf.withColumn(name, (trainDf[name] - low) / span)
        testDf = testDf.withColumn(name, (testDf[name] - low) / span)

    trainDf = trainDf.na.fill(0, subset=trainDf.columns[1:14])
    testDf = testDf.na.fill(0, subset=testDf.columns[1:14])
    return trainDf, testDf

## on sample

In [None]:
# on sample
#trainSample, testSample = normalizeNumeric(trainSample, testSample)

In [None]:
#trainSample.head()

In [None]:
#testSample.head()

## on full

In [None]:
train, test = normalizeNumeric(train, test)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.cache()
test.cache()

# Categorical manipulation

In [None]:
def createFeatureVector(trainDf, testDf, n=10, modulo=10000):
    """Bin, index, one-hot encode and assemble the model features.

    Columns from position 14 onward are treated as categorical hex-string
    features.  For each of them:

    1. values that occur at most ``n`` times in ``trainDf`` are replaced by
       the literal ``"infreq"``;
    2. if more than ``modulo`` distinct values remain in ``trainDf``, every
       value other than ``"infreq"``/null is replaced by
       ``str(int(value, 16) % modulo)`` (nulls become the string ``"None"``).

    The columns are then string-indexed (``<col>_idx``), one-hot encoded
    (``<col>_OHE``) and assembled, together with every column whose name
    contains ``'n'``, into a ``features`` vector column.  All counts and
    fitted transformers come from ``trainDf`` only and are applied
    unchanged to ``testDf``.

    Args:
        trainDf: training DataFrame; categorical columns start at index 14.
        testDf: test DataFrame with the same column layout.
        n: a value is "infrequent" when its training count is <= ``n``.
        modulo: number of hash buckets, and also the distinct-value
            threshold above which a column is hashed.

    Returns:
        ``(trainDf, testDf)``, both cached, with the ``*_idx``, ``*_OHE``
        and ``features`` columns added.
    """
    # withColumn() below replaces columns in place, so this list stays valid
    # for the whole function.
    categoricalCols = trainDf.columns[14:]

    def findInfrequentValues(frame, c):
        # Values of column `c` seen at most `n` times in `frame`.
        counts = frame.groupBy(c).count()
        infrequentValues = counts.filter(counts['count'] <= n)
        return infrequentValues.agg(F.collect_set(c)).collect()[0][0]

    def makeReplaceInfreqUdf(infreqValues):
        # Bind this column's values explicitly instead of reading a variable
        # that the loop reassigns: a UDF shared across iterations may be
        # serialized with whichever value was current at pickling time.
        # A frozenset also makes the per-row membership test O(1).
        infreqSet = frozenset(infreqValues)

        def replaceInfrequentValues(row_value):
            return "infreq" if row_value in infreqSet else row_value

        return F.udf(replaceInfrequentValues)

    def hashValues(row):
        # Leave the "infreq" marker and nulls as (stringified) themselves;
        # otherwise bucket the integer value of the hex label.
        if row is None or row == "infreq":
            return str(row)
        return str(int('0x' + row, 16) % modulo)

    hash_udf = F.udf(hashValues)

    # eliminate infrequent values, then hash the columns that are still wide
    for c in categoricalCols:
        print("eliminating infreqs/hashing column {}".format(c))
        replace_infreq_udf = makeReplaceInfreqUdf(findInfrequentValues(trainDf, c))
        trainDf = trainDf.withColumn(c, replace_infreq_udf(trainDf[c]))
        testDf = testDf.withColumn(c, replace_infreq_udf(testDf[c]))
        unique_values = trainDf.agg(F.countDistinct(trainDf[c])).collect()[0][0]
        if unique_values > modulo:
            trainDf = trainDf.withColumn(c, hash_udf(trainDf[c]))
            testDf = testDf.withColumn(c, hash_udf(testDf[c]))

    # index the (possibly hashed) values into categories
    idxCols = []
    for c in categoricalCols:
        newCol = c + '_idx'
        indexer = StringIndexer(inputCol=c, outputCol=newCol, handleInvalid='keep')
        f = indexer.fit(trainDf)
        trainDf = f.transform(trainDf)
        testDf = f.transform(testDf)
        idxCols.append(newCol)

    # One-hot encode the categorical indices.  The index columns are tracked
    # explicitly rather than located with a fixed column offset.
    oheCols = [c + '_OHE' for c in categoricalCols]
    encoder = OneHotEncoderEstimator(inputCols=idxCols, outputCols=oheCols)
    e = encoder.fit(trainDf)
    trainDf = e.transform(trainDf)
    testDf = e.transform(testDf)

    # assemble all features into a single vector column; inputs are picked by
    # name (any column containing 'n', plus the one-hot columns)
    cols = [c for c in trainDf.columns if 'n' in c or 'OHE' in c]
    v = VectorAssembler(inputCols=cols, outputCol="features")
    trainDf = v.transform(trainDf)
    testDf = v.transform(testDf)
    return trainDf.cache(), testDf.cache()

## on full

In [None]:
train, test = createFeatureVector(train, test)

In [None]:
train.head(1000)

In [None]:
train.select('features').head()

In [None]:
train.columns

In [None]:
test.select('features').head()

In [None]:
# Persist the engineered training set as parquet.
# NOTE(review): this URI has a single slash and no bucket name, unlike the
# gs://w261_desa2/... paths used elsewhere in this notebook — confirm the
# destination before running.  The file name also says "Mod50k" although
# createFeatureVector was called with its default arguments — confirm.
train.write.format('parquet').save('gs:/notebooks/data/e412fullTrainMod50k.parquet')

In [None]:
# Persist the engineered test set as parquet.
# NOTE(review): this URI has a single slash and no bucket name, unlike the
# gs://w261_desa2/... paths used elsewhere in this notebook — confirm the
# destination before running.
test.write.format('parquet').save('gs:/notebooks/data/e412fullTestMod50k.parquet')

# baseline model run

In [None]:
# Iteration cap for the baseline logistic regression.  Kept small so that a
# proof-of-concept run finishes quickly; raise it for longer runs later.
# (Author's note: the default value is 100.)
MAXITER = 10

In [None]:
# Baseline binomial logistic regression on the assembled feature vector.
# Standardization is switched off — presumably because the numeric columns
# were already min-max scaled earlier in the notebook (TODO confirm).
lr = LogisticRegression(
    labelCol='label',
    featuresCol='features',
    family='binomial',
    maxIter=MAXITER,
    regParam=0.01,
    standardization=False,
)

In [None]:
#%%time
model = lr.fit(train)

In [None]:
trainingSummary = model.summary
history = trainingSummary.objectiveHistory

In [None]:
plt.plot(history)

In [None]:
trainingSummary.accuracy

In [None]:
testSummary = model.evaluate(test)

In [None]:
testSummary.accuracy

In [None]:
model.save('gs://w261_desa2/notebooks/models/baselineModel')

In [None]:
sameModel = LogisticRegressionModel.load('gs://w261_desa2/notebooks/models/baselineModel')

# pipeline implementation

In [None]:
def hashCategoricals(trainDf, testDf, modulo=10000):
    """Hash every categorical hex-string column into ``modulo`` buckets.

    Columns from position 14 onward of ``trainDf`` are rewritten, in both
    frames, as ``str(int(value, 16) % modulo)``.  Null values become the
    string ``"None"``.

    Args:
        trainDf: training DataFrame; categorical columns start at index 14.
        testDf: test DataFrame containing the same categorical columns.
        modulo: number of hash buckets (the default keeps the previously
            hard-coded 10000).

    Returns:
        ``(trainDf, testDf)``, both cached.
    """
    def hashValues(row):
        if row is None:
            return str(row)
        # integer value of the hex label, reduced to one of `modulo` buckets
        return str(int('0x' + row, 16) % modulo)

    # create the udf object from the helper function
    udf_object = F.udf(hashValues)

    # hash all hex strings in both train and test
    for c in trainDf.columns[14:]:
        trainDf = trainDf.withColumn(c, udf_object(trainDf[c]))
        testDf = testDf.withColumn(c, udf_object(testDf[c]))

    return trainDf.cache(), testDf.cache()

In [None]:
# NOTE(review): in the visible part of this notebook, trainSample and
# testSample are only assigned inside a commented-out cell (the "on sample"
# randomSplit), so this call raises NameError unless that cell is restored
# and run first.
trainSample, testSample = hashCategoricals(trainSample, testSample)

In [None]:
trainSample.head()

In [None]:
testSample.head()

In [None]:
# pipeline implementation
def createFeatureVector2(trainDf, testDf):
    """Index and one-hot encode the categorical columns with one Pipeline.

    For every column from position 14 onward of ``trainDf`` the pipeline
    gets a StringIndexer stage (output ``<col>_idx``, unseen values kept)
    followed by a one-hot encoder stage (output ``<col>_OHE``, last
    category not dropped).  The pipeline is fitted on ``trainDf`` only and
    then applied to both frames.

    This function does not assemble a single ``features`` vector column.

    Returns:
        ``(trainDf, testDf)``, both cached, with the ``*_idx`` and
        ``*_OHE`` columns added.
    """
    stages = []
    for c in trainDf.columns[14:]:
        strIdxCol = c + '_idx'
        # build the name from the source column; str.strip('_idx') removes a
        # set of characters from both ends, not the '_idx' suffix
        oheCol = c + '_OHE'
        indexer = StringIndexer(inputCol=c, outputCol=strIdxCol, handleInvalid='keep')
        # inputCols/outputCols are list-valued params, so wrap the single
        # column names in lists (a bare string is rejected)
        OHE = OneHotEncoderEstimator(inputCols=[strIdxCol], outputCols=[oheCol], dropLast=False)
        stages += [indexer, OHE]

    # TODO: append a VectorAssembler stage (numeric + *_OHE columns) here if
    # a single 'features' column is needed from this pipeline.
    pipe = Pipeline(stages=stages)
    model = pipe.fit(trainDf)
    trainDf = model.transform(trainDf)
    testDf = model.transform(testDf)

    return trainDf.cache(), testDf.cache()

In [None]:
trainSample.head()

In [None]:
trainer, tester = createFeatureVector2(trainSample, testSample)

In [None]:
trainer.columns[-1], tester.columns[-1]

In [None]:
trainer.head()

In [None]:
tester.head()

In [None]:
trainSample.write.format('parquet').save('data/trainSample.parquet')