In [None]:
# Fit a binary logistic regression model to the baby-name dataset.
# The model predicts a person's sex from their age, name, and the state they were born in.
# The training data comes from baby-names/names-classifier.

In [None]:
# Read every names_*.csv file into one DataFrame, letting Spark infer the
# column types from the data and take the column names from the header row.
path = "/FileStore/tables/streaming/names_*.csv"
reader_options = {"format": "csv", "sep": ",", "inferSchema": True, "header": True}
autoschema = spark.read.load(path, **reader_options)

# Keep the inferred schema available under its own name.
dataSchema = autoschema.schema

# Row count of the loaded data (displayed as the cell output).
autoschema.count()

In [None]:
# First, prepare each of the input features.
# Age is a numeric feature, but state and name are not; they have to be
# converted into numeric vectors before the model can be trained.

# A StringIndexer followed by a one-hot encoder converts the name, state, and
# sex columns into numeric vectors. This cell does the indexing step.
#https://stackoverflow.com/questions/36942233/apply-stringindexer-to-several-columns-in-a-pyspark-dataframe

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

# Index every column except `date`. Walk the DataFrame's own column list
# rather than a set difference: a set has no defined iteration order, so the
# order of the generated *_index columns could differ from run to run.
columns_to_index = [column for column in autoschema.columns if column != 'date']

# Hand the unfitted indexers to the pipeline so that each one is fitted
# exactly once, by pipeline.fit().
pipeline = Pipeline(stages=[
    StringIndexer(inputCol=column, outputCol=column+"_index")
    for column in columns_to_index
])
pipeline_model = pipeline.fit(autoschema)

# `indexers` keeps holding the fitted indexer models, one per indexed column.
indexers = pipeline_model.stages

df_r = pipeline_model.transform(autoschema)

df_r.show()


In [None]:
#https://stackoverflow.com/questions/32982425/encode-and-assemble-multiple-features-in-pyspark
# One-hot encode the indexed name, sex, and state columns.
#
# Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder (keeping the
# inputCols/outputCols interface), so fall back to the new name when the old
# one can no longer be imported. On Spark 2.x the original class is used.
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

encoder = OneHotEncoderEstimator(
    inputCols=["name_index", "sex_index", "state_index"],
    outputCols=["name_hot", "sex_hot", "state_hot"],
)

# `df_e` is the fitted encoder model; `df_ee` is the data with the *_hot
# vector columns appended.
df_e = encoder.fit(df_r)
df_ee = df_e.transform(df_r)
df_ee.show()

In [None]:
# Use the VectorAssembler to combine the encoded predictor columns into a
# single `features` vector.
from pyspark.ml.feature import VectorAssembler

# Only predictors belong in the feature vector. `sex_hot` is deliberately left
# out: it is the one-hot encoding of `sex_index`, the column that becomes the
# label, so including it would leak the target into the features and make the
# reported ROC score meaningless.
# NOTE(review): the stated goal also lists age as a predictor, but no age
# column is assembled here -- confirm the column name in the data and add it.
feature_columns = ["name_hot", "state_hot"]

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

df_vector = assembler.transform(df_ee)

df_vector.show()

In [None]:
# The final dataset needs a `features` column holding the prepared vector and
# a `label` column holding the sex of the person. The indexed sex column
# becomes that label.
df_vector = df_vector.withColumnRenamed(existing="sex_index", new="label")

df_vector.show()

In [None]:
# Keep only the two columns used for fitting: the label and the feature vector.
model_columns = ["label", "features"]
thedf = df_vector.select(*model_columns)
thedf.show()

In [None]:
#2. Fit and Evaluate the Model

# Fit a logistic regression model with the parameters
# LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8), then
# report the area under the ROC curve for the model.

#https://spark.apache.org/docs/2.1.1/ml-classification-regression.html

from pyspark.ml.classification import LogisticRegression

# Seed the 80/20 split so that re-running the notebook produces the same
# training and test sets instead of a different random split on each run.
SPLIT_SEED = 42
(training, test) = thedf.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model on the training portion only.
lrModel = lr.fit(training)

# Print the fitted coefficients and intercept, read through the
# matrix/vector accessors of the model.
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out test rows with the fitted model.
test_prediction = lrModel.transform(test)

# No metric is passed to the evaluator, so its default is used; the label
# printed below assumes that default is the area under the ROC curve.
evaluator = BinaryClassificationEvaluator()
evaluation = evaluator.evaluate(test_prediction)

print("evaluation (area under ROC): %f" % (evaluation,))