# Multiple Logistic Regression in Spark  - College Admission

### Overview
Predict college admission using Multiple Logistic Regression
 
### Builds on
None

### Run time
approx. 10-20 minutes

### Notes



In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


## Step 1: College Admission Data

Let's look at the college admission data.  Here, we have some student test scores, GPA, and Rank, followed by whether the student was admitted or not.


|gre  |gpa  |rank |  admitted |
|-----|-----|-----|-----------|
|380  |3.61 | 3   |    no     |
|660  |3.67 | 1   |    yes    |
|800  |4.0  | 1   |    yes    |
|640  |3.19 | 4   |    yes    |
|520  |2.93 | 4   |    no     |
|760  |3.0  | 2   |    yes    |

In [None]:
# Read the admissions CSV: the first row is used as the header and column
# types are inferred from the data rather than defaulting to strings.
admissions = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/data/college-admissions/admission-data.csv")
)
admissions.show()

## Step 2: Let's do a 4D plot!

We will use a 3d plot, and encode the fourth dimension as color.

In [None]:
# Bring the admissions table to the driver as a pandas DataFrame so that
# matplotlib can plot it.
admissions_pd = admissions.toPandas()

# Kept for older matplotlib versions, where this import is what makes
# projection='3d' available.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# The three predictors go on the spatial axes; the outcome is the fourth
# dimension, encoded as color.
# NOTE(review): column names taken from the data table / TODO text in this
# notebook ("gre", "gpa", "rank", "admit") - confirm against the CSV header.
x = admissions_pd['gre']
y = admissions_pd['gpa']
z = admissions_pd['rank']
c = admissions_pd['admit']

# Pass the colormap by name: plt.hot() returns None and changes the global
# default colormap as a side effect.
points = ax.scatter(x, y, z, c=c, cmap='hot')

ax.set_xlabel('gre')
ax.set_ylabel('gpa')
ax.set_zlabel('rank')
ax.set_title('College admissions: gre / gpa / rank, colored by admit')

colorbar = fig.colorbar(points, ax=ax)
colorbar.set_label('admit')

plt.show()

## Step 3: Convert dataframe to Spark and Prepare feature vector

We need to first convert the dataframe to Spark, and then prepare the feature vector.

**=> TODO: Select all columns except for "admit" to be in features**

**=> TODO: Make a new column called "label" with same value as "admit"**



In [None]:
# Combine the chosen input columns into a single vector column named "features".
# TODO(student): replace each "???" with the name of an input column.
assembler = VectorAssembler(inputCols=["???", "???","???"], outputCol="features")
featureVector = assembler.transform(admissions)
# Add a "label" column whose values are copied from an existing column.
# TODO(student): replace "???" with the name of the outcome column.
featureVector = featureVector.withColumn("label",featureVector["???"])
featureVector.show()


## Step 4: Split Data into training and Test

We will split our data into training and test so we can see how it performs.

**=> TODO: Use training / test split of 60%/40%**



In [None]:
# Split the data into train and test
# TODO(student): replace the two ??? placeholders with the split weights
# (training weight first, test weight second).
# The second argument (1234) is the value passed as randomSplit's seed.
splits = featureVector.randomSplit([???, ???], 1234)
train = splits[0]
test = splits[1]




## Step 5: Run logistic regression

**=> TODO: Run with 50 iterations**



In [None]:

# Configure the logistic regression estimator.
# TODO(student): replace ??? with the maximum number of iterations.
lr = LogisticRegression(maxIter=???, regParam=0.3, elasticNetParam=0.8)


# Fit the model on the training split
lrModel = lr.fit(train)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

In [None]:
# Show the predictions DataFrame stored on the fitted model's summary.
trainingPredictions = lrModel.summary.predictions
trainingPredictions.show()

The output lists approval & estimated probabilities

In [None]:

# The fitted model carries a summary object; later cells reuse it under
# this name.
trainingSummary = lrModel.summary

# Value of the objective at each iteration, printed one per line.
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)
print()

# Area under the ROC curve, as reported by the summary.
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Find the threshold whose F-Measure is the largest one in the table.
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = (
    fMeasure
    .groupBy()
    .max('F-Measure')
    .select('max(F-Measure)')
    .head()
)
bestThreshold = (
    fMeasure
    .where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)'])
    .select('threshold')
    .head()['threshold']
)

# NOTE(review): the threshold is set on the estimator `lr`, not on the
# already-fitted `lrModel`; presumably predictions made with `lrModel`
# are unaffected unless the model is refit - confirm.
lr.setThreshold(bestThreshold)

In [None]:
# ROC curve taken from the model's training summary.
roc_df = trainingSummary.roc.toPandas()

# Explicit figure/axes instead of the pyplot state machine, so the cell does
# not depend on whichever figure happens to be current.
fig, ax = plt.subplots()
ax.plot(roc_df['FPR'], roc_df['TPR'], label="model")
# Diagonal reference line from (0, 0) to (1, 1), drawn in red.
ax.plot([0.0, 1.0], [0.0, 1.0], 'r', label="diagonal (FPR = TPR)")
ax.set_xlabel("FPR")
ax.set_ylabel("TPR")
ax.set_title("ROC Curve")
ax.legend()

# End on plt.show() so the cell output is the figure only, without a stray
# [<Line2D ...>] repr from a trailing plot call.
plt.show()

## Step 6: Run on the test data

In [None]:
# Step 6: Run on the test data - apply the fitted model to the test split.

predictions = lrModel.transform(test)


## Step 7: Calculate Accuracy on Test Data


In [None]:
# Evaluator configured for the "accuracy" metric, comparing the "prediction"
# column against the "label" column.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

# Evaluate the test-set predictions and report the result.
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

## Step 8: Run some predictions on new data

Let's take some new data, and run predictions on that.

In [None]:
# Three made-up applicants, one per row, with the same input columns used
# elsewhere in this notebook.
newdata = pd.DataFrame(
    {
        'gre': [600, 700, 800],
        'gpa': [4.0, 3.5, 3.2],
        'rank': [1, 2, 3],
    }
)
print(newdata)

# Convert to a Spark DataFrame, build the feature vector with the existing
# assembler, then run the fitted model and show its output.
spark_newdata = spark.createDataFrame(newdata)
newfeatures = assembler.transform(spark_newdata)
newPredictions = lrModel.transform(newfeatures)
newPredictions.show()