# Predicting Graduate School Admissions with PySpark ML

### Yu chien (Calvin) Ma

## Installing Dependencies & Creating a SparkSession


In [2]:
#create a sparksession

In [3]:
# Locate the local Spark installation so that `pyspark` can be imported in this kernel.
import findspark
findspark.init()
# Last expression of the cell: displays the Spark home that findspark resolved.
findspark.find()

'C:\\Users\\calvi\\anaconda3\\lib\\site-packages\\pyspark'

In [4]:
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .appName('spark')
    .master('local[*]')  # run locally on all available cores
    .config('spark.sql.execution.arrow.pyspark.enabled', True)
    .config('spark.sql.session.timeZone', 'UTC')
    .config('spark.driver.memory', '16G')
    .config('spark.ui.showConsoleProgress', True)
    # Eager evaluation lets a bare DataFrame expression render as a table in the notebook.
    .config('spark.sql.repl.eagerEval.enabled', True)
    .getOrCreate()
)

## Cloning & Exploring the Dataset

In [86]:
# Clone the repository that hosts the dataset into the current working directory.
# NOTE(review): the cell that defines `path` reads the CSV from an absolute local
# path, not from this clone -- confirm which copy of the data is intended.
# NOTE(review): the captured output names a different target directory than this
# URL would produce, so the output looks stale -- re-run to refresh it.
!git clone https://github.com/calvinma888/PySparkML_GradAdmissionsChance.git

Cloning into 'PySparkMLGradAdmissions'...


In [13]:
# Location of the admissions CSV.
# The original cell hard-coded one machine's absolute path. The
# GRAD_ADMISSIONS_CSV environment variable, when set, now overrides it so the
# notebook can run elsewhere; when unset, the original path is used unchanged.
import os

path = os.environ.get(
    "GRAD_ADMISSIONS_CSV",
    r"C:\Users\calvi\Documents\Portfolio Projects\admission_dataset\Grad_Admissions.csv",
)

In [14]:
# Load the CSV into a Spark DataFrame: the first line supplies the column
# names and column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(path)
)

In [15]:
# Preview the first 5 rows of the raw dataframe.
df.show(5)

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
only showing top 5 rows



In [16]:
# Report the dataframe's shape as a (row count, column count) tuple.
n_rows = df.count()
n_cols = len(df.columns)
print((n_rows, n_cols))

(500, 9)


In [18]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [21]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# describe() returns a DataFrame; it renders as a table here because it is the
# cell's last expression and eager evaluation is enabled on the session.
df.describe()

summary,Serial No,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,316.472,107.192,3.114,3.374,3.484,8.576440000000003,0.56,0.7217399999999996
stddev,144.4818327679989,11.295148372354712,6.081867659564538,1.143511800759815,0.9910036207566072,0.9254495738978192,0.6048128003332054,0.4968840786090358,0.1411404039503022
min,1.0,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
max,500.0,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


## Data Cleaning

In [22]:
# Drop 'Serial No': per the summary statistics it simply runs from 1 to 500,
# i.e. it looks like a row identifier rather than a predictor.
# Note that `df` is rebound here, so every later cell sees the dataframe without it.
df = df.drop('Serial No')

In [23]:
# Display the dataframe after the drop to confirm 'Serial No' is gone
# (show() with no argument prints the first 20 rows).
df.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|      302|        102|                1|2.0|1.5| 8.0|       0|            0.5|
|      323|        108|                3

In [24]:
# Count the null values in every column.
# The original ran a separate filter + count job per column; here all columns
# are aggregated in a single pass over the data and the same
# "<column>: <count>" lines are printed.
from pyspark.sql import functions as F

null_counts = df.select(
    # when() yields NULL for non-null cells, and count() ignores NULLs,
    # so each aggregate is the number of null cells in that column.
    [F.count(F.when(df[name].isNull(), 1)).alias(name) for name in df.columns]
).first()

for name in df.columns:
    print(name + ":", null_counts[name])

GRE Score: 0
TOEFL Score: 0
University Rating: 0
SOP: 0
LOR: 0
CGPA: 0
Research: 0
Chance of Admit: 0


## Correlation Analysis & Feature Selection

In [27]:
# Correlation of every column with the label column, printed one line per
# column (the label itself is included, hence the trailing 1.0).
label_col = 'Chance of Admit'
message = 'Correlation to chance of admit for {} is {}'
for column_name in df.columns:
    coefficient = df.stat.corr(label_col, column_name)
    print(message.format(column_name, coefficient))

Correlation to chance of admit for GRE Score is 0.8103506354632598
Correlation to chance of admit for TOEFL Score is 0.7922276143050823
Correlation to chance of admit for University Rating is 0.6901323687886892
Correlation to chance of admit for SOP is 0.6841365241316723
Correlation to chance of admit for LOR is 0.6453645135280112
Correlation to chance of admit for CGPA is 0.882412574904574
Correlation to chance of admit for Research is 0.5458710294711379
Correlation to chance of admit for Chance of Admit is 1.0


In [29]:
# Feature selection: keep the three columns most correlated with the label
# and pack them, in this order, into a single vector column named 'features'.
from pyspark.ml.feature import VectorAssembler

feature_cols = ['GRE Score', 'TOEFL Score', 'CGPA']
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)

In [30]:
# Apply the assembler: returns the dataframe with the extra 'features' vector column.
# (Nothing is displayed by this cell; the result is shown in the next one.)
output_data = assembler.transform(df)

In [31]:
# The selected features appear as the last column, 'features'
# (show() with no argument prints the first 20 rows).
output_data.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|          features|
+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,118.0,9.65]|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,107.0,8.87]|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,104.0,8.0]|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,110.0,8.67]|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,103.0,8.21]|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,115.0,9.34]|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75| [321.0,109.0,8.2]|
|      308

## Building the Linear Regression Model

In [32]:
# Import LinearRegression and keep only what the model needs:
# the assembled 'features' vector and the 'Chance of Admit' label.
from pyspark.ml.regression import LinearRegression
final_data = output_data.select('features','Chance of Admit')

In [33]:
# Confirm the modelling dataframe holds just the feature vector and the label.
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [35]:
# Randomly split the data into training and test sets with weights 0.7 / 0.3.
# The split is random per row, so the resulting sizes are approximate; the
# fixed seed is there to make the split repeatable across runs.
train, test = final_data.randomSplit([0.7,0.3], seed =100)

In [41]:
# Configure the linear-regression estimator, then fit it on the training split.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Chance of Admit',
)
model = lr.fit(train)

In [42]:
# Print the fitted coefficients and intercept.
# The coefficients follow the order of the assembled feature vector:
# GRE Score, TOEFL Score, CGPA.
print('coefficients',model.coefficients)
print('intercept',model.intercept)

coefficients [0.002470874627298453,0.0027173207275365614,0.14657855695408284]
intercept -1.6081093437904634


In [53]:
# Get the model's summary. This is the summary computed on the data the model
# was fitted on (the training split), not on the held-out test split.
summary = model.summary

In [57]:
# Print RMSE and R2 from the model summary. These are training-set metrics;
# the test-set RMSE is computed separately with RegressionEvaluator further down.
print ('RMSE: ',summary.rootMeanSquaredError)
print ('R2: ',summary.r2)

RMSE:  0.06365691008911853
R2:  0.7911588829079539


## Evaluating & Saving the Model

In [58]:
# Score the held-out test split: adds a 'prediction' column next to the label.
predictions = model.transform(test)

In [59]:
# Display the first 20 test rows with actual ('Chance of Admit') vs. predicted values.
predictions.show(20)

+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
| [294.0,93.0,7.36]|           0.46|0.44985680347823154|
| [296.0,95.0,7.54]|           0.44|0.48661733443963673|
|  [296.0,97.0,7.8]|           0.49| 0.5301624007027712|
|[296.0,101.0,7.68]|            0.6| 0.5234422567784276|
| [297.0,96.0,7.43]|           0.34|0.47568188852952265|
| [297.0,100.0,7.9]|           0.52| 0.5554430932080878|
|[297.0,101.0,7.67]|           0.57| 0.5244473458361849|
| [298.0,99.0,7.46]|           0.53|  0.490702082048053|
|[298.0,101.0,7.86]|           0.54| 0.5547681462847591|
|[299.0,100.0,7.42]|           0.42|0.49002713512472473|
|[299.0,100.0,7.88]|           0.51| 0.5574532713236025|
|[299.0,100.0,7.89]|           0.59| 0.5589190568931435|
|[299.0,100.0,8.02]|           0.63| 0.5779742692971741|
| [299.0,106.0,8.4]|           0.64| 0.6499780453049455|
| [300.0,95.0,8.22]|           

In [64]:
#evaluate the model 
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Chance of Admit', metricName='rmse')

In [66]:
# RMSE on the held-out test predictions (compare with the training RMSE printed earlier).
print ('RMSE on the test data:',evaluator.evaluate(predictions))

RMSE on the test data: 0.05930006971640625


In [79]:
# Save the fitted model (left commented out so a re-run does not overwrite a saved model).
# NOTE(review): the target is an absolute path on the author's machine -- adjust before enabling.
# model.write().overwrite().save(r"C:\Users\calvi\Documents\Portfolio Projects\predicting_grad_admissions_model")

In [80]:
# Load a previously saved model (left commented out).
# NOTE(review): this loads from a path relative to the working directory, whereas the
# save cell writes to an absolute path -- confirm both refer to the same location.
# from pyspark.ml.regression import LinearRegressionModel
# model = LinearRegressionModel.load('predicting_grad_admissions_model')