# Import Dependencies & Start a SparkSession


In [14]:
# Libraries used by the modelling and evaluation cells further down.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression, LinearRegressionModel
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session named "sparkML" that every later cell uses.
spark = (
    SparkSession.builder
    .appName("sparkML")
    .getOrCreate()
)


# Clone & Explore dataset

In [3]:
#clone the dataset
! git clone https://github.com/education454/admission_dataset

fatal: destination path 'admission_dataset' already exists and is not an empty directory.


In [2]:
# List the cloned directory to confirm the dataset CSV file is present.
! ls admission_dataset/

Admission_Predict_Ver1.1.csv


24/06/10 18:12:48 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [3]:
# Load the admissions CSV into a Spark DataFrame: the first row supplies the
# column names and the column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('admission_dataset/Admission_Predict_Ver1.1.csv')
)

In [4]:
# Preview the first five rows of the raw data.
df.show(n=5)

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
only showing top 5 rows



In [5]:
# Report the DataFrame's dimensions as "<rows> <columns>".
row_count = df.count()
column_count = len(df.columns)
print(row_count, column_count)

500 9


In [8]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [9]:
# Compute the summary statistics for every column, then display them.
summary_stats = df.describe()
summary_stats.show()

24/06/10 14:14:42 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|        Serial No|         GRE Score|      TOEFL Score|University Rating|               SOP|               LOR|              CGPA|          Research|    Chance of Admit|
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

# Data Cleaning

In [6]:
# Drop "Serial No"; it appears to be just a running row index (1, 2, 3, ...)
# and is not used as a model feature.
# NOTE: `df` is rebound here, so every later cell sees the frame without this column.
df = df.drop("Serial No")

In [7]:
# Display the DataFrame after the column drop (show() prints up to 20 rows by default).
df.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|      302|        102|                1|2.0|1.5| 8.0|       0|            0.5|
|      323|        108|                3

In [8]:
# Count the null entries in every column and print one "<column>: <count>" line each.
for column_name in df.columns:
    null_rows = df.filter(df[column_name].isNull())
    print(column_name + ":", null_rows.count())

# If any nulls are reported, the affected rows could be removed with
# df = df.na.drop()   (na.drop() returns a new DataFrame, so it must be reassigned).

GRE Score: 0
TOEFL Score: 0
University Rating: 0
SOP: 0
LOR: 0
CGPA: 0
Research: 0
Chance of Admit: 0


# Correlation Analysis & Feature Selection

In [39]:
# Correlation of every column with the target. The target itself is included
# in the loop, which is why the last line reports 1.0.
target_column = "Chance of Admit"
report_line = 'Correlation col vs. chance of admit for {} is {}'
for feature in df.columns:
    correlation = df.stat.corr(target_column, feature)
    print(report_line.format(feature, correlation))

Correlation col vs. chance of admit for GRE Score is 0.8103506354632598
Correlation col vs. chance of admit for TOEFL Score is 0.7922276143050823
Correlation col vs. chance of admit for University Rating is 0.6901323687886892
Correlation col vs. chance of admit for SOP is 0.6841365241316723
Correlation col vs. chance of admit for LOR is 0.6453645135280112
Correlation col vs. chance of admit for CGPA is 0.882412574904574
Correlation col vs. chance of admit for Research is 0.5458710294711379
Correlation col vs. chance of admit for Chance of Admit is 1.0


In [9]:
# feature selection
# VectorAssembler has to be imported for this cell to run: the import was
# commented out and no other visible cell provides it, so a fresh kernel
# would fail here with a NameError.
from pyspark.ml.feature import VectorAssembler

# Combine the three columns most strongly correlated with "Chance of Admit"
# (CGPA, GRE Score, TOEFL Score) into a single vector column named "features".
assembler = VectorAssembler(inputCols=['GRE Score','TOEFL Score','CGPA'], outputCol='features')

In [10]:
# Append the assembled "features" vector column to the data, then preview the result.
output_data = assembler.transform(df)
output_data.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|          features|
+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,118.0,9.65]|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,107.0,8.87]|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,104.0,8.0]|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,110.0,8.67]|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,103.0,8.21]|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,115.0,9.34]|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75| [321.0,109.0,8.2]|
|      308

# Build the Linear Regression Model

In [11]:
# Import LinearRegression (the notebook's first cell imports this name as well).
from pyspark.ml.regression import LinearRegression

In [12]:
# Keep only the feature vector and the label column for modelling.
model_columns = ['features', 'Chance of Admit']
final_data = output_data.select(*model_columns)

In [45]:
# Print the schema of the modelling frame: a "features" vector plus the label column.
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [13]:
# Split the data into roughly 70% training and 30% test rows.
# A fixed seed makes the split, and therefore every coefficient and metric
# computed from it, reproducible across kernel restarts.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [48]:
# Configure the linear-regression estimator; no training happens until fit() is called.
models = LinearRegression(
    labelCol='Chance of Admit',
    featuresCol='features',
)

In [49]:
# Train the model: fit the configured estimator on the training split.
model = models.fit(train)

24/06/10 16:18:20 WARN Instrumentation: [c8c4cde8] regParam is zero, which might cause numerical instability and overfitting.
24/06/10 16:18:20 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/06/10 16:18:20 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/06/10 16:18:20 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [50]:
# Print the fitted coefficients (one per assembled feature) and the intercept.
fitted_parameters = (
    ('coefficients:', model.coefficients),
    ('intercept:', model.intercept),
)
for label, value in fitted_parameters:
    print(label, value)

coefficients: [0.002345191517329855,0.00399346951952843,0.13680450832377303]
intercept: -1.6220523591377327


In [55]:
# Training summary of the fitted model; the metrics it exposes are computed
# on the training data, not on the test split.
summary = model.summary

In [57]:
# Print the RMSE and R2 recorded in the model's training summary.
training_metrics = {
    "RMSE:": summary.rootMeanSquaredError,
    "R2:": summary.r2,
}
for metric_label, metric_value in training_metrics.items():
    print(metric_label, metric_value)

RMSE: 0.060371673132144456
R2: 0.8157135573777037


# Evaluate & Save the Model

In [16]:
# Apply the fitted model to the test split; this adds a "prediction" column.
predictions = model.transform(test)

In [17]:
# Preview the predictions next to the true "Chance of Admit" values.
predictions.show()


+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
| [294.0,95.0,7.64]|           0.49| 0.4919999949060714|
| [295.0,99.0,7.57]|           0.37|  0.500742748918851|
|[295.0,101.0,7.86]|           0.69| 0.5484029953718019|
| [296.0,95.0,7.54]|           0.44| 0.4830099271083539|
|[296.0,101.0,7.68]|            0.6| 0.5261233753908525|
| [297.0,96.0,7.43]|           0.34|  0.474300092229597|
| [297.0,96.0,7.89]|           0.43| 0.5372301660585326|
| [297.0,100.0,7.9]|           0.52| 0.5545720892198844|
| [298.0,92.0,7.88]|           0.51| 0.5222334344145112|
|  [298.0,98.0,7.5]|           0.44|0.49420853836864764|
|[298.0,101.0,7.69]|           0.53|   0.53218180350875|
|[298.0,105.0,8.54]|           0.69| 0.6644395136620707|
| [299.0,97.0,7.66]|           0.38| 0.5144489816982529|
|[299.0,100.0,7.42]|           0.42| 0.4935963082591328|
|[299.0,100.0,7.88]|           

24/06/10 18:15:32 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/06/10 18:15:32 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [18]:
# Score the test-set predictions with the R2 metric.
# RegressionEvaluator is imported in the notebook's first cell.
evaluator = RegressionEvaluator(
    metricName='r2',
    labelCol='Chance of Admit',
    predictionCol='prediction',
)

test_r2 = evaluator.evaluate(predictions)
print('R2 on test data: ', test_r2)

R2 on test data:  0.8657211555386073


In [62]:
# Save the fitted model to the "LRmodel" directory.
# write().overwrite() replaces an existing directory; a plain save() raises an
# error when the path already exists, which breaks re-running the notebook.
model.write().overwrite().save('LRmodel')

In [15]:
# Load the saved model back from the "LRmodel" directory (this rebinds `model`).
# LinearRegressionModel is imported in the notebook's first cell.

model = LinearRegressionModel.load("LRmodel")