In [3]:
import pandas as pd
import numpy as np
import pyspark.pandas as ps
from pyspark.sql import SparkSession
import warnings
warnings.filterwarnings('ignore')

In [4]:
#Create a Spark Session
spark= SparkSession.builder.appName("ml_project").getOrCreate()

In [5]:
spark

In [9]:
#Clone Dataset from Git repository
! git clone https://github.com/education454/admission_dataset

Cloning into 'admission_dataset'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
Unpacking objects: 100% (3/3), done.
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K


In [10]:
#Read the csv file
df = spark.read.csv('admission_dataset/Admission_Predict_Ver1.1.csv', header=True, inferSchema=True)

                                                                                

In [11]:
df.show()

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|        7|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|        8|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|        9

In [12]:
#Shape of the dataset along with columns
shape = (df.count(), len(df.columns))

print(shape)

(500, 9)


In [13]:
# Print Schema 
df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [14]:
df.describe().show()

+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|        Serial No|         GRE Score|      TOEFL Score|University Rating|               SOP|               LOR|              CGPA|          Research|    Chance of Admit|
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

In [15]:
# Drop Column Serial No from dataset
df= df.drop('Serial No')

In [17]:
df.show(5)

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
+---------+-----------+-----------------+---+---+----+--------+---------------+
only showing top 5 rows



In [18]:
#Print Count of  Null values for the GRE Score 
print(df[df['GRE Score'].isNull()].count())

0


In [20]:
# Print Null values for all columns
for col in df.columns:
    print(col + ":", df[df[col].isNull()].count())

GRE Score: 0
TOEFL Score: 0
University Rating: 0
SOP: 0
LOR: 0
CGPA: 0
Research: 0
Chance of Admit: 0


In [21]:
# Check the Correlation between GRE Score and Chance of Admit
print(df.stat.corr('GRE Score', 'Chance of Admit'))

0.8103506354632601


Check the Correlation with the target variables 

In [23]:
for col in df.columns:
    print(f"{col} is {round(df.stat.corr(col, 'Chance of Admit'),3)} with target variable Chance of Admit")

GRE Score is 0.81 with target variable Chance of Admit
TOEFL Score is 0.792 with target variable Chance of Admit
University Rating is 0.69 with target variable Chance of Admit
SOP is 0.684 with target variable Chance of Admit
LOR is 0.645 with target variable Chance of Admit
CGPA is 0.882 with target variable Chance of Admit
Research is 0.546 with target variable Chance of Admit
Chance of Admit is 1.0 with target variable Chance of Admit


We can see that GRE, TOEFL and CGPA have a high correlation with the target variable.

Hence, The chances of getting admitted with the high scores in these variables would be high

In [24]:
#Import Vector Assembler
#Vector Assembler = A feature transformer that merges multiple columns into a vector column.

from pyspark.ml.feature import VectorAssembler 
assembler = VectorAssembler(inputCols=['GRE Score','TOEFL Score', 'CGPA'],outputCol='features' )

In [25]:
output_data = assembler.transform(df)
output_data.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|          features|
+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,118.0,9.65]|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,107.0,8.87]|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,104.0,8.0]|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,110.0,8.67]|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,103.0,8.21]|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,115.0,9.34]|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75| [321.0,109.0,8.2]|
|      308

In [26]:
#Linear Regression Model
from pyspark.ml.regression import LinearRegression
final_data = output_data.select('features', 'Chance of Admit')

In [27]:
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [28]:
# Split the dataset in train and test with 70:30
train, test = final_data.randomSplit([0.7,0.3])

In [30]:
#Fit Linear Regression on the model
import warnings
warnings.filterwarnings('ignore')
models = LinearRegression(featuresCol= 'features', labelCol='Chance of Admit')
model= models.fit(train)

22/04/18 07:06:27 WARN Instrumentation: [ccd100d1] regParam is zero, which might cause numerical instability and overfitting.


In [31]:
# Print Model Coefficients and Intercept
print('Coefficients:', model.coefficients)
print('Intercept:', model.intercept)

Coefficients: [0.0018758928779171247,0.003241792355587123,0.1485568077070551]
Intercept: -1.4965862078755756


In [32]:
summary = model.summary

In [33]:
#Print RMSE and R2 
print('RMSE :', summary.rootMeanSquaredError)
print('r2 :', summary.r2)

RMSE : 0.06287960259466743
r2 : 0.8001852184369991


In [34]:
#Evaluate & Save the Model
predictions= model.transform(test)
predictions.show()

+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
|  [293.0,97.0,7.8]|           0.64| 0.5262473639611227|
| [295.0,99.0,7.57]|           0.37| 0.5023146686555084|
| [296.0,95.0,7.54]|           0.44| 0.4867666878798653|
|  [296.0,97.0,7.8]|           0.49| 0.5318750425948742|
| [296.0,99.0,8.03]|           0.61| 0.5725266930786708|
| [297.0,98.0,7.67]|           0.59| 0.5176803428264614|
| [297.0,99.0,7.81]|           0.54|  0.541720088261036|
|[297.0,101.0,7.67]|           0.57| 0.5274057198932227|
| [298.0,97.0,7.21]|           0.45|0.44797831180354564|
|[298.0,101.0,7.86]|           0.54| 0.5575074062354801|
|[298.0,105.0,8.54]|           0.69|  0.671493204898626|
| [299.0,97.0,7.66]|           0.38| 0.5167047681496375|
|[299.0,100.0,7.42]|           0.42|0.49077651136670575|
|[299.0,100.0,7.89]|           0.59| 0.5605982109890215|
| [300.0,98.0,8.02]|           

In [35]:
#Import Regression Evaluator
#Regression Evaluator = Evaluator for Regression, which expects input columns prediction, label and an optional weight column.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Chance of Admit', metricName='r2')
print('r2 score:', evaluator.evaluate(predictions))

r2 score: 0.8102661457916644


Our r2 has increased the score from 0.800 to 0.810

In [36]:
#Save Model
model.save('model')

                                                                                

In [37]:
from pyspark.ml.regression import LinearRegressionModel

model_new= LinearRegressionModel.load('model')

In [38]:
model_new

LinearRegressionModel: uid=LinearRegression_18c8b037fada, numFeatures=3

In [40]:
# Print Model Coefficients and Intercept
print('Coefficients:', model_new.coefficients)
print('Intercept:', model_new.intercept)

Coefficients: [0.0018758928779171247,0.003241792355587123,0.1485568077070551]
Intercept: -1.4965862078755756


In [42]:
summary1 = model.summary

In [43]:
#Print RMSE and R2 
print('RMSE :', summary1.rootMeanSquaredError)
print('r2 :', summary1.r2)

RMSE : 0.06287960259466743
r2 : 0.8001852184369991


In [39]:
predictions= model_new.transform(test)
predictions.show()

+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
|  [293.0,97.0,7.8]|           0.64| 0.5262473639611227|
| [295.0,99.0,7.57]|           0.37| 0.5023146686555084|
| [296.0,95.0,7.54]|           0.44| 0.4867666878798653|
|  [296.0,97.0,7.8]|           0.49| 0.5318750425948742|
| [296.0,99.0,8.03]|           0.61| 0.5725266930786708|
| [297.0,98.0,7.67]|           0.59| 0.5176803428264614|
| [297.0,99.0,7.81]|           0.54|  0.541720088261036|
|[297.0,101.0,7.67]|           0.57| 0.5274057198932227|
| [298.0,97.0,7.21]|           0.45|0.44797831180354564|
|[298.0,101.0,7.86]|           0.54| 0.5575074062354801|
|[298.0,105.0,8.54]|           0.69|  0.671493204898626|
| [299.0,97.0,7.66]|           0.38| 0.5167047681496375|
|[299.0,100.0,7.42]|           0.42|0.49077651136670575|
|[299.0,100.0,7.89]|           0.59| 0.5605982109890215|
| [300.0,98.0,8.02]|           