# TASK 1 : Install Dependencies & Run a SparkSession


In [1]:
#install pyspark
! pip install pyspark



In [2]:
#create a sparksession
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("graduate").getOrCreate()

# TASK 2 : Clone & Explore dataset

In [3]:
#clone the dataset
! git clone https://github.com/education454/admission_dataset

fatal: destination path 'admission_dataset' already exists and is not an empty directory.


In [5]:
#create a spark dataframe
file = 'Admission_Predict_Ver1.1.csv'
df = spark.read.csv(file, header=True, inferSchema=True)

In [6]:
#display dataframe
df.show()

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|        7|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|        8|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|        9

In [7]:
#get the no.of rows & columns
df.count(), len(df.columns)

(500, 9)

In [8]:
#print schema 
df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [9]:
#get the summary statistics
df.describe().show()

+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|        Serial No|         GRE Score|      TOEFL Score|University Rating|               SOP|               LOR|              CGPA|          Research|    Chance of Admit|
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

# TASK 3 : Data Cleaning

In [10]:
#drop the unnecessary column
df = df.drop('Serial No.')

In [11]:
#display the dataframe
df.show()

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|        7|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|        8|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|        9

In [12]:
#check for null values
for i in df.columns:
    print(i+":" ,df[df[i].isNull()].count())

Serial No: 0
GRE Score: 0
TOEFL Score: 0
University Rating: 0
SOP: 0
LOR: 0
CGPA: 0
Research: 0
Chance of Admit: 0


# TASK 4 : Correlation Analysis & Feature Selection

In [13]:
# correlation analysis
for col in df.columns:
    print('Correlation of chance to admit col for {} is {}'.format(col,df.stat.corr('Chance of Admit', col)))

Correlation of chance to admit col for Serial No is 0.00850504936113174
Correlation of chance to admit col for GRE Score is 0.8103506354632598
Correlation of chance to admit col for TOEFL Score is 0.7922276143050823
Correlation of chance to admit col for University Rating is 0.6901323687886892
Correlation of chance to admit col for SOP is 0.6841365241316723
Correlation of chance to admit col for LOR is 0.6453645135280112
Correlation of chance to admit col for CGPA is 0.882412574904574
Correlation of chance to admit col for Research is 0.5458710294711379
Correlation of chance to admit col for Chance of Admit is 1.0


In [14]:
# feature selection
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['GRE Score', 'TOEFL Score', 'CGPA'], outputCol='features')

In [16]:
#display dataframe
output_data=assembler.transform(df)
output_data.show()

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|          features|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,118.0,9.65]|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,107.0,8.87]|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,104.0,8.0]|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,110.0,8.67]|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,103.0,8.21]|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,115.0,9.34]|
|        7|      32

# TASK 5 : Build the Linear Regression Model

In [17]:
#import Linearregression and create final data
from pyspark.ml.regression import LinearRegression
final_df = output_data.select('features','Chance of Admit')

In [18]:
#print schema of final data
final_df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [19]:
#split the dataset into training and testing set
train, test = final_df.randomSplit([0.7,0.3])

In [24]:
#build & train the model
models = LinearRegression(featuresCol='features', labelCol='Chance of Admit')
model = models.fit(train)

In [25]:
#get coefficients & intercept
print("Coefficients:",model.coefficients)
print("Intercept:",model.intercept)

Coefficients: [0.0020456484394370082,0.003559155058733461,0.1476993697184929]
Intercept: -1.571734159696103


In [28]:
#get summary of the model
summary = model.summary
summary

<pyspark.ml.regression.LinearRegressionTrainingSummary at 0x20f93fd7070>

In [27]:
#print the rmse & r2 score
print("RMSE", summary.rootMeanSquaredError)
print("r2 score:",summary.r2)

RMSE 0.05913735430017819
r2 score: 0.8229184987147455


# TASK 6 : Evaluate & Save the Model

In [29]:
#transform on the test data
predictions = model.transform(test)

In [30]:
#display the predictions
predictions.show()

+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
|[290.0,104.0,7.46]|           0.45| 0.4934933119488665|
|  [293.0,97.0,7.8]|           0.64| 0.5249339575603307|
| [295.0,96.0,7.34]|           0.47|0.45752438930996475|
| [296.0,99.0,8.03]|           0.61| 0.5721600680313619|
|[296.0,101.0,7.68]|            0.6| 0.5275835987473567|
| [297.0,96.0,7.43]|           0.34|  0.474908629463503|
| [297.0,99.0,7.81]|           0.54| 0.5417118551327305|
|[297.0,101.0,7.67]|           0.57| 0.5281522534896086|
| [298.0,92.0,7.88]|           0.51|  0.529182374041328|
| [298.0,97.0,7.21]|           0.45|0.44801957162360506|
| [298.0,98.0,8.03]|           0.34| 0.5726922098515026|
|[298.0,100.0,7.95]|           0.58| 0.5679945703914904|
| [299.0,97.0,7.66]|           0.38|  0.516529936436364|
|[299.0,100.0,7.89]|           0.59| 0.5611782566478174|
| [301.0,96.0,7.56]|           

In [31]:
#evaluate the model
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Chance of Admit', metricName='r2')
print("r2 on the test data",evaluator.evaluate(predictions)) 

r2 on the test data 0.7550065757897368


In [None]:
#save the model
model.save("model")

In [None]:
#load the model
from pyspark.ml.regression import LinearRegressionModel
model = LinearRegressionModel.load('model')