# TASK 1 : Install Dependencies & Run a SparkSession


In [1]:
#install pyspark
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=7b3d7e0df0395560194179b93b4a84c35497d6792c0989faa164699edab6086b
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
# Build the SparkSession that the later cells use through the name `spark`.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('spark')
    .getOrCreate()  # hands back the existing session if one is already running
)

# TASK 2 : Clone & Explore dataset

In [3]:
#clone the dataset
! git clone https://github.com/education454/admission_dataset

Cloning into 'admission_dataset'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (3/3), 5.60 KiB | 5.60 MiB/s, done.


In [4]:
# List the cloned repository directory to confirm the CSV file is present.
! ls admission_dataset

Admission_Predict_Ver1.1.csv


In [6]:
# Load the admissions CSV into a Spark DataFrame.
# The path is resolved from the kernel's working directory -- the same place the
# clone cell writes to -- instead of being hard-coded to /content.
import os

csv_path = os.path.abspath(os.path.join('admission_dataset', 'Admission_Predict_Ver1.1.csv'))
# header=True: the first line holds the column names; inferSchema=True: let Spark detect the column types.
df = spark.read.csv(csv_path, header=True, inferSchema=True)

In [8]:
# Preview the data: the first 20 rows, with long cell values truncated.
df.show(n=20, truncate=True)

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|        7|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|        8|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|        9

In [9]:
# Report the dataset size as "<rows> <columns>".
row_count = df.count()
column_count = len(df.columns)
print(row_count, column_count)

500 9


In [10]:
# Print the schema Spark inferred while reading the CSV: one line per column with its type.
df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [11]:
# Summary statistics per column (count, mean, stddev, ...), shown as a table.
summary_stats = df.describe()
summary_stats.show()

+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|        Serial No|         GRE Score|      TOEFL Score|University Rating|               SOP|               LOR|              CGPA|          Research|    Chance of Admit|
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

# TASK 3 : Data Cleaning

In [12]:
# 'Serial No' is not used by any of the later cells shown here, so remove it.
columns_to_drop = ['Serial No']
df = df.drop(*columns_to_drop)

In [13]:
# Preview the DataFrame again now that 'Serial No' has been removed.
df.show(n=20, truncate=True)

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|      302|        102|                1|2.0|1.5| 8.0|       0|            0.5|
|      323|        108|                3

In [16]:
# Count the rows holding a missing value, one column at a time.

for column_name in df.columns:
    rows_with_null = df.filter(df[column_name].isNull())
    print( column_name , " : " , rows_with_null.count() )

GRE Score  :  0
TOEFL Score  :  0
University Rating  :  0
SOP  :  0
LOR  :  0
CGPA  :  0
Research  :  0
Chance of Admit  :  0


# TASK 4 : Correlation Analysis & Feature Selection

In [19]:
# correlation analysis
# Print the correlation between the label, "Chance of Admit", and each column.
# df.columns still contains the label itself, so the final line is its self-correlation.
for column_name in df.columns:
    print( 'Correlation between "Chance of Admit" column and {} column: '.format(column_name), df.stat.corr('Chance of Admit', column_name) )

Correlation bwtween "Chance of Admit" column to GRE Score column:  0.8103506354632598
Correlation bwtween "Chance of Admit" column to TOEFL Score column:  0.7922276143050823
Correlation bwtween "Chance of Admit" column to University Rating column:  0.6901323687886892
Correlation bwtween "Chance of Admit" column to SOP column:  0.6841365241316723
Correlation bwtween "Chance of Admit" column to LOR column:  0.6453645135280112
Correlation bwtween "Chance of Admit" column to CGPA column:  0.882412574904574
Correlation bwtween "Chance of Admit" column to Research column:  0.5458710294711379
Correlation bwtween "Chance of Admit" column to Chance of Admit column:  1.0


In [20]:
 # Feature selection: pack the chosen predictor columns into a single vector column.
 from pyspark.ml.feature import VectorAssembler

 assembler = VectorAssembler(
     outputCol='features',
     inputCols=['GRE Score', 'TOEFL Score', 'CGPA'],
 )

In [21]:
# Run the assembler to append the 'features' vector column, then preview the result.
output_data = assembler.transform(df)
output_data.show(n=20, truncate=True)

+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|          features|
+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,118.0,9.65]|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,107.0,8.87]|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,104.0,8.0]|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,110.0,8.67]|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,103.0,8.21]|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,115.0,9.34]|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75| [321.0,109.0,8.2]|
|      308

# TASK 5 : Build the Linear Regression Model

In [22]:
# Import the estimator and keep only the two columns it needs: the feature vector and the label.

from pyspark.ml.regression import LinearRegression

final_data = output_data.select('features', 'Chance of Admit')


In [23]:
# Print the schema of the modelling frame to confirm it holds just the
# 'features' vector and the 'Chance of Admit' label.
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [24]:
# Split the data into roughly 80% training and 20% test rows.
# A fixed seed keeps the split -- and every metric computed from it -- stable across
# kernel restarts; without it each run trains and evaluates on different rows.
train, test = final_data.randomSplit([0.8, 0.2], seed=42)

In [25]:
# Configure the linear regression estimator, then fit it on the training split.
models = LinearRegression(
    labelCol='Chance of Admit',
    featuresCol='features',
)
model = models.fit(train)

In [27]:
# Show the fitted parameters: the coefficient vector and the intercept.

fitted_parameters = (
    ('coefficients: ', model.coefficients),
    ('intercept: ', model.intercept),
)
for label, value in fitted_parameters:
    print(label, value)


coefficients:  [0.0021802975901129835,0.0027259758155214883,0.14842642702057202]
intercept:  -1.5318226293126815


In [30]:
# Grab the summary object attached to the fitted model; the metrics printed in the next cell are read from it.
# NOTE(review): per the PySpark docs this summary describes the training data, not the test split -- confirm.
summary = model.summary

In [32]:
# Report the fit-quality metrics held by the model summary.
for label, value in (
    ('RMSE: ', summary.rootMeanSquaredError),
    ('r2: ', summary.r2),
    ('adjusted r2: ', summary.r2adj),
):
    print(label, value)

RMSE:  0.06273978967690652
r2:  0.7966592219769466
adjusted r2:  0.7950788532358348


# TASK 6 : Evaluate & Save the Model

In [33]:
# Apply the fitted model to the held-out test split; the result is displayed in the
# next cell with a 'prediction' column alongside the features and the label.
prediction = model.transform(test)

In [35]:
# Preview the test-set predictions next to the true label (first 20 rows).
prediction.show(n=20, truncate=True)

+------------------+---------------+------------------+
|          features|Chance of Admit|        prediction|
+------------------+---------------+------------------+
| [295.0,99.0,7.57]|           0.37|0.5048248180530064|
| [297.0,100.0,7.9]|           0.52|0.5608921099655424|
| [298.0,98.0,8.03]|           0.34|0.5769158914372867|
|[298.0,101.0,7.86]|           0.54| 0.559861326290354|
|[299.0,100.0,7.88]|           0.51| 0.562284176605357|
|[299.0,100.0,7.88]|           0.68| 0.562284176605357|
|[299.0,100.0,7.89]|           0.59|0.5637684408755628|
|[299.0,102.0,8.62]|           0.56|0.6775716842316228|
|[300.0,101.0,7.88]|           0.59|0.5671904500109912|
|[301.0,106.0,8.47]|           0.57|0.6705722186208494|
|[301.0,107.0,8.34]|           0.62|0.6540027589236965|
| [302.0,99.0,7.25]|           0.57| 0.472590444537214|
| [303.0,98.0,7.65]|           0.56|0.5314153371200341|
| [303.0,99.0,7.66]|           0.36|0.5356255772057614|
|[303.0,105.0,8.65]|           0.77|0.6989235948

In [36]:
# Evaluate the model: r2 computed from the test-set predictions.
from pyspark.ml.evaluation import RegressionEvaluator

# Named `evaluator` rather than `eval` so Python's built-in eval() is not shadowed
# for the rest of the notebook session.
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Chance of Admit', metricName='r2')
print('r2: ', evaluator.evaluate(prediction))

r2:  0.8259803450683699


In [37]:
# Save the fitted model to disk.
# Going through write().overwrite() lets the cell be re-run: a plain save() raises
# when the target path already exists.
model.write().overwrite().save('linear_regression_model')

In [38]:
# Load the saved model back from the path written by the save cell.
# NOTE(review): this rebinds `model`, replacing the instance returned by fit() in the training cell.

from pyspark.ml.regression import LinearRegressionModel
model = LinearRegressionModel.load('linear_regression_model')