In [1]:
## Converting String in to Float
## Checking Missing Values 
## Treating Missing Values 
## Statistics
## checking correlation using pearson method
## VectorAssembler
## Standard Scaling
## PCA 

In [2]:
import pyspark
import numpy as np
import pandas as pd
import findspark
findspark.init()
findspark.find()
from pyspark.sql import SparkSession

from pyspark.mllib.feature import StandardScaler,PCA # mllib modules can be only applied on RDDs
from pyspark.mllib.stat import Statistics

from pyspark.ml.feature import VectorAssembler

In [3]:
spark = SparkSession.builder.appName("DataFrame Preprocessing").getOrCreate()

In [4]:
dataset = spark.read.csv("Admission_Prediction.csv",header=True)

In [5]:
dataset.show()

+---------+-----------+-----------------+----+----+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating| SOP| LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+----+----+----+--------+---------------+
|   337.00|     118.00|                4|4.50|4.50|9.65|    1.00|           0.92|
|   324.00|     107.00|                4|4.00|4.50|8.87|    1.00|           0.76|
|     null|     104.00|                3|3.00|3.50|8.00|    1.00|           0.72|
|   322.00|     110.00|                3|3.50|2.50|8.67|    1.00|           0.80|
|   314.00|     103.00|                2|2.00|3.00|8.21|    0.00|           0.65|
|   330.00|     115.00|                5|4.50|3.00|9.34|    1.00|           0.90|
|   321.00|     109.00|             null|3.00|4.00|8.20|    1.00|           0.75|
|   308.00|     101.00|                2|3.00|4.00|7.90|    0.00|           0.68|
|   302.00|     102.00|                1|2.00|1.50|8.00|    0.00|           0.50|
|   323.00|     

In [6]:
dataset.printSchema()

root
 |-- GRE Score: string (nullable = true)
 |-- TOEFL Score: string (nullable = true)
 |-- University Rating: string (nullable = true)
 |-- SOP: string (nullable = true)
 |-- LOR: string (nullable = true)
 |-- CGPA: string (nullable = true)
 |-- Research: string (nullable = true)
 |-- Chance of Admit: string (nullable = true)



## Converting String in to Float

In [7]:
from pyspark.sql.functions import col
new_data = dataset.select(*(col(c).cast("float") for c in dataset.columns))

In [8]:
new_data.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|    337.0|      118.0|              4.0|4.5|4.5|9.65|     1.0|           0.92|
|    324.0|      107.0|              4.0|4.0|4.5|8.87|     1.0|           0.76|
|     null|      104.0|              3.0|3.0|3.5| 8.0|     1.0|           0.72|
|    322.0|      110.0|              3.0|3.5|2.5|8.67|     1.0|            0.8|
|    314.0|      103.0|              2.0|2.0|3.0|8.21|     0.0|           0.65|
|    330.0|      115.0|              5.0|4.5|3.0|9.34|     1.0|            0.9|
|    321.0|      109.0|             null|3.0|4.0| 8.2|     1.0|           0.75|
|    308.0|      101.0|              2.0|3.0|4.0| 7.9|     0.0|           0.68|
|    302.0|      102.0|              1.0|2.0|1.5| 8.0|     0.0|            0.5|
|    323.0|      108.0|              3.0

In [9]:
new_data.printSchema()

root
 |-- GRE Score: float (nullable = true)
 |-- TOEFL Score: float (nullable = true)
 |-- University Rating: float (nullable = true)
 |-- SOP: float (nullable = true)
 |-- LOR: float (nullable = true)
 |-- CGPA: float (nullable = true)
 |-- Research: float (nullable = true)
 |-- Chance of Admit: float (nullable = true)



## Checking Missing Values 

In [10]:
# delete the missing value if any row has one 
#data_without_missing = dataset.dropna(how='any') # if there is any missing value in any row ,it will drop that row
#data_without_missing = dataset.dropna(how='all') # if all rows are missing ,delete that row

In [11]:
from pyspark.sql.functions import col, count, isnan, when
#checking for null ir nan type values in our columns
new_data.select([count(when(col(c).isNull(), c)).alias(c) for c in new_data.columns]).show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|       15|         10|               15|  0|  0|   0|       0|              0|
+---------+-----------+-----------------+---+---+----+--------+---------------+



## Treating Missing Values 

In [12]:
from pyspark.ml.feature import Imputer
imputer = Imputer(inputCols=["GRE Score", "TOEFL Score","University Rating"], 
                  outputCols=["GRE Score", "TOEFL Score","University Rating"])
model = imputer.fit(new_data)

imputed_data = model.transform(new_data)

In [13]:
imputed_data.select([count(when(col(c).isNull(), c)).alias(c) for c in new_data.columns]).show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|        0|          0|                0|  0|  0|   0|       0|              0|
+---------+-----------+-----------------+---+---+----+--------+---------------+



In [14]:
imputed_data.show()


+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|    337.0|      118.0|              4.0|4.5|4.5|9.65|     1.0|           0.92|
|    324.0|      107.0|              4.0|4.0|4.5|8.87|     1.0|           0.76|
|316.55878|      104.0|              3.0|3.0|3.5| 8.0|     1.0|           0.72|
|    322.0|      110.0|              3.0|3.5|2.5|8.67|     1.0|            0.8|
|    314.0|      103.0|              2.0|2.0|3.0|8.21|     0.0|           0.65|
|    330.0|      115.0|              5.0|4.5|3.0|9.34|     1.0|            0.9|
|    321.0|      109.0|        3.1216495|3.0|4.0| 8.2|     1.0|           0.75|
|    308.0|      101.0|              2.0|3.0|4.0| 7.9|     0.0|           0.68|
|    302.0|      102.0|              1.0|2.0|1.5| 8.0|     0.0|            0.5|
|    323.0|      108.0|              3.0

In [15]:
#data_without_missing.show()

In [16]:
imputed_data.count()

500

In [17]:
imputed_data.corr('SOP','Research') # correlation

0.40811584579179017

In [18]:
features = imputed_data.drop('Chance of Admit')
features

DataFrame[GRE Score: float, TOEFL Score: float, University Rating: float, SOP: float, LOR: float, CGPA: float, Research: float]

In [19]:
# we need to convert dataframe intp a RDD to check for correlation
col_names = features.columns
features_rdd = features.rdd

In [20]:
features_rdd.collect()

[Row(GRE Score=337.0, TOEFL Score=118.0, University Rating=4.0, SOP=4.5, LOR=4.5, CGPA=9.649999618530273, Research=1.0),
 Row(GRE Score=324.0, TOEFL Score=107.0, University Rating=4.0, SOP=4.0, LOR=4.5, CGPA=8.869999885559082, Research=1.0),
 Row(GRE Score=316.55877685546875, TOEFL Score=104.0, University Rating=3.0, SOP=3.0, LOR=3.5, CGPA=8.0, Research=1.0),
 Row(GRE Score=322.0, TOEFL Score=110.0, University Rating=3.0, SOP=3.5, LOR=2.5, CGPA=8.670000076293945, Research=1.0),
 Row(GRE Score=314.0, TOEFL Score=103.0, University Rating=2.0, SOP=2.0, LOR=3.0, CGPA=8.210000038146973, Research=0.0),
 Row(GRE Score=330.0, TOEFL Score=115.0, University Rating=5.0, SOP=4.5, LOR=3.0, CGPA=9.34000015258789, Research=1.0),
 Row(GRE Score=321.0, TOEFL Score=109.0, University Rating=3.1216495037078857, SOP=3.0, LOR=4.0, CGPA=8.199999809265137, Research=1.0),
 Row(GRE Score=308.0, TOEFL Score=101.0, University Rating=2.0, SOP=3.0, LOR=4.0, CGPA=7.900000095367432, Research=0.0),
 Row(GRE Score=302.

In [21]:
features_rdd = features.rdd.map(lambda row: row[0:]) # to retrieve the values

In [22]:
features_rdd.collect()


[(337.0, 118.0, 4.0, 4.5, 4.5, 9.649999618530273, 1.0),
 (324.0, 107.0, 4.0, 4.0, 4.5, 8.869999885559082, 1.0),
 (316.55877685546875, 104.0, 3.0, 3.0, 3.5, 8.0, 1.0),
 (322.0, 110.0, 3.0, 3.5, 2.5, 8.670000076293945, 1.0),
 (314.0, 103.0, 2.0, 2.0, 3.0, 8.210000038146973, 0.0),
 (330.0, 115.0, 5.0, 4.5, 3.0, 9.34000015258789, 1.0),
 (321.0, 109.0, 3.1216495037078857, 3.0, 4.0, 8.199999809265137, 1.0),
 (308.0, 101.0, 2.0, 3.0, 4.0, 7.900000095367432, 0.0),
 (302.0, 102.0, 1.0, 2.0, 1.5, 8.0, 0.0),
 (323.0, 108.0, 3.0, 3.5, 3.0, 8.600000381469727, 0.0),
 (325.0, 106.0, 3.0, 3.5, 4.0, 8.399999618530273, 1.0),
 (327.0, 111.0, 4.0, 4.0, 4.5, 9.0, 1.0),
 (316.55877685546875, 112.0, 4.0, 4.0, 4.5, 9.100000381469727, 1.0),
 (307.0, 109.0, 3.0, 4.0, 3.0, 8.0, 1.0),
 (311.0, 104.0, 3.0, 3.5, 2.0, 8.199999809265137, 1.0),
 (314.0, 105.0, 3.0, 3.5, 2.5, 8.300000190734863, 0.0),
 (317.0, 107.0, 3.0, 4.0, 3.0, 8.699999809265137, 0.0),
 (319.0, 106.0, 3.0, 4.0, 3.0, 8.0, 1.0),
 (318.0, 110.0, 3.0, 4

## Statistics

In [23]:
summary = Statistics.colStats(features_rdd)
print(summary.mean())  # a dense vector containing the mean value for each column
print(summary.variance())  # column-wise variance
print(summary.numNonzeros())  # number of nonzeros in each column
print(summary.normL1())# return a column of normL1 summary

[316.55876331 107.18775504   3.12164949   3.374        3.484
   8.57644001   0.56      ]
[123.29774601  36.61869044   1.27419375   0.98208818   0.85645691
   0.3657985    0.24689379]
[500. 500. 500. 500. 500. 500. 280.]
[158279.38165283  53593.8775177    1560.82474256   1687.
   1742.           4288.22000313    280.        ]


# checking correlation using pearson method

In [24]:

corr_mat=Statistics.corr(features_rdd, method="pearson")
corr_df = pd.DataFrame(corr_mat)
corr_df.index, corr_df.columns = col_names, col_names

In [25]:
corr_df.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA',
       'Research'],
      dtype='object')

In [26]:
corr_df.index

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA',
       'Research'],
      dtype='object')

In [27]:
corr_df

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
GRE Score,1.0,0.819663,0.623084,0.608372,0.527623,0.818347,0.544993
TOEFL Score,0.819663,1.0,0.643555,0.643155,0.535785,0.805557,0.464913
University Rating,0.623084,0.643555,1.0,0.712991,0.606291,0.696592,0.424429
SOP,0.608372,0.643155,0.712991,1.0,0.663707,0.712154,0.408116
LOR,0.527623,0.535785,0.606291,0.663707,1.0,0.637469,0.372526
CGPA,0.818347,0.805557,0.696592,0.712154,0.637469,1.0,0.501311
Research,0.544993,0.464913,0.424429,0.408116,0.372526,0.501311,1.0


In [28]:
## spearman rank 

In [29]:
corr_mat=Statistics.corr(features_rdd, method="spearman")
corr_df = pd.DataFrame(corr_mat)
corr_df.index, corr_df.columns = col_names, col_names

In [30]:
corr_df

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
GRE Score,1.0,0.816869,0.626194,0.61394,0.515421,0.820734,0.56394
TOEFL Score,0.816869,1.0,0.633268,0.64397,0.518249,0.804149,0.472988
University Rating,0.626194,0.633268,1.0,0.707774,0.594477,0.68864,0.428541
SOP,0.61394,0.64397,0.707774,1.0,0.662653,0.717384,0.409088
LOR,0.515421,0.518249,0.594477,0.662653,1.0,0.639563,0.376166
CGPA,0.820734,0.804149,0.68864,0.717384,0.639563,1.0,0.509264
Research,0.56394,0.472988,0.428541,0.409088,0.376166,0.509264,1.0


## VectorAssembler

In [31]:
imputed_data.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|    337.0|      118.0|              4.0|4.5|4.5|9.65|     1.0|           0.92|
|    324.0|      107.0|              4.0|4.0|4.5|8.87|     1.0|           0.76|
|316.55878|      104.0|              3.0|3.0|3.5| 8.0|     1.0|           0.72|
|    322.0|      110.0|              3.0|3.5|2.5|8.67|     1.0|            0.8|
|    314.0|      103.0|              2.0|2.0|3.0|8.21|     0.0|           0.65|
|    330.0|      115.0|              5.0|4.5|3.0|9.34|     1.0|            0.9|
|    321.0|      109.0|        3.1216495|3.0|4.0| 8.2|     1.0|           0.75|
|    308.0|      101.0|              2.0|3.0|4.0| 7.9|     0.0|           0.68|
|    302.0|      102.0|              1.0|2.0|1.5| 8.0|     0.0|            0.5|
|    323.0|      108.0|              3.0

In [32]:
type(imputed_data)

pyspark.sql.dataframe.DataFrame

In [33]:
features = imputed_data.drop('Chance of Admit')

In [34]:
assembler = VectorAssembler(inputCols=features.columns,outputCol="features")

In [35]:
output = assembler.transform(imputed_data)

In [36]:
output.select("features", "Chance of Admit").show(truncate=False)

+--------------------------------------------------------------+---------------+
|features                                                      |Chance of Admit|
+--------------------------------------------------------------+---------------+
|[337.0,118.0,4.0,4.5,4.5,9.649999618530273,1.0]               |0.92           |
|[324.0,107.0,4.0,4.0,4.5,8.869999885559082,1.0]               |0.76           |
|[316.55877685546875,104.0,3.0,3.0,3.5,8.0,1.0]                |0.72           |
|[322.0,110.0,3.0,3.5,2.5,8.670000076293945,1.0]               |0.8            |
|[314.0,103.0,2.0,2.0,3.0,8.210000038146973,0.0]               |0.65           |
|[330.0,115.0,5.0,4.5,3.0,9.34000015258789,1.0]                |0.9            |
|[321.0,109.0,3.1216495037078857,3.0,4.0,8.199999809265137,1.0]|0.75           |
|[308.0,101.0,2.0,3.0,4.0,7.900000095367432,0.0]               |0.68           |
|[302.0,102.0,1.0,2.0,1.5,8.0,0.0]                             |0.5            |
|[323.0,108.0,3.0,3.5,3.0,8.

# Standard Scaling

In [37]:
label = imputed_data.select('Chance of Admit')

In [38]:
label.show()

+---------------+
|Chance of Admit|
+---------------+
|           0.92|
|           0.76|
|           0.72|
|            0.8|
|           0.65|
|            0.9|
|           0.75|
|           0.68|
|            0.5|
|           0.45|
|           0.52|
|           0.84|
|           0.78|
|           0.62|
|           0.61|
|           0.54|
|           0.66|
|           0.65|
|           0.63|
|           0.62|
+---------------+
only showing top 20 rows



In [39]:
features = imputed_data.drop('Chance of Admit')

In [40]:
col_names = features.columns
features_rdd = features.rdd.map(lambda row: row[0:])

In [41]:
features_rdd.collect()

[(337.0, 118.0, 4.0, 4.5, 4.5, 9.649999618530273, 1.0),
 (324.0, 107.0, 4.0, 4.0, 4.5, 8.869999885559082, 1.0),
 (316.55877685546875, 104.0, 3.0, 3.0, 3.5, 8.0, 1.0),
 (322.0, 110.0, 3.0, 3.5, 2.5, 8.670000076293945, 1.0),
 (314.0, 103.0, 2.0, 2.0, 3.0, 8.210000038146973, 0.0),
 (330.0, 115.0, 5.0, 4.5, 3.0, 9.34000015258789, 1.0),
 (321.0, 109.0, 3.1216495037078857, 3.0, 4.0, 8.199999809265137, 1.0),
 (308.0, 101.0, 2.0, 3.0, 4.0, 7.900000095367432, 0.0),
 (302.0, 102.0, 1.0, 2.0, 1.5, 8.0, 0.0),
 (323.0, 108.0, 3.0, 3.5, 3.0, 8.600000381469727, 0.0),
 (325.0, 106.0, 3.0, 3.5, 4.0, 8.399999618530273, 1.0),
 (327.0, 111.0, 4.0, 4.0, 4.5, 9.0, 1.0),
 (316.55877685546875, 112.0, 4.0, 4.0, 4.5, 9.100000381469727, 1.0),
 (307.0, 109.0, 3.0, 4.0, 3.0, 8.0, 1.0),
 (311.0, 104.0, 3.0, 3.5, 2.0, 8.199999809265137, 1.0),
 (314.0, 105.0, 3.0, 3.5, 2.5, 8.300000190734863, 0.0),
 (317.0, 107.0, 3.0, 4.0, 3.0, 8.699999809265137, 0.0),
 (319.0, 106.0, 3.0, 4.0, 3.0, 8.0, 1.0),
 (318.0, 110.0, 3.0, 4

In [42]:
scaler1 = StandardScaler().fit(features_rdd)

In [43]:
scaled_features=scaler1.transform(features_rdd)

In [44]:
for data in scaled_features.collect():
    print(data)

[30.349555318793534,19.49982001499076,3.5435801183264024,4.540851219659883,4.8625015634799515,15.955349944926029,2.0125418443661425]
[29.178800959314852,17.68204018308484,3.5435801183264024,4.036312195253229,4.8625015634799515,14.665695106742765,2.0125418443661425]
[28.508659079598416,17.1862820471105,2.6576850887448016,3.027234146439922,3.7819456604844066,13.227233637844293,2.0125418443661425]
[28.99868490401044,18.177798319059182,2.6576850887448016,3.5317731708465754,2.7013897574888617,14.335014581158482,2.0125418443661425]
[28.27822068279279,17.021029335119053,1.7717900591632012,2.0181560976266146,3.2416677089866344,13.57444858391007,0.0]
[29.71914912522809,19.00406187901642,4.429475147908003,4.540851219659883,3.2416677089866344,15.442795524472672,2.0125418443661425]
[28.90862687635823,18.012545607067736,2.765453779430686,3.027234146439922,4.322223611982179,13.557914163428576,2.0125418443661425]
[27.73787251687955,16.690523911136157,1.7717900591632012,3.027234146439922,4.32222361198

## PCA 

In [45]:
pca = PCA(k=3)
model = pca.fit(scaled_features)

In [46]:
result = model.transform(scaled_features)

In [47]:
result.collect()

[DenseVector([-32.1456, 7.3588, -19.6832]),
 DenseVector([-30.2067, 6.9994, -18.1881]),
 DenseVector([-28.0273, 7.7661, -18.1389]),
 DenseVector([-28.9069, 8.3115, -19.4452]),
 DenseVector([-26.5188, 6.9718, -19.7434]),
 DenseVector([-31.2492, 7.5998, -19.7401]),
 DenseVector([-28.889, 7.7125, -18.4905]),
 DenseVector([-26.7185, 5.9865, -18.5943]),
 DenseVector([-24.9673, 7.5756, -20.0127]),
 DenseVector([-28.3698, 6.5547, -20.2198]),
 DenseVector([-29.1277, 7.6111, -18.3983]),
 DenseVector([-30.6713, 7.1478, -18.6459]),
 DenseVector([-30.4291, 6.9257, -18.4024]),
 DenseVector([-28.2128, 7.5254, -18.2616]),
 DenseVector([-27.594, 8.1595, -18.6771]),
 DenseVector([-27.4469, 6.5101, -19.8144]),
 DenseVector([-28.3502, 6.2282, -19.8872]),
 DenseVector([-28.4486, 7.749, -18.4444]),
 DenseVector([-28.6551, 6.3104, -20.1887]),
 DenseVector([-27.1775, 5.9732, -19.0378]),
 DenseVector([-27.4256, 8.4016, -18.9142]),
 DenseVector([-28.4672, 7.1182, -21.156]),
 DenseVector([-32.3034, 6.488, -18.7

In [48]:
type(result)

pyspark.rdd.RDD

In [49]:
#store dense vector in a dataframe

In [50]:
df =result.map(lambda x: (x, )).toDF(["PCA_Features"])

In [51]:
df.show(truncate=False)

+------------------------------------------------------------+
|PCA_Features                                                |
+------------------------------------------------------------+
|[-32.14562159872273,7.358842768599982,-19.683151279391495]  |
|[-30.206707592204925,6.999409087702219,-18.18806626850379]  |
|[-28.027284320328878,7.766088794270562,-18.13886374243249]  |
|[-28.906856296599237,8.311469473077842,-19.44521822446721]  |
|[-26.518753760828194,6.9718238794906675,-19.74342299183181] |
|[-31.2492418167569,7.599840297233542,-19.740064595616744]   |
|[-28.88899790407368,7.7125113689247495,-18.490479969478677] |
|[-26.7184698144026,5.9865102952512945,-18.59431589637633]   |
|[-24.96731221949702,7.575574813240397,-20.012664992679415]  |
|[-28.3697531921725,6.554720479316407,-20.219849597738893]   |
|[-29.12770454007821,7.61109256675221,-18.398338333733804]   |
|[-30.671347379201677,7.1477680934341326,-18.64592179822874] |
|[-30.429117021950262,6.925654582973335,-18.40240295100

In [52]:
type(df)

pyspark.sql.dataframe.DataFrame

In [53]:
label.show()

+---------------+
|Chance of Admit|
+---------------+
|           0.92|
|           0.76|
|           0.72|
|            0.8|
|           0.65|
|            0.9|
|           0.75|
|           0.68|
|            0.5|
|           0.45|
|           0.52|
|           0.84|
|           0.78|
|           0.62|
|           0.61|
|           0.54|
|           0.66|
|           0.65|
|           0.63|
|           0.62|
+---------------+
only showing top 20 rows



In [54]:
type(label)

pyspark.sql.dataframe.DataFrame

In [55]:
new_df = df.join(label)
new_df.show()

+--------------------+---------------+
|        PCA_Features|Chance of Admit|
+--------------------+---------------+
|[-32.145621598722...|           0.92|
|[-32.145621598722...|           0.76|
|[-32.145621598722...|           0.72|
|[-32.145621598722...|            0.8|
|[-32.145621598722...|           0.65|
|[-32.145621598722...|            0.9|
|[-32.145621598722...|           0.75|
|[-32.145621598722...|           0.68|
|[-32.145621598722...|            0.5|
|[-32.145621598722...|           0.45|
|[-32.145621598722...|           0.52|
|[-32.145621598722...|           0.84|
|[-32.145621598722...|           0.78|
|[-32.145621598722...|           0.62|
|[-32.145621598722...|           0.61|
|[-32.145621598722...|           0.54|
|[-32.145621598722...|           0.66|
|[-32.145621598722...|           0.65|
|[-32.145621598722...|           0.63|
|[-32.145621598722...|           0.62|
+--------------------+---------------+
only showing top 20 rows



In [56]:
new_df.select("PCA_Features").show(5)

+--------------------+
|        PCA_Features|
+--------------------+
|[-32.145621598722...|
|[-32.145621598722...|
|[-32.145621598722...|
|[-32.145621598722...|
|[-32.145621598722...|
+--------------------+
only showing top 5 rows



In [57]:
type(new_df)

pyspark.sql.dataframe.DataFrame

In [59]:
new_df.dtypes

[('PCA_Features', 'vector'), ('Chance of Admit', 'float')]

In [None]:
features = new_df.drop('Chance of Admit')
features

In [None]:
assembler = VectorAssembler(
    inputCols=features.columns,
    outputCol="features")

In [None]:
output = assembler.transform(new_df)

In [None]:
output.select("features").show()

In [None]:
training, test = new_df.randomSplit([0.7, 0.3])

In [None]:
from pyspark.ml.regression import RandomForestRegressor

In [None]:
random_forest_reg = RandomForestRegressor(featuresCol="PCA_Features",labelCol="Chance of Admit" )

In [None]:
# Train model.  This also runs the indexer.
model = random_forest_reg.fit(training)