<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-data" data-toc-modified-id="Load-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#Look-at-data-types" data-toc-modified-id="Look-at-data-types-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Look at data types</a></span><ul class="toc-item"><li><span><a href="#categorical-values" data-toc-modified-id="categorical-values-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>categorical values</a></span></li></ul></li><li><span><a href="#Modelling" data-toc-modified-id="Modelling-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Modelling</a></span></li><li><span><a href="#Model-Predictions" data-toc-modified-id="Model-Predictions-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Model Predictions</a></span></li><li><span><a href="#Model-evaluation" data-toc-modified-id="Model-evaluation-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Model evaluation</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

print([(x.__name__,x.__version__) for x in [np, pd, pyspark]])

spark = pyspark.sql.SparkSession.builder.appName('tree').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc) # spark_df = sqlContext.createDataFrame(pandas_df)
sc.setLogLevel("INFO")

[('numpy', '1.17.1'), ('pandas', '0.25.1'), ('pyspark', '2.4.4')]


In [25]:
from pyspark.ml.feature import StringIndexer, VectorIndexer,OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Load data

In [3]:
!head -2 ../data/College.csv

School,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F_Undergrad,P_Undergrad,Outstate,Room_Board,Books,Personal,PhD,Terminal,S_F_Ratio,perc_alumni,Expend,Grad_Rate
Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60


In [4]:
data = spark.read.csv('../data/College.csv', header=True, inferSchema=True)
print(data.count())
print(data.printSchema())
data.limit(5).toPandas().T

777
root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)

None


Unnamed: 0,0,1,2,3,4
School,Abilene Christian University,Adelphi University,Adrian College,Agnes Scott College,Alaska Pacific University
Private,Yes,Yes,Yes,Yes,Yes
Apps,1660,2186,1428,417,193
Accept,1232,1924,1097,349,146
Enroll,721,512,336,137,55
Top10perc,23,16,22,60,16
Top25perc,52,29,50,89,44
F_Undergrad,2885,2683,1036,510,249
P_Undergrad,537,1227,99,63,869
Outstate,7440,12280,11250,12960,7560


# Look at data types

In [5]:
data.dtypes

[('School', 'string'),
 ('Private', 'string'),
 ('Apps', 'int'),
 ('Accept', 'int'),
 ('Enroll', 'int'),
 ('Top10perc', 'int'),
 ('Top25perc', 'int'),
 ('F_Undergrad', 'int'),
 ('P_Undergrad', 'int'),
 ('Outstate', 'int'),
 ('Room_Board', 'int'),
 ('Books', 'int'),
 ('Personal', 'int'),
 ('PhD', 'int'),
 ('Terminal', 'int'),
 ('S_F_Ratio', 'double'),
 ('perc_alumni', 'int'),
 ('Expend', 'int'),
 ('Grad_Rate', 'int')]

In [6]:
numeric_features = [t[0] for t in data.dtypes if t[1] == 'int']
data.select(numeric_features).describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Apps,777,3001.6383526383524,3870.2014844352884,81,48094
Accept,777,2018.8043758043757,2451.11397099263,72,26330
Enroll,777,779.972972972973,929.17619013287,35,6392
Top10perc,777,27.55855855855856,17.640364385452134,1,96
Top25perc,777,55.7966537966538,19.804777595131373,9,100
F_Undergrad,777,3699.907335907336,4850.420530887386,139,31643
P_Undergrad,777,855.2985842985843,1522.431887295513,1,21836
Outstate,777,10440.66924066924,4023.0164841119727,2340,21700
Room_Board,777,4357.526383526383,1096.6964155935289,1780,8124


In [7]:
print(data.columns)

['School', 'Private', 'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F_Undergrad', 'P_Undergrad', 'Outstate', 'Room_Board', 'Books', 'Personal', 'PhD', 'Terminal', 'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate']


In [8]:
input_cols = [ 'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc',
              'F_Undergrad', 'P_Undergrad', 'Outstate', 'Room_Board',
              'Books', 'Personal', 'PhD', 'Terminal', 'S_F_Ratio',
              'perc_alumni', 'Expend', 'Grad_Rate']

assembler = VectorAssembler(inputCols=input_cols, outputCol='features')

In [9]:
output = assembler.transform(data)

## categorical values

In [10]:
categorical_features = [t[0] for t in data.dtypes if t[1] == 'string']
categorical_features

['School', 'Private']

In [11]:
data.select('Private').distinct().show()

+-------+
|Private|
+-------+
|     No|
|    Yes|
+-------+



In [12]:
data.select('School').distinct().show()

+--------------------+
|              School|
+--------------------+
|    Colorado College|
|Fresno Pacific Co...|
| Mount Marty College|
|University of Okl...|
|  Widener University|
|Bethune Cookman C...|
|Marquette University|
| New York University|
|  Norwich University|
|SUNY College  at ...|
|University of Neb...|
|  Lindenwood College|
|Auburn University...|
|   Butler University|
|     Carroll College|
|       Smith College|
|University of Cal...|
|  Adelphi University|
|   Blackburn College|
|  Fordham University|
+--------------------+
only showing top 20 rows



In [13]:
indexer = StringIndexer(inputCol='Private', outputCol='PrivateIndex')
output_fixed = indexer.fit(output).transform(output)
output_fixed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [14]:
final_data = output_fixed.select('features','PrivateIndex')

In [15]:
train_data, test_data = final_data.randomSplit([0.7, 0.3],seed=100)

# Modelling

In [16]:
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier

In [17]:
dtc = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features')
rfc = RandomForestClassifier(labelCol='PrivateIndex', featuresCol='features',numTrees=200)
gbt = GBTClassifier(labelCol='PrivateIndex', featuresCol='features')

In [18]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

# Model Predictions

In [19]:
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

In [20]:
dtc_preds.show(2)

+--------------------+------------+-------------+-----------+----------+
|            features|PrivateIndex|rawPrediction|probability|prediction|
+--------------------+------------+-------------+-----------+----------+
|[152.0,128.0,75.0...|         0.0|  [241.0,0.0]|  [1.0,0.0]|       0.0|
|[191.0,165.0,63.0...|         0.0|  [241.0,0.0]|  [1.0,0.0]|       0.0|
+--------------------+------------+-------------+-----------+----------+
only showing top 2 rows



# Model evaluation

In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [22]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="PrivateIndex",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")

In [23]:
dtc_acc = acc_evaluator.evaluate(dtc_preds)
rfc_acc = acc_evaluator.evaluate(rfc_preds)
gbt_acc = acc_evaluator.evaluate(gbt_preds)

In [24]:
output_df = pd.DataFrame({'DT': dtc_acc, 'RF': rfc_acc, 'GBT': gbt_acc},
                         index=['accuracy']).mul(100).round(2)
output_df

Unnamed: 0,DT,RF,GBT
accuracy,89.52,93.55,90.32
