<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Project" data-toc-modified-id="Project-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Project</a></span></li><li><span><a href="#Load-the-data" data-toc-modified-id="Load-the-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load the data</a></span></li><li><span><a href="#Create-final-data-using-VectorAssembler" data-toc-modified-id="Create-final-data-using-VectorAssembler-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Create final data using VectorAssembler</a></span></li><li><span><a href="#Modelling" data-toc-modified-id="Modelling-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Modelling</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

# Report the versions of the core libraries this notebook runs against.
print([(lib.__name__, lib.__version__) for lib in (np, pd, pyspark)])

# Create (or reuse) the Spark session named 'tree' and expose its contexts.
spark = SparkSession.builder.appName('tree').getOrCreate()
sc = spark.sparkContext
# Usage hint: spark_df = sqlContext.createDataFrame(pandas_df)
sqlContext = SQLContext(sc)
sc.setLogLevel("INFO")

[('numpy', '1.17.1'), ('pandas', '0.25.1'), ('pyspark', '2.4.4')]


In [2]:
from pyspark.ml.feature import StringIndexer, VectorIndexer,OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Project

> The dog food company first mixes up a
batch of preservative that contains 4
different preservative chemicals (A, B, C, D),
which is then topped up with a "filler"
chemical.

> The food scientists believe one of the
A, B, C, or D preservatives is causing the
problem, but need your help to figure out
which one!

> Use machine learning with a random forest (RF) to find out
which parameter had the most predictive
power, thus finding out which chemical
causes the early spoiling!
Create a model and then work out how
you can decide which chemical is the
problem!

# Load the data

In [3]:
# Shell escape: list the files available in the shared data directory.
!ls ../data/

College.csv              cruise_ship_info.csv   new_customers.csv                  seeds_dataset.csv
ContainsNull.csv         customer_churn.csv     people.json                        seeds_dataset.txt
Ecommerce-Customers.csv  dog_food.csv           sales_info.csv                     titanic.csv
Ecommerce_Customers.csv  fake_customers.csv     sample_kmeans_data.txt             walmart_stock.csv
Meal_Info.csv            hack_data.csv          sample_libsvm_data.txt
appl_stock.csv           movielens_ratings.csv  sample_linear_regression_data.txt


In [5]:
# Shell escape: show the header line and the first data row of the raw CSV.
!head -2 ../data/dog_food.csv

A,B,C,D,Spoiled
4,2,12.0,3,1.0


In [7]:
# Read the dog-food CSV: the first line supplies the column names and
# inferSchema asks Spark to detect the column types from the data.
df = spark.read.csv('../data/dog_food.csv', header=True, inferSchema=True)
print(df.count())
# printSchema() writes the schema to stdout itself and returns None, so it
# must not be wrapped in print() (that emits a stray "None" line).
df.printSchema()

# Preview the first five rows, transposed so each column reads as a row.
df.limit(5).toPandas().T

490
root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)

None


Unnamed: 0,0,1,2,3,4
A,4.0,5.0,6.0,4.0,4.0
B,2.0,6.0,2.0,2.0,2.0
C,12.0,12.0,13.0,12.0,12.0
D,3.0,7.0,6.0,1.0,3.0
Spoiled,1.0,1.0,1.0,1.0,1.0


# Create final data using VectorAssembler

In [8]:
# Display the column names of the loaded DataFrame.
df.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [10]:
# Combine the four preservative columns into one vector column named 'features'.
assembler = VectorAssembler(
    inputCols=list('ABCD'),
    outputCol='features',
)

In [11]:
# Apply the assembler to df; the result carries the assembled 'features' column.
output = assembler.transform(df)

In [12]:
# Print the schema of the assembled DataFrame.
output.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



In [13]:
# Keep only the assembled feature vector and the 'Spoiled' label, then preview the rows.
final_data = output.select('features', 'Spoiled')
final_data.show()

+-------------------+-------+
|           features|Spoiled|
+-------------------+-------+
| [4.0,2.0,12.0,3.0]|    1.0|
| [5.0,6.0,12.0,7.0]|    1.0|
| [6.0,2.0,13.0,6.0]|    1.0|
| [4.0,2.0,12.0,1.0]|    1.0|
| [4.0,2.0,12.0,3.0]|    1.0|
|[10.0,3.0,13.0,9.0]|    1.0|
| [8.0,5.0,14.0,5.0]|    1.0|
| [5.0,8.0,12.0,8.0]|    1.0|
| [6.0,5.0,12.0,9.0]|    1.0|
| [3.0,3.0,12.0,1.0]|    1.0|
| [9.0,8.0,11.0,3.0]|    1.0|
|[1.0,10.0,12.0,3.0]|    1.0|
|[1.0,5.0,13.0,10.0]|    1.0|
|[2.0,10.0,12.0,6.0]|    1.0|
|[1.0,10.0,11.0,4.0]|    1.0|
| [5.0,3.0,12.0,2.0]|    1.0|
| [4.0,9.0,11.0,8.0]|    1.0|
| [5.0,1.0,11.0,1.0]|    1.0|
|[4.0,9.0,12.0,10.0]|    1.0|
| [5.0,8.0,10.0,9.0]|    1.0|
+-------------------+-------+
only showing top 20 rows



# Modelling

In [14]:
# Decision-tree classifier reading the 'features' vector and the 'Spoiled' label column.
# NOTE(review): the project brief in this notebook asks for a random forest (RF),
# while a single decision tree is configured here - confirm this is intended.
clf_dt = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='Spoiled',
)

In [17]:
# Fit the decision tree on the full final_data set (no train/test split is made here).
model_dt = clf_dt.fit(final_data)

In [32]:
# Read the fitted tree's per-feature importances and display them.
sparse_vector = model_dt.featureImportances
sparse_vector

SparseVector(4, {1: 0.0019, 2: 0.9832, 3: 0.0149})

In [31]:
# Tabulate the importances with one row per input feature. The row labels come
# from the assembler's input columns, so they always follow the order of the
# assembled feature vector instead of relying on a hard-coded 'ABCD'.
df_feature = pd.DataFrame(sparse_vector.toArray(),
                          columns=['feature_importance'],
                          index=assembler.getInputCols())

# Ascending sort: the most predictive chemical ends up in the last row.
df_feature.sort_values('feature_importance')

Unnamed: 0,feature_importance
A,0.0
B,0.001911
D,0.014922
C,0.983168
