In [1]:
import os, sys
from pyspark.sql import SparkSession

In [2]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
spark = SparkSession.builder.master('local[*]').appName('app_1').getOrCreate()

In [7]:
df = spark.read.csv('headbrain.csv', header=True)

In [8]:
df.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- Age Range: string (nullable = true)
 |-- Head Size(cm^3): string (nullable = true)
 |-- Brain Weight(grams): string (nullable = true)



In [10]:
df.show(5)

+------+---------+---------------+-------------------+
|Gender|Age Range|Head Size(cm^3)|Brain Weight(grams)|
+------+---------+---------------+-------------------+
|     1|        1|           4512|               1530|
|     1|        1|           3738|               1297|
|     1|        1|           4261|               1335|
|     1|        1|           3777|               1282|
|     1|        1|           4177|               1590|
+------+---------+---------------+-------------------+
only showing top 5 rows



In [11]:
import pyspark.sql.types as tp

In [19]:
schema = tp.StructType([
    tp.StructField(name='Gender', dataType=tp.IntegerType()),
    tp.StructField(name='Age Range', dataType=tp.IntegerType()),
    tp.StructField(name='Head Size(cm^3)', dataType=tp.IntegerType()),
    tp.StructField(name='Brain Weight(grams)', dataType=tp.IntegerType())
])

In [20]:
my_df = spark.read.csv('headbrain.csv', schema=schema, header=True)

In [21]:
my_df.printSchema()

root
 |-- Gender: integer (nullable = true)
 |-- Age Range: integer (nullable = true)
 |-- Head Size(cm^3): integer (nullable = true)
 |-- Brain Weight(grams): integer (nullable = true)



In [22]:
my_df.show(5)

+------+---------+---------------+-------------------+
|Gender|Age Range|Head Size(cm^3)|Brain Weight(grams)|
+------+---------+---------------+-------------------+
|     1|        1|           4512|               1530|
|     1|        1|           3738|               1297|
|     1|        1|           4261|               1335|
|     1|        1|           3777|               1282|
|     1|        1|           4177|               1590|
+------+---------+---------------+-------------------+
only showing top 5 rows



In [23]:
# df.show(5)
new_df = df.drop(*['Gender', 'Age Range'])

In [24]:
new_df.show(5)

+---------------+-------------------+
|Head Size(cm^3)|Brain Weight(grams)|
+---------------+-------------------+
|           4512|               1530|
|           3738|               1297|
|           4261|               1335|
|           3777|               1282|
|           4177|               1590|
+---------------+-------------------+
only showing top 5 rows



In [33]:
new_df = my_df.drop(*['Gender', 'Age Range'])

## Checking shape of data frame 

In [34]:
new_df.count(), len(new_df.columns)

(237, 2)

In [35]:
new_df.show(5)

+---------------+-------------------+
|Head Size(cm^3)|Brain Weight(grams)|
+---------------+-------------------+
|           4512|               1530|
|           3738|               1297|
|           4261|               1335|
|           3777|               1282|
|           4177|               1590|
+---------------+-------------------+
only showing top 5 rows



## Select Individual column 

In [37]:
my_df.select('Head Size(cm^3)').show(5)

+---------------+
|Head Size(cm^3)|
+---------------+
|           4512|
|           3738|
|           4261|
|           3777|
|           4177|
+---------------+
only showing top 5 rows



## Describe DataFrame 

In [38]:
my_df.select('Head Size(cm^3)', 'Brain Weight(grams)').describe().show()

+-------+------------------+-------------------+
|summary|   Head Size(cm^3)|Brain Weight(grams)|
+-------+------------------+-------------------+
|  count|               237|                237|
|   mean|3633.9915611814345|  1282.873417721519|
| stddev| 365.2614224198132| 120.34044578645734|
|    min|              2720|                955|
|    max|              4747|               1635|
+-------+------------------+-------------------+



## Checking Null Values 

In [39]:
import pyspark.sql.functions as fun

In [42]:
null_count = my_df.agg(*[fun.count(fun.when(fun.isnull(col), col)).alias(col) for col in new_df.columns])

In [43]:
null_count.show()

+---------------+-------------------+
|Head Size(cm^3)|Brain Weight(grams)|
+---------------+-------------------+
|              0|                  0|
+---------------+-------------------+



In [44]:
my_df.groupBy('Gender').count().show()

+------+-----+
|Gender|count|
+------+-----+
|     1|  134|
|     2|  103|
+------+-----+



In [45]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [46]:
StringIndexGender = StringIndexer(inputCol='Gender', outputCol='Gender_2')

In [47]:
my_df = StringIndexGender.fit(my_df).transform(my_df)

In [48]:
my_df.show(5)

+------+---------+---------------+-------------------+--------+
|Gender|Age Range|Head Size(cm^3)|Brain Weight(grams)|Gender_2|
+------+---------+---------------+-------------------+--------+
|     1|        1|           4512|               1530|     0.0|
|     1|        1|           3738|               1297|     0.0|
|     1|        1|           4261|               1335|     0.0|
|     1|        1|           3777|               1282|     0.0|
|     1|        1|           4177|               1590|     0.0|
+------+---------+---------------+-------------------+--------+
only showing top 5 rows



In [49]:
my_df.groupBy('Gender_2').count().show()

+--------+-----+
|Gender_2|count|
+--------+-----+
|     0.0|  134|
|     1.0|  103|
+--------+-----+



In [50]:
onehot = OneHotEncoder(inputCols=['Gender_2'], outputCols=['Gender_3'])
my_df = onehot.fit(my_df).transform(my_df)

In [51]:
my_df.show(5)

+------+---------+---------------+-------------------+--------+-------------+
|Gender|Age Range|Head Size(cm^3)|Brain Weight(grams)|Gender_2|     Gender_3|
+------+---------+---------------+-------------------+--------+-------------+
|     1|        1|           4512|               1530|     0.0|(1,[0],[1.0])|
|     1|        1|           3738|               1297|     0.0|(1,[0],[1.0])|
|     1|        1|           4261|               1335|     0.0|(1,[0],[1.0])|
|     1|        1|           3777|               1282|     0.0|(1,[0],[1.0])|
|     1|        1|           4177|               1590|     0.0|(1,[0],[1.0])|
+------+---------+---------------+-------------------+--------+-------------+
only showing top 5 rows



In [54]:
# from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

In [62]:
linear = LinearRegression(featuresCol='Head Size(cm^3)', labelCol='Brain Weight(grams)')
pipeline = Pipeline(stages=[linear])
pipelineModel = pipeline.fit(new_df)
train_model = pipelineModel.transform(new_df)

IllegalArgumentException: requirement failed: Column Head Size(cm^3) must be of type class org.apache.spark.ml.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually class org.apache.spark.sql.types.IntegerType$:int.