In [1]:
import shutil

import findspark
findspark.init()  # must run before pyspark is imported
import pyspark
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
import numpy as np
import pandas as pd

In [2]:
# Run Spark in local mode, with one worker thread per available core.
# Alternative setting kept for reference: SparkConf().set("spark.cores.max", "4")
conf = SparkConf().setMaster("local[*]")
# getOrCreate() reuses an already-running context, so re-executing this cell
# on a live kernel does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# Build (or reuse) the SparkSession that backs the DataFrame API.
# An optional setting such as .config("spark.cores.max", "4") could be
# chained onto the builder before getOrCreate().
session_builder = SparkSession.builder.appName("MyApp")
spark = session_builder.getOrCreate()

## Read file as text

In [4]:
# Load the raw CSV as an RDD of text lines (one string per line).
data = sc.textFile("Datasets/Data-1.csv", 3) #3 partitions

In [5]:
# Confirm how many partitions the RDD was split into.
data.getNumPartitions()

3

In [6]:
# Count the text lines in the RDD (the CSV header line is one of them).
data.count()

1600

In [7]:
# Peek at the first 10 raw lines.
data.take(10)

[u'x_1;x_2;x_3;x_4;x_5;x_6;x_7;x_8;x_9;x_10;x_11;y',
 u'7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5',
 u'7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5',
 u'7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;9.8;5',
 u'11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58;9.8;6',
 u'7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5',
 u'7.4;0.66;0;1.8;0.075;13;40;0.9978;3.51;0.56;9.4;5',
 u'7.9;0.6;0.06;1.6;0.069;15;59;0.9964;3.3;0.46;9.4;5',
 u'7.3;0.65;0;1.2;0.065;15;21;0.9946;3.39;0.47;10;7',
 u'7.8;0.58;0.02;2;0.073;9;18;0.9968;3.36;0.57;9.5;7']

In [8]:
# saveAsTextFile() raises if the target directory already exists, so clear any
# output left by a previous run first; this keeps the notebook re-runnable.
# NOTE(review): assumes "saved-data" resolves to the local filesystem
# (master is local[*]) -- confirm if a different default FS is configured.
shutil.rmtree("saved-data", ignore_errors=True)
data.saveAsTextFile("saved-data")

In [9]:
# Reload the saved text; the glob picks up every part-* file in the directory.
data2 = sc.textFile("saved-data/part-*")

In [10]:
# Line count of the reloaded RDD, for comparison with the original RDD.
data2.count()

1600

## Load dataset 1

In [11]:
# Read Data-1 as a semicolon-delimited CSV with a header row, letting Spark
# infer the column types, then redistribute the rows over 3 partitions.
csv_reader_1 = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferschema", "true")
    .option("delimiter", ";")
)
df1 = csv_reader_1.load("Datasets/Data-1.csv").repartition(3)

In [12]:
# Check the partition count of the DataFrame's underlying RDD.
df1.rdd.getNumPartitions()

3

## Explore dataset 1

In [13]:
# Display the first 5 rows.
df1.show(5)

+---+-----+----+---+-----+----+----+-------+----+----+----+---+
|x_1|  x_2| x_3|x_4|  x_5| x_6| x_7|    x_8| x_9|x_10|x_11|  y|
+---+-----+----+---+-----+----+----+-------+----+----+----+---+
|7.3| 0.98|0.05|2.1|0.061|20.0|49.0|0.99705|3.31|0.55| 9.7|  3|
|8.2| 0.31| 0.4|2.2|0.058| 6.0|10.0|0.99536|3.31|0.68|11.2|  7|
|6.8| 0.59| 0.1|1.7|0.063|34.0|53.0| 0.9958|3.41|0.67| 9.7|  5|
|8.6|0.685| 0.1|1.6|0.092| 3.0|12.0|0.99745|3.31|0.65|9.55|  6|
|9.8|  0.5|0.49|2.6| 0.25| 5.0|20.0|  0.999|3.31|0.79|10.7|  6|
+---+-----+----+---+-----+----+----+-------+----+----+----+---+
only showing top 5 rows



In [14]:
# Print the column names and the types inferred by the CSV reader.
df1.printSchema()

root
 |-- x_1: double (nullable = true)
 |-- x_2: double (nullable = true)
 |-- x_3: double (nullable = true)
 |-- x_4: double (nullable = true)
 |-- x_5: double (nullable = true)
 |-- x_6: double (nullable = true)
 |-- x_7: double (nullable = true)
 |-- x_8: double (nullable = true)
 |-- x_9: double (nullable = true)
 |-- x_10: double (nullable = true)
 |-- x_11: double (nullable = true)
 |-- y: integer (nullable = true)



In [15]:
# List the column names.
df1.columns

['x_1',
 'x_2',
 'x_3',
 'x_4',
 'x_5',
 'x_6',
 'x_7',
 'x_8',
 'x_9',
 'x_10',
 'x_11',
 'y']

In [16]:
# Number of columns.
len(df1.columns)

12

In [17]:
# Number of data rows.
df1.count()

1599

In [18]:
# Summary statistics (count, mean, stddev, min, max) for the column y.
df1.describe("y").show()

+-------+------------------+
|summary|                 y|
+-------+------------------+
|  count|              1599|
|   mean|5.6360225140712945|
| stddev|0.8075694397347051|
|    min|                 3|
|    max|                 8|
+-------+------------------+



In [19]:
# Summary statistics for two of the feature columns.
df1.describe("x_1","x_2").show()

+-------+------------------+-------------------+
|summary|               x_1|                x_2|
+-------+------------------+-------------------+
|  count|              1599|               1599|
|   mean| 8.319637273295813| 0.5278205128205123|
| stddev|1.7410963181276953|0.17905970415353545|
|    min|               4.6|               0.12|
|    max|              15.9|               1.58|
+-------+------------------+-------------------+



In [20]:
# Aggregate y in a single select. Note: mean/max/min here are the Spark column
# functions pulled in by `from pyspark.sql.functions import *`, which shadows
# Python's builtin max and min for the rest of the notebook.
df1.select(mean("y"),max("y"),min("y")).show()

+------------------+------+------+
|            avg(y)|max(y)|min(y)|
+------------------+------+------+
|5.6360225140712945|     8|     3|
+------------------+------+------+



In [21]:
# Distinct values of y, in ascending order.
df1.select("y").distinct().sort("y").show()

+---+
|  y|
+---+
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
+---+



In [22]:
# Number of rows per value of y, ordered by y.
df1.groupBy("y").count().sort("y").show()

+---+-----+
|  y|count|
+---+-----+
|  3|   10|
|  4|   53|
|  5|  681|
|  6|  638|
|  7|  199|
|  8|   18|
+---+-----+



In [23]:
# Correlation between y and x_1.
df1.stat.corr("y","x_1")

0.12405164911322429

In [24]:
# Pair every column name (y included) with its correlation against y.
L = [[cn, df1.stat.corr("y", cn)] for cn in df1.columns]
L

[['x_1', 0.12405164911322432],
 ['x_2', -0.3905577802640072],
 ['x_3', 0.22637251431804142],
 ['x_4', 0.013731637340066192],
 ['x_5', -0.12890655993005248],
 ['x_6', -0.050656057244276104],
 ['x_7', -0.18510028892653782],
 ['x_8', -0.1749192277833488],
 ['x_9', -0.05773139120538207],
 ['x_10', 0.25139707906926134],
 ['x_11', 0.4761663240011359],
 ['y', 1.0]]

In [25]:
# Turn the [name, correlation] pairs into a two-column DataFrame.
corr_column_names = ("colname", "correlation")
df_corr = spark.createDataFrame(L).toDF(*corr_column_names)

In [26]:
# Correlations sorted in ascending order (most negative first).
df_corr.sort("correlation").show()

+-------+--------------------+
|colname|         correlation|
+-------+--------------------+
|    x_2| -0.3905577802640072|
|    x_7|-0.18510028892653782|
|    x_8| -0.1749192277833488|
|    x_5|-0.12890655993005248|
|    x_9|-0.05773139120538207|
|    x_6|-0.05065605724427...|
|    x_4|0.013731637340066192|
|    x_1| 0.12405164911322432|
|    x_3| 0.22637251431804142|
|   x_10| 0.25139707906926134|
|   x_11|  0.4761663240011359|
|      y|                 1.0|
+-------+--------------------+



In [27]:
# Rank the columns by the magnitude of their correlation with y.
# The derived column is aliased explicitly so the sort does not depend on
# whatever name Spark auto-generates for abs(...). `abs` here is the Spark
# column function from the star import, not Python's builtin.
abs_corr_name = "abs(correlation)"
df_corr.select(col("colname"), abs(col("correlation")).alias(abs_corr_name))\
    .sort(col(abs_corr_name).desc()).show()

+-------+--------------------+
|colname|    abs(correlation)|
+-------+--------------------+
|      y|                 1.0|
|   x_11|  0.4761663240011359|
|    x_2|  0.3905577802640072|
|   x_10| 0.25139707906926134|
|    x_3| 0.22637251431804142|
|    x_7| 0.18510028892653782|
|    x_8|  0.1749192277833488|
|    x_5| 0.12890655993005248|
|    x_1| 0.12405164911322432|
|    x_9| 0.05773139120538207|
|    x_6|0.050656057244276104|
|    x_4|0.013731637340066192|
+-------+--------------------+



## Load dataset 2

In [28]:
# Read Data-2 with the same CSV settings as Data-1: header row, inferred
# column types, semicolon delimiter.
csv_reader_2 = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferschema", "true")
    .option("delimiter", ";")
)
df2 = csv_reader_2.load("Datasets/Data-2.csv")

## Explore dataset 2

In [29]:
# Display the first 5 rows.
df2.show(5)

+---+----+----+----+-----+----+-----+------+----+----+----+---+
|x_1| x_2| x_3| x_4|  x_5| x_6|  x_7|   x_8| x_9|x_10|x_11|  y|
+---+----+----+----+-----+----+-----+------+----+----+----+---+
|7.0|0.27|0.36|20.7|0.045|45.0|170.0| 1.001| 3.0|0.45| 8.8|  6|
|6.3| 0.3|0.34| 1.6|0.049|14.0|132.0| 0.994| 3.3|0.49| 9.5|  6|
|8.1|0.28| 0.4| 6.9| 0.05|30.0| 97.0|0.9951|3.26|0.44|10.1|  6|
|7.2|0.23|0.32| 8.5|0.058|47.0|186.0|0.9956|3.19| 0.4| 9.9|  6|
|7.2|0.23|0.32| 8.5|0.058|47.0|186.0|0.9956|3.19| 0.4| 9.9|  6|
+---+----+----+----+-----+----+-----+------+----+----+----+---+
only showing top 5 rows



In [30]:
# Print the column names and the types inferred by the CSV reader.
df2.printSchema()

root
 |-- x_1: double (nullable = true)
 |-- x_2: double (nullable = true)
 |-- x_3: double (nullable = true)
 |-- x_4: double (nullable = true)
 |-- x_5: double (nullable = true)
 |-- x_6: double (nullable = true)
 |-- x_7: double (nullable = true)
 |-- x_8: double (nullable = true)
 |-- x_9: double (nullable = true)
 |-- x_10: double (nullable = true)
 |-- x_11: double (nullable = true)
 |-- y: integer (nullable = true)



In [31]:
# Number of columns.
len(df2.columns)

12

In [32]:
# Number of data rows.
df2.count()

4898

In [33]:
# Number of rows per value of y, ordered by y.
df2.groupBy("y").count().sort("y").show()

+---+-----+
|  y|count|
+---+-----+
|  3|   20|
|  4|  163|
|  5| 1457|
|  6| 2198|
|  7|  880|
|  8|  175|
|  9|    5|
+---+-----+



## Concatenate the two datasets

In [34]:
# Stack the rows of both datasets. union() matches columns by position, so it
# requires df1 and df2 to share the same column order (both schemas printed
# earlier list x_1..x_11, y). It replaces unionAll(), deprecated in Spark 2.x.
df = df1.union(df2)
df.show(10)

+---+-----+----+---+-----+----+-----+-------+----+----+----+---+
|x_1|  x_2| x_3|x_4|  x_5| x_6|  x_7|    x_8| x_9|x_10|x_11|  y|
+---+-----+----+---+-----+----+-----+-------+----+----+----+---+
|7.3| 0.98|0.05|2.1|0.061|20.0| 49.0|0.99705|3.31|0.55| 9.7|  3|
|8.2| 0.31| 0.4|2.2|0.058| 6.0| 10.0|0.99536|3.31|0.68|11.2|  7|
|6.8| 0.59| 0.1|1.7|0.063|34.0| 53.0| 0.9958|3.41|0.67| 9.7|  5|
|8.6|0.685| 0.1|1.6|0.092| 3.0| 12.0|0.99745|3.31|0.65|9.55|  6|
|9.8|  0.5|0.49|2.6| 0.25| 5.0| 20.0|  0.999|3.31|0.79|10.7|  6|
|6.8| 0.65|0.02|2.1|0.078| 8.0| 15.0|0.99498|3.35|0.62|10.4|  6|
|9.2| 0.43|0.49|2.4|0.086|23.0|116.0| 0.9976|3.23|0.64| 9.5|  5|
|7.2|  0.6|0.04|2.5|0.076|18.0| 88.0|0.99745|3.53|0.55| 9.5|  5|
|7.9| 0.69|0.21|2.1| 0.08|33.0|141.0| 0.9962|3.25|0.51| 9.9|  5|
|9.8| 0.66|0.39|3.2|0.083|21.0| 59.0| 0.9989|3.37|0.71|11.5|  7|
+---+-----+----+---+-----+----+-----+-------+----+----+----+---+
only showing top 10 rows



In [35]:
# Row count of the combined DataFrame.
df.count()

6497

## Repartition and save the new dataset

In [36]:
# Collapse to a single partition and write the result as CSV with a header
# row, replacing any existing "concatenated-data" output.
(
    df.repartition(1)
    .write
    .format("csv")
    .option("header", "true")
    .mode("overwrite")
    .save("concatenated-data")
)

## Merge the two datasets

In [37]:
# Suffix every df1 column name with "_A".
Columns_1 = [name + "_A" for name in df1.columns]

In [38]:
# Inspect the renamed column list.
Columns_1

['x_1_A',
 'x_2_A',
 'x_3_A',
 'x_4_A',
 'x_5_A',
 'x_6_A',
 'x_7_A',
 'x_8_A',
 'x_9_A',
 'x_10_A',
 'x_11_A',
 'y_A']

In [39]:
# Same data as df1, with the columns renamed to the names in Columns_1.
F1 = df1.toDF(*Columns_1)

In [40]:
# Suffix every df2 column name with "_B".
Columns_2 = [name + "_B" for name in df2.columns]

In [41]:
# Inspect the renamed column list.
Columns_2

['x_1_B',
 'x_2_B',
 'x_3_B',
 'x_4_B',
 'x_5_B',
 'x_6_B',
 'x_7_B',
 'x_8_B',
 'x_9_B',
 'x_10_B',
 'x_11_B',
 'y_B']

In [42]:
# Same data as df2, with the columns renamed to the names in Columns_2.
F2 = df2.toDF(*Columns_2)

In [43]:
# Inner-join the two renamed frames on equal y values. y is not a unique key,
# so each F1 row is paired with every F2 row that has the same y.
same_label = F1.y_A == F2.y_B
dfm = F1.join(F2, same_label)
dfm.show(5)

+-----+-----+-----+-----+-----+-----+-----+-------+-----+------+----------------+---+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+---+
|x_1_A|x_2_A|x_3_A|x_4_A|x_5_A|x_6_A|x_7_A|  x_8_A|x_9_A|x_10_A|          x_11_A|y_A|x_1_B|x_2_B|x_3_B|x_4_B|x_5_B|x_6_B|x_7_B|x_8_B|x_9_B|x_10_B|x_11_B|y_B|
+-----+-----+-----+-----+-----+-----+-----+-------+-----+------+----------------+---+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+---+
| 12.9|  0.5| 0.55|  2.8|0.072|  7.0| 24.0|1.00012| 3.09|  0.68|            10.9|  6|  7.0| 0.27| 0.36| 20.7|0.045| 45.0|170.0|1.001|  3.0|  0.45|   8.8|  6|
|  7.3| 0.67| 0.02|  2.2|0.072| 31.0| 92.0|0.99566| 3.32|  0.68|11.0666666666667|  6|  7.0| 0.27| 0.36| 20.7|0.045| 45.0|170.0|1.001|  3.0|  0.45|   8.8|  6|
|  6.9| 0.36| 0.25|  2.4|0.098|  5.0| 16.0| 0.9964| 3.41|   0.6|            10.1|  6|  7.0| 0.27| 0.36| 20.7|0.045| 45.0|170.0|1.001|  3.0|  0.45|   8.8|  6|
| 11.3| 0.34| 0.45|  2.0|0.082|  6.0| 15.0| 0.9988| 

## Load dataset 3

In [44]:
# Read Data-3 as a comma-delimited CSV with a header row and inferred types.
csv_reader_3 = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferschema", "true")
    .option("delimiter", ",")
)
df3 = csv_reader_3.load("Datasets/Data-3.csv")

In [45]:
# Display the first 30 rows.
df3.show(30)

+---+---+---+---+---+---+---+---+---+-----+
|x_1|x_2|x_3|x_4|x_5|x_6|x_7|x_8|x_9|class|
+---+---+---+---+---+---+---+---+---+-----+
|  5|  1|  1|  1|  2|  1|  3|  1|  1|    0|
|  5|  4|  4|  5|  7| 10|  3|  2|  1|    0|
|  3|  1|  1|  1|  2|  2|  3|  1|  1|    0|
|  6|  8|  8|  1|  3|  4|  3|  7|  1|    0|
|  4|  1|  1|  3|  2|  1|  3|  1|  1|    0|
|  8| 10| 10|  8|  7| 10|  9|  7|  1|    1|
|  1|  1|  1|  1|  2| 10|  3|  1|  1|    0|
|  2|  1|  2|  1|  2|  1|  3|  1|  1|    0|
|  2|  1|  1|  1|  2|  1|  1|  1|  5|    0|
|  4|  2|  1|  1|  2|  1|  2|  1|  1|    0|
|  1|  1|  1|  1|  1|  1|  3|  1|  1|    0|
|  2|  1|  1|  1|  2|  1|  2|  1|  1|    0|
|  5|  3|  3|  3|  2|  3|  4|  4|  1|    1|
|  1|  1|  1|  1|  2|  3|  3|  1|  1|    0|
|  8|  7|  5| 10|  7|  9|  5|  5|  4|    1|
|  7|  4|  6|  4|  6|  1|  4|  3|  1|    1|
|  4|  1|  1|  1|  2|  1|  2|  1|  1|    0|
|  4|  1|  1|  1|  2|  1|  3|  1|  1|    0|
| 10|  7|  7|  6|  4| 10|  4|  1|  2|    1|
|  6|  1|  1|  1|  2|  1|  3|  1

In [46]:
# Print the column names and the types inferred by the CSV reader.
df3.printSchema()

root
 |-- x_1: integer (nullable = true)
 |-- x_2: integer (nullable = true)
 |-- x_3: integer (nullable = true)
 |-- x_4: integer (nullable = true)
 |-- x_5: integer (nullable = true)
 |-- x_6: string (nullable = true)
 |-- x_7: integer (nullable = true)
 |-- x_8: integer (nullable = true)
 |-- x_9: integer (nullable = true)
 |-- class: integer (nullable = true)



In [47]:
# Show a few rows whose x_6 holds the literal "?" string.
df3.filter(col("x_6") == "?").show(5)

+---+---+---+---+---+---+---+---+---+-----+
|x_1|x_2|x_3|x_4|x_5|x_6|x_7|x_8|x_9|class|
+---+---+---+---+---+---+---+---+---+-----+
|  8|  4|  5|  1|  2|  ?|  7|  3|  1|    1|
|  6|  6|  6|  9|  6|  ?|  7|  8|  1|    0|
|  1|  1|  1|  1|  1|  ?|  2|  1|  1|    0|
|  1|  1|  3|  1|  2|  ?|  2|  1|  1|    0|
|  1|  1|  2|  1|  3|  ?|  1|  1|  1|    0|
+---+---+---+---+---+---+---+---+---+-----+
only showing top 5 rows



In [48]:
# Count the rows whose x_6 holds the literal "?" string.
df3.filter(col("x_6") == "?").count()

16

## Cast columns to double

In [49]:
# List the column names.
df3.columns

['x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'class']

In [50]:
# Preview the effect of casting a single column to double.
df3.select(col('x_1').cast('double')).show(5)#.collect()

+---+
|x_1|
+---+
|5.0|
|5.0|
|3.0|
|6.0|
|4.0|
+---+
only showing top 5 rows



In [51]:
# One cast-to-double column expression per df3 column.
CD = [col(name).cast('double') for name in df3.columns]

In [52]:
# Select every column through its cast expression from CD.
df4 = df3.select(*CD)

In [53]:
# Display the first 30 rows of the cast DataFrame.
df4.show(30)

+----+----+----+----+---+----+---+----+---+-----+
| x_1| x_2| x_3| x_4|x_5| x_6|x_7| x_8|x_9|class|
+----+----+----+----+---+----+---+----+---+-----+
| 5.0| 1.0| 1.0| 1.0|2.0| 1.0|3.0| 1.0|1.0|  0.0|
| 5.0| 4.0| 4.0| 5.0|7.0|10.0|3.0| 2.0|1.0|  0.0|
| 3.0| 1.0| 1.0| 1.0|2.0| 2.0|3.0| 1.0|1.0|  0.0|
| 6.0| 8.0| 8.0| 1.0|3.0| 4.0|3.0| 7.0|1.0|  0.0|
| 4.0| 1.0| 1.0| 3.0|2.0| 1.0|3.0| 1.0|1.0|  0.0|
| 8.0|10.0|10.0| 8.0|7.0|10.0|9.0| 7.0|1.0|  1.0|
| 1.0| 1.0| 1.0| 1.0|2.0|10.0|3.0| 1.0|1.0|  0.0|
| 2.0| 1.0| 2.0| 1.0|2.0| 1.0|3.0| 1.0|1.0|  0.0|
| 2.0| 1.0| 1.0| 1.0|2.0| 1.0|1.0| 1.0|5.0|  0.0|
| 4.0| 2.0| 1.0| 1.0|2.0| 1.0|2.0| 1.0|1.0|  0.0|
| 1.0| 1.0| 1.0| 1.0|1.0| 1.0|3.0| 1.0|1.0|  0.0|
| 2.0| 1.0| 1.0| 1.0|2.0| 1.0|2.0| 1.0|1.0|  0.0|
| 5.0| 3.0| 3.0| 3.0|2.0| 3.0|4.0| 4.0|1.0|  1.0|
| 1.0| 1.0| 1.0| 1.0|2.0| 3.0|3.0| 1.0|1.0|  0.0|
| 8.0| 7.0| 5.0|10.0|7.0| 9.0|5.0| 5.0|4.0|  1.0|
| 7.0| 4.0| 6.0| 4.0|6.0| 1.0|4.0| 3.0|1.0|  1.0|
| 4.0| 1.0| 1.0| 1.0|2.0| 1.0|2.0| 1.0|1.0|  0.0|


In [54]:
# Print the schema of the cast DataFrame.
df4.printSchema()

root
 |-- x_1: double (nullable = true)
 |-- x_2: double (nullable = true)
 |-- x_3: double (nullable = true)
 |-- x_4: double (nullable = true)
 |-- x_5: double (nullable = true)
 |-- x_6: double (nullable = true)
 |-- x_7: double (nullable = true)
 |-- x_8: double (nullable = true)
 |-- x_9: double (nullable = true)
 |-- class: double (nullable = true)



## Replace nulls (missing values) with the mean

In [55]:
# Count the rows where x_6 is null in the cast DataFrame.
df4.filter(col("x_6").isNull()).count()

16

In [56]:
# Mean of x_6, pulled out of the single-row aggregate result as a Python float.
mean_row = df4.select(mean("x_6")).first()
M = mean_row[0]
M

3.5446559297218156

In [57]:
# Impute only x_6 with its own mean. Without `subset`, na.fill(M) would write
# the x_6 mean into the missing values of every numeric column.
df5 = df4.na.fill(M, subset=["x_6"])

In [58]:
# Verify that no nulls remain in x_6 after the fill.
df5.filter(col("x_6").isNull()).count()

0

In [59]:
# Shut down the SparkContext now that the notebook is finished.
sc.stop()