In [1]:
import warnings

import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SparkSession, SQLContext
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings — confirm that is intended.
warnings.filterwarnings(action='ignore') 
# Apply matplotlib's built-in 'ggplot' style to any figures drawn later.
plt.style.use('ggplot')

In [2]:
# Reuse the running SparkContext when this cell is executed again; calling
# SparkContext('local') a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once". On a fresh kernel this still
# starts a context with master 'local', exactly as before.
sc = SparkContext.getOrCreate(SparkConf().setMaster('local'))

In [3]:
# SQLContext is deprecated in favour of SparkSession. The builder attaches to
# the SparkContext that is already active instead of starting a second one.
# The name `sqlCtx` is kept because later cells call `sqlCtx.read` and
# `sqlCtx.sql`, both of which SparkSession provides.
sqlCtx = SparkSession.builder.getOrCreate()

In [4]:
# Load the concrete dataset into a Spark DataFrame.
#   header      -> use the first CSV row as the column names
#   inferSchema -> let Spark work out each column's data type automatically
# The original author marks both options as required for this dataset.
df = (
    sqlCtx.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('data/concrete_data.csv')
)
df

DataFrame[Cement: double, Blast Furnace Slag: double, Fly Ash: double, Water: double, Superplasticizer: double, Coarse Aggregate: double, Fine Aggregate: double, Age: double, Strength: double]

    * features
        -Cement
        -Blast Furnace Slag (고로슬래그 : 용광로에서 나온 용철슬래그를 물이나 증기로 급냉하여 
                           유리질의 입상물을 만든 다음 건조시켜 미세한 분말로 분쇄하여 얻은 것)
        -Fly Ash (화력발전소에서 석탄을 원료로 하는 미분탄을 약 1,400℃-1,500℃의 
                 고온으로 소각시켰을 때 발생되는 먼지)
        -Water
        -Superplasticizer (고강도 콘크리트를 만드는 데 사용되는 첨가제)
        -Coarse Aggregate (콘크리트를 만들 때 사용되는 모래, 자갈, 쇄석, 슬래그, 재활용
                           콘크리트 및 토목 인조 골재 등등)
        -Fine Aggregate (Coarse Aggregate보다 고운 입자를 가지는 모래, 수르키(surki), 
                        석재(石材), 탄 점토, 재, 비산회 등등)
        -Age

    * target
        -Strength

In [5]:
# Print the first rows of the DataFrame as a text table.
df.show()

+------+------------------+-------+-----+----------------+----------------+--------------+-----+--------+
|Cement|Blast Furnace Slag|Fly Ash|Water|Superplasticizer|Coarse Aggregate|Fine Aggregate|  Age|Strength|
+------+------------------+-------+-----+----------------+----------------+--------------+-----+--------+
| 540.0|               0.0|    0.0|162.0|             2.5|          1040.0|         676.0| 28.0|   79.99|
| 540.0|               0.0|    0.0|162.0|             2.5|          1055.0|         676.0| 28.0|   61.89|
| 332.5|             142.5|    0.0|228.0|             0.0|           932.0|         594.0|270.0|   40.27|
| 332.5|             142.5|    0.0|228.0|             0.0|           932.0|         594.0|365.0|   41.05|
| 198.6|             132.4|    0.0|192.0|             0.0|           978.4|         825.5|360.0|    44.3|
| 266.0|             114.0|    0.0|228.0|             0.0|           932.0|         670.0| 90.0|   47.03|
| 380.0|              95.0|    0.0|228.0|     

In [6]:
# Print each column's name, data type and nullability as a tree.
df.printSchema()

root
 |-- Cement: double (nullable = true)
 |-- Blast Furnace Slag: double (nullable = true)
 |-- Fly Ash: double (nullable = true)
 |-- Water: double (nullable = true)
 |-- Superplasticizer: double (nullable = true)
 |-- Coarse Aggregate: double (nullable = true)
 |-- Fine Aggregate: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- Strength: double (nullable = true)



#### Spark SQL을 이용한 EDA

In [8]:
# Register the DataFrame as a temporary view named 'my' so the SQL queries in this notebook can select from it.
df.createOrReplaceTempView('my')

In [9]:
# Add a derived WC_ratio column: the water-to-cement ratio (Water / Cement).
sql = (
    "select Cement, Water, Water/Cement as WC_ratio "
    "from my"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------+-----+------------------+
|Cement|Water|          WC_ratio|
+------+-----+------------------+
| 540.0|162.0|               0.3|
| 540.0|162.0|               0.3|
| 332.5|228.0|0.6857142857142857|
| 332.5|228.0|0.6857142857142857|
| 198.6|192.0|0.9667673716012085|
| 266.0|228.0|0.8571428571428571|
| 380.0|228.0|               0.6|
| 380.0|228.0|               0.6|
| 266.0|228.0|0.8571428571428571|
| 475.0|228.0|              0.48|
| 198.6|192.0|0.9667673716012085|
| 198.6|192.0|0.9667673716012085|
| 427.5|228.0|0.5333333333333333|
| 190.0|228.0|               1.2|
| 304.0|228.0|              0.75|
| 380.0|228.0|               0.6|
| 139.6|192.0|1.3753581661891119|
| 342.0|228.0|0.6666666666666666|
| 380.0|228.0|               0.6|
| 475.0|228.0|              0.48|
+------+-----+------------------+
only showing top 20 rows



In [10]:
# Keep only the rows whose Superplasticizer value is exactly 2.5.
sql = (
    "select * from my "
    "where Superplasticizer=2.5"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------+------------------+-------+-----+----------------+----------------+--------------+----+--------+
|Cement|Blast Furnace Slag|Fly Ash|Water|Superplasticizer|Coarse Aggregate|Fine Aggregate| Age|Strength|
+------+------------------+-------+-----+----------------+----------------+--------------+----+--------+
| 540.0|               0.0|    0.0|162.0|             2.5|          1040.0|         676.0|28.0|   79.99|
| 540.0|               0.0|    0.0|162.0|             2.5|          1055.0|         676.0|28.0|   61.89|
+------+------------------+-------+-----+----------------+----------------+--------------+----+--------+



In [13]:
# Rows with at least 600.0 of Fine Aggregate. Column names that contain a
# space must be wrapped in backticks.
sql = (
    "select * from my "
    "where `Fine Aggregate` >= 600.0"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------+------------------+-------+-----+----------------+----------------+--------------+-----+--------+
|Cement|Blast Furnace Slag|Fly Ash|Water|Superplasticizer|Coarse Aggregate|Fine Aggregate|  Age|Strength|
+------+------------------+-------+-----+----------------+----------------+--------------+-----+--------+
| 540.0|               0.0|    0.0|162.0|             2.5|          1040.0|         676.0| 28.0|   79.99|
| 540.0|               0.0|    0.0|162.0|             2.5|          1055.0|         676.0| 28.0|   61.89|
| 198.6|             132.4|    0.0|192.0|             0.0|           978.4|         825.5|360.0|    44.3|
| 266.0|             114.0|    0.0|228.0|             0.0|           932.0|         670.0| 90.0|   47.03|
| 266.0|             114.0|    0.0|228.0|             0.0|           932.0|         670.0| 28.0|   45.85|
| 198.6|             132.4|    0.0|192.0|             0.0|           978.4|         825.5| 90.0|   38.07|
| 198.6|             132.4|    0.0|192.0|     

In [36]:
# Slag, fly ash and coarse aggregate for mixes with more than 1000 of Coarse Aggregate.
sql = (
    "select `Blast Furnace Slag`, `Fly Ash`, `Coarse Aggregate` "
    "from my "
    "where `Coarse Aggregate` > 1000"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------------------+-------+----------------+
|Blast Furnace Slag|Fly Ash|Coarse Aggregate|
+------------------+-------+----------------+
|               0.0|    0.0|          1040.0|
|               0.0|    0.0|          1055.0|
|             209.4|    0.0|          1047.0|
|             209.4|    0.0|          1047.0|
|             209.4|    0.0|          1047.0|
|             209.4|    0.0|          1047.0|
|               0.0|    0.0|          1047.0|
|             209.4|    0.0|          1047.0|
|             209.4|    0.0|          1047.0|
|               0.0|    0.0|          1120.0|
|             262.2|    0.0|          1046.9|
|             151.2|    0.0|          1134.3|
|             200.9|    0.0|          1004.6|
|             262.2|    0.0|          1046.9|
|             151.2|    0.0|          1134.3|
|             200.9|    0.0|          1004.6|
|             262.2|    0.0|          1046.9|
|             151.2|    0.0|          1134.3|
|             200.9|    0.0|      

In [15]:
# The five rows with the smallest Age (ascending sort).
sql = (
    "select * from my "
    "order by Age "
    "limit 5"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------+------------------+-------+-----+----------------+----------------+--------------+---+--------+
|Cement|Blast Furnace Slag|Fly Ash|Water|Superplasticizer|Coarse Aggregate|Fine Aggregate|Age|Strength|
+------+------------------+-------+-----+----------------+----------------+--------------+---+--------+
| 500.0|               0.0|    0.0|200.0|             0.0|          1125.0|         613.0|1.0|   12.64|
| 385.0|               0.0|    0.0|186.0|             0.0|           966.0|         763.0|1.0|    6.27|
| 198.6|             132.4|    0.0|192.0|             0.0|           978.4|         825.5|3.0|    9.13|
| 425.0|             106.3|    0.0|153.5|            16.5|           852.1|         887.1|3.0|    33.4|
| 310.0|               0.0|    0.0|192.0|             0.0|           971.0|         850.6|3.0|    9.87|
+------+------------------+-------+-----+----------------+----------------+--------------+---+--------+



In [16]:
# The five rows with the largest Age (descending sort).
sql = (
    "select * from my "
    "order by Age desc "
    "limit 5"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------+------------------+-------+-----+----------------+----------------+--------------+-----+--------+
|Cement|Blast Furnace Slag|Fly Ash|Water|Superplasticizer|Coarse Aggregate|Fine Aggregate|  Age|Strength|
+------+------------------+-------+-----+----------------+----------------+--------------+-----+--------+
| 342.0|              38.0|    0.0|228.0|             0.0|           932.0|         670.0|365.0|   56.14|
| 190.0|             190.0|    0.0|228.0|             0.0|           932.0|         670.0|365.0|   53.69|
| 380.0|               0.0|    0.0|228.0|             0.0|           932.0|         670.0|365.0|   52.52|
| 332.5|             142.5|    0.0|228.0|             0.0|           932.0|         594.0|365.0|   41.05|
| 304.0|              76.0|    0.0|228.0|             0.0|           932.0|         670.0|365.0|   55.26|
+------+------------------+-------+-----+----------------+----------------+--------------+-----+--------+



In [17]:
# The five rows with the lowest Strength.
sql = (
    "select * from my "
    "order by Strength "
    "limit 5"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------+------------------+-------+-----+----------------+----------------+--------------+---+--------+
|Cement|Blast Furnace Slag|Fly Ash|Water|Superplasticizer|Coarse Aggregate|Fine Aggregate|Age|Strength|
+------+------------------+-------+-----+----------------+----------------+--------------+---+--------+
| 108.3|             162.4|    0.0|203.5|             0.0|           938.2|         849.0|3.0|    2.33|
| 122.6|             183.9|    0.0|203.5|             0.0|           958.2|         800.1|3.0|    3.32|
| 102.0|             153.0|    0.0|192.0|             0.0|           887.0|         942.0|3.0|    4.57|
| 153.0|             102.0|    0.0|192.0|             0.0|           888.0|         943.1|3.0|    4.78|
| 141.3|             212.0|    0.0|203.5|             0.0|           971.8|         748.5|3.0|    4.83|
+------+------------------+-------+-----+----------------+----------------+--------------+---+--------+



In [18]:
# The five rows with the highest Strength.
sql = (
    "select * from my "
    "order by Strength desc "
    "limit 5"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------+------------------+-------+-----+----------------+----------------+--------------+----+--------+
|Cement|Blast Furnace Slag|Fly Ash|Water|Superplasticizer|Coarse Aggregate|Fine Aggregate| Age|Strength|
+------+------------------+-------+-----+----------------+----------------+--------------+----+--------+
| 389.9|             189.0|    0.0|145.9|            22.0|           944.7|         755.8|91.0|    82.6|
| 315.0|             137.0|    0.0|145.0|             5.9|          1130.0|         745.0|28.0|   81.75|
| 323.7|             282.8|    0.0|183.8|            10.3|           942.7|         659.9|56.0|    80.2|
| 540.0|               0.0|    0.0|162.0|             2.5|          1040.0|         676.0|28.0|   79.99|
| 389.9|             189.0|    0.0|145.9|            22.0|           944.7|         755.8|56.0|    79.4|
+------+------------------+-------+-----+----------------+----------------+--------------+----+--------+



In [20]:
# Total and average Cement across the whole dataset.
sql = (
    "select sum(Cement), avg(Cement) "
    "from my"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+-----------------+-----------------+
|      sum(Cement)|      avg(Cement)|
+-----------------+-----------------+
|289602.8999999997|281.1678640776696|
+-----------------+-----------------+



In [21]:
# Largest and smallest Blast Furnace Slag values in the dataset.
sql = (
    "select max(`Blast Furnace Slag`), min(`Blast Furnace Slag`) "
    "from my"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+-----------------------+-----------------------+
|max(Blast Furnace Slag)|min(Blast Furnace Slag)|
+-----------------------+-----------------------+
|                  359.4|                    0.0|
+-----------------------+-----------------------+



In [27]:
# Up to five rows whose Age lies between 100 and 150 (bounds inclusive).
# NOTE(review): there is no ORDER BY, so which five rows come back is
# presumably not guaranteed — confirm if a stable sample matters.
sql = (
    "select * from my "
    "where Age between 100 and 150 "
    "limit 5"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------+------------------+-------+-----+----------------+----------------+--------------+-----+--------+
|Cement|Blast Furnace Slag|Fly Ash|Water|Superplasticizer|Coarse Aggregate|Fine Aggregate|  Age|Strength|
+------+------------------+-------+-----+----------------+----------------+--------------+-----+--------+
| 222.4|               0.0|   96.7|189.3|             4.5|           967.1|         870.3|100.0|   40.71|
| 233.8|               0.0|   94.6|197.9|             4.6|           947.0|         852.2|100.0|   34.56|
| 194.7|               0.0|  100.5|165.6|             7.5|          1006.4|         905.9|100.0|   37.34|
| 190.7|               0.0|  125.4|162.1|             7.8|          1090.0|         804.0|100.0|   40.57|
| 212.1|               0.0|  121.6|180.3|             5.7|          1057.6|         779.3|100.0|   39.61|
+------+------------------+-------+-----+----------------+----------------+--------------+-----+--------+



In [30]:
# Aggregate amounts and Strength for every mix with Strength of 50 or more.
sql = (
    "select `Coarse Aggregate`, `Fine Aggregate`, Strength "
    "from my "
    "where Strength >= 50"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+----------------+--------------+--------+
|Coarse Aggregate|Fine Aggregate|Strength|
+----------------+--------------+--------+
|          1040.0|         676.0|   79.99|
|          1055.0|         676.0|   61.89|
|           932.0|         670.0|   52.91|
|           932.0|         670.0|   56.14|
|           932.0|         670.0|   52.52|
|           932.0|         670.0|    53.3|
|           932.0|         670.0|   52.12|
|           932.0|         670.0|   55.26|
|           932.0|         670.0|   52.91|
|           932.0|         670.0|   53.69|
|           932.0|         670.0|   50.46|
|           932.0|         670.0|    53.1|
|           932.0|         670.0|   50.95|
|           932.0|         670.0|   54.38|
|           932.0|         670.0|   51.73|
|           932.0|         670.0|   50.66|
|           932.0|         670.0|   55.06|
|          1120.0|         800.0|   71.99|
|           852.1|         781.5|    55.6|
|           852.1|         840.5|    54.9|
+----------

In [31]:
# Sample standard deviation of the Fly Ash column.
sql = (
    "select stddev_samp(`Fly Ash`) "
    "from my"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+--------------------+
|stddev_samp(Fly Ash)|
+--------------------+
|  63.997004152687666|
+--------------------+



In [32]:
# Maximum, minimum and mean of the target column, Strength.
sql = (
    "select max(Strength), min(Strength), mean(Strength) "
    "from my"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+-------------+-------------+-----------------+
|max(Strength)|min(Strength)|   mean(Strength)|
+-------------+-------------+-----------------+
|         82.6|         2.33|35.81796116504851|
+-------------+-------------+-----------------+



In [35]:
# Bucket Strength into three labels, exposed as the column `강도` ("strength grade"):
#   Strength >= 60 -> '강함' (strong)
#   Strength >= 30 -> '보통' (normal)
#   otherwise      -> '약함' (weak)
sql = (
    "select Strength, "
    "case when Strength >= 60 then '강함' "
    "when Strength >= 30 then '보통' "
    "else '약함' end as `강도` "
    "from my"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+--------+----+
|Strength|강도|
+--------+----+
|   79.99|강함|
|   61.89|강함|
|   40.27|보통|
|   41.05|보통|
|    44.3|보통|
|   47.03|보통|
|    43.7|보통|
|   36.45|보통|
|   45.85|보통|
|   39.29|보통|
|   38.07|보통|
|   28.02|약함|
|   43.01|보통|
|   42.33|보통|
|   47.81|보통|
|   52.91|보통|
|   39.36|보통|
|   56.14|보통|
|   40.56|보통|
|   42.62|보통|
+--------+----+
only showing top 20 rows



In [39]:
# Subtract 100.0 from every Water value. The expression has no alias, so
# Spark names the result column after the expression itself.
sql = (
    "select Water-100.0 "
    "from my"
)
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+-------------------------------+
|(Water - CAST(100.0 AS DOUBLE))|
+-------------------------------+
|                           62.0|
|                           62.0|
|                          128.0|
|                          128.0|
|                           92.0|
|                          128.0|
|                          128.0|
|                          128.0|
|                          128.0|
|                          128.0|
|                           92.0|
|                           92.0|
|                          128.0|
|                          128.0|
|                          128.0|
|                          128.0|
|                           92.0|
|                          128.0|
|                          128.0|
|                          128.0|
+-------------------------------+
only showing top 20 rows



In [40]:
# Shut down the SparkContext now that the analysis is finished.
sc.stop()