In [63]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline

from pyspark.ml.regression import LinearRegression

In [3]:
# Create (or reuse) the Spark session for this notebook; the app name is 'house'.
ss = (
    SparkSession.builder
    .appName('house')
    .getOrCreate()
)

# Bare last expression so the notebook displays the session.
ss

24/12/12 15:48:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/12 15:48:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# header=True: the first line of each file supplies the column names.
# inferSchema=True: Spark infers each column's type from the data.
train_df = ss.read.csv(
    'data/house/house_train.csv',
    inferSchema=True,
    header=True,
)

test_df = ss.read.csv(
    'data/house/house_test.csv',
    inferSchema=True,
    header=True,
)

                                                                                

In [6]:
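# Label: SalePrice (the value the model predicts)
# Features: chosen in the "Features Selection" section below
#           (numeric columns plus one-hot encoded categorical columns)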

### 전처리 - 결측치 처리
- 결측치(null)는 0으로 채움 (`fillna(0)`은 숫자형 컬럼에만 적용됨)
- 숫자 타입 통일

In [7]:
# Print the inferred schema of the training data (column names, types, nullability).
train_df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- MSZoning: string (nullable = true)
 |-- LotFrontage: string (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- Alley: string (nullable = true)
 |-- LotShape: string (nullable = true)
 |-- LandContour: string (nullable = true)
 |-- Utilities: string (nullable = true)
 |-- LotConfig: string (nullable = true)
 |-- LandSlope: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Condition1: string (nullable = true)
 |-- Condition2: string (nullable = true)
 |-- BldgType: string (nullable = true)
 |-- HouseStyle: string (nullable = true)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- RoofStyle: string (nullable = true)
 |-- RoofMatl: string (nullable = true)
 |-- Exterior1st: string (nullable = true)
 |--

In [11]:
# Replace nulls with 0 in both data sets.
# NOTE(review): a numeric fill value is only applied to numeric columns, so
# string-typed columns are left untouched — confirm that is intended.
train_df, test_df = (frame.fillna(0) for frame in (train_df, test_df))

### encoding 
문자형 >> 숫자로 치환

In [48]:
# Categorical (string) columns that are encoded and used as features.
string_columns = ['Neighborhood', 'GarageType']

# One StringIndexer per categorical column: maps each category to a numeric
# index and writes it to "<column>_index".
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index")
    for name in string_columns
]

In [49]:
# one-hot encoding : 범주형 변수들이 값이 1,2,3,4 -> 컬럼을 쪼개고 각각의 자리에 1로 바꾸는 인코딩, 빈 공간은 0
encoders = [OneHotEncoder(inputCol=col+'_index', outputCol = col + "_encoded") for col in string_columns]

### Features Selection

In [68]:
# Numeric columns passed to the assembler without encoding.
numeric_columns = [
    "LotArea",
    "OverallQual",
    "OverallCond",
    "YearBuilt",
    "YearRemodAdd",
    "1stFlrSF",
    "2ndFlrSF",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
]

In [51]:
# Assembler inputs: the one-hot encoded categorical columns first, followed by
# the numeric columns.
assembler_input = [
    *(f"{name}_encoded" for name in string_columns),
    *numeric_columns,
]

### assembler

In [52]:
# Combine every input column into a single vector column named 'feature'.
assembler = VectorAssembler(inputCols=assembler_input, outputCol='feature')

### Label Selection

In [53]:
# Rename the target column 'SalePrice' to 'label', the name the regression
# model is configured to read.
train_df = train_df.withColumnRenamed('SalePrice', 'label')

### pipeline 설정
StringIndex + OnehotEncoder + Assembler >> 하나의 SparkML Pipeline으로 결합

In [54]:
# Stage order matters: all indexers run first, then all encoders (which read the
# "_index" columns), and finally the assembler (which reads the "_encoded" columns).
pipeline = Pipeline(stages=[*indexers, *encoders, assembler])

### pipeline 실행
pipeline.fit()

In [55]:
# Fit the pipeline stages on the training data; the result is the fitted
# pipeline model reused for both the training and test transforms.
pipeline_model = pipeline.fit(train_df)

In [59]:
# Run the fitted pipeline on the training data to produce the frame the
# regression model is trained on.
train_transformed = pipeline_model.transform(train_df)

### 예측: 회귀 모델 학습 > 평가 > 예측

In [64]:
# Linear regression that reads the assembled 'feature' vector and the 'label' column.
lr = LinearRegression(featuresCol='feature', labelCol='label')

# Bind the fitted model to `lr_model`, the name the prediction cell reads.
# It was previously bound to the misspelled `lr_mode`, which left `lr_model`
# undefined and would raise NameError at prediction time.
lr_model = lr.fit(train_transformed)

24/12/12 16:47:13 WARN Instrumentation: [bc5b24b0] regParam is zero, which might cause numerical instability and overfitting.
24/12/12 16:47:14 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/12/12 16:47:14 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/12/12 16:47:14 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
24/12/12 16:47:14 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


In [65]:
# 평가 데이터를 이용한 평가 > FIT(모델 맞춤, 테스트 데이터에 의해서 규칙이 변화)
test_transformed = pipeline_model.transform( test_df )

prediction = lr_model.transform(test_transformed)

IllegalArgumentException: Data type string of column GarageCars is not supported.
Data type string of column GarageArea is not supported.

In [81]:
# Stop the Spark session created at the top of the notebook.
ss.stop()