In [73]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.regression import LinearRegression, LinearRegressionModel

import pandas as pd

In [6]:
# Create (or reuse) the Spark session named 'house'; the bare name on the last line displays it.
ss = (
    SparkSession
    .builder
    .appName('house')
    .getOrCreate()
)
ss

In [7]:
# Load the training split: first row is the header, column types are inferred.
train_df = (
    ss.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('data/house/house_train.csv')
)

# Load the test split with the same reader options.
test_df = (
    ss.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('data/house/house_test.csv')
)

In [6]:
# sale price : label
# features : ????

## 전처리

### 타입 바꿔주기

In [8]:
# Print the column names and inferred types of the training DataFrame.
train_df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- MSZoning: string (nullable = true)
 |-- LotFrontage: string (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- Alley: string (nullable = true)
 |-- LotShape: string (nullable = true)
 |-- LandContour: string (nullable = true)
 |-- Utilities: string (nullable = true)
 |-- LotConfig: string (nullable = true)
 |-- LandSlope: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Condition1: string (nullable = true)
 |-- Condition2: string (nullable = true)
 |-- BldgType: string (nullable = true)
 |-- HouseStyle: string (nullable = true)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- RoofStyle: string (nullable = true)
 |-- RoofMatl: string (nullable = true)
 |-- Exterior1st: string (nullable = true)
 |--

In [21]:
# Print the column names and inferred types of the test DataFrame.
test_df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- MSZoning: string (nullable = true)
 |-- LotFrontage: string (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- Alley: string (nullable = true)
 |-- LotShape: string (nullable = true)
 |-- LandContour: string (nullable = true)
 |-- Utilities: string (nullable = true)
 |-- LotConfig: string (nullable = true)
 |-- LandSlope: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Condition1: string (nullable = true)
 |-- Condition2: string (nullable = true)
 |-- BldgType: string (nullable = true)
 |-- HouseStyle: string (nullable = true)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- RoofStyle: string (nullable = true)
 |-- RoofMatl: string (nullable = true)
 |-- Exterior1st: string (nullable = true)
 |--

In [25]:
# Cast GarageArea and GarageCars to integer in both the train and test DataFrames.
for garage_col in ('GarageArea', 'GarageCars'):
    train_df = train_df.withColumn(garage_col, train_df[garage_col].cast('integer'))
    test_df = test_df.withColumn(garage_col, test_df[garage_col].cast('integer'))

### 결측치 처리
- 수치형 컬럼의 결측치(null)는 0으로 채움 (`fillna(0)`은 문자형 컬럼에는 적용되지 않음)
- 숫자 타입 통일

In [26]:
# Replace missing values with 0 in both splits.
train_df, test_df = (frame.fillna(0) for frame in (train_df, test_df))

### encoding 
문자형 >> 숫자로 치환

In [10]:
# Categorical (string) columns that must be encoded before modelling.
string_columns = ['Neighborhood', 'GarageType']

# One StringIndexer per categorical column: maps each string value to a numeric
# index and writes it to "<column>_index".
indexers = [
    StringIndexer(inputCol=name, outputCol=f'{name}_index')
    for name in string_columns
]

In [11]:
# one-hot encoding : 범주형 변수들이 값이 1,2,3,4 -> 컬럼을 쪼개고 각각의 자리에 1로 바꾸는 인코딩, 빈 공간은 0
encoders = [OneHotEncoder(inputCol=col+'_index', outputCol = col + "_encoded") for col in string_columns]

### Features Selection

In [12]:
# Numeric columns passed to the assembler unchanged.
numeric_columns = [
    "LotArea",
    "OverallQual",
    "OverallCond",
    "YearBuilt",
    "YearRemodAdd",
    "1stFlrSF",
    "2ndFlrSF",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
]

In [30]:
# Model input columns: the one-hot encoded categorical columns followed by the numeric columns.
assembler_input = [
    *(f'{name}_encoded' for name in string_columns),
    *numeric_columns,
]

### assembler

In [31]:
# Combine all input columns into a single vector column named 'feature'.
assembler = VectorAssembler(inputCols=assembler_input, outputCol='feature')

### Label Selection

In [32]:
# The regression target: expose SalePrice under the column name 'label'.
train_df = train_df.withColumnRenamed('SalePrice', 'label')

### pipeline 설정
StringIndexer + OneHotEncoder + VectorAssembler >> 하나의 Spark ML Pipeline으로 결합

In [33]:
# Stage order: index the string columns, one-hot encode the indices, then assemble the vector.
pipeline = Pipeline(stages=[*indexers, *encoders, assembler])

### pipeline 실행
pipeline.fit()

In [34]:
# Fit the pipeline stages on the training data.
pipeline_model = pipeline.fit(train_df)

In [35]:
# Apply the fitted pipeline to the training data.
train_transformed = pipeline_model.transform(train_df)

###  모델 생성
회귀 모델 학습 > 평가 > 예측

In [36]:
# Linear regression reading the 'feature' vector column and the 'label' target column.
lr = LinearRegression(featuresCol='feature', labelCol='label')

# Train on the pipeline-transformed training data.
lr_model = lr.fit(train_transformed)

24/12/13 10:25:58 WARN Instrumentation: [502686ef] regParam is zero, which might cause numerical instability and overfitting.


In [37]:
# Run the test data through the already-fitted pipeline and model.
# Only transform() is called here, so nothing is re-fitted on the test data.
test_transformed = pipeline_model.transform(test_df)
prediction = lr_model.transform(test_transformed)

In [41]:
# Inspect the first 10 predictions without truncating the feature vectors.
# The 'prediction' column is the model's predicted sale price.
prediction.select('id', 'feature', 'prediction').show(10, truncate=False)

+----+--------------------------------------------------------------------------------------------------------------+------------------+
|id  |feature                                                                                                       |prediction        |
+----+--------------------------------------------------------------------------------------------------------------+------------------+
|1461|(40,[0,24,30,31,32,33,34,35,37,38,39],[1.0,1.0,11622.0,5.0,6.0,1961.0,1961.0,896.0,896.0,1.0,730.0])          |117874.9926099244 |
|1462|(40,[0,24,30,31,32,33,34,35,37,38,39],[1.0,1.0,14267.0,6.0,6.0,1958.0,1958.0,1329.0,1329.0,1.0,312.0])        |154972.67874835432|
|1463|(40,[5,24,30,31,32,33,34,35,36,37,38,39],[1.0,1.0,13830.0,5.0,5.0,1997.0,1998.0,928.0,701.0,1629.0,2.0,482.0])|167400.6397246034 |
|1464|(40,[5,24,30,31,32,33,34,35,36,37,38,39],[1.0,1.0,9978.0,6.0,6.0,1998.0,1998.0,926.0,678.0,1604.0,2.0,470.0]) |185917.14120085537|
|1465|(40,[18,24,30,31,32,33,34,35,37,38,

In [48]:
# Save the results: keep id and the prediction (renamed to SalePrice) and write them
# as CSV with a header, overwriting any previous output at this path.
(
    prediction
    .select('id', 'prediction')
    .withColumnRenamed('prediction', 'SalePrice')
    .write
    .csv('data/output/house_machinelearning.csv', header=True, mode='overwrite')
)

In [51]:
# Read the saved predictions back (with header) and show the first 20 rows.
df_result = (
    ss.read
    .option('header', True)
    .csv('data/output/house_machinelearning.csv')
)
df_result.show(20)

+----+------------------+
|  id|         SalePrice|
+----+------------------+
|1461| 117874.9926099244|
|1462|154972.67874835432|
|1463| 167400.6397246034|
|1464|185917.14120085537|
|1465|262118.65943398117|
|1466| 174609.4180503923|
|1467| 178398.1783211385|
|1468|  167553.838098821|
|1469| 189964.9238947332|
|1470|107112.15387231868|
|1471|191095.79137392435|
|1472| 92549.17578985298|
|1473|  84174.0979652555|
|1474| 152207.6350938361|
|1475|124992.61163117061|
|1476| 351583.2811955109|
|1477|303689.40388719365|
|1478| 323879.4245192136|
|1479|318208.69366380596|
|1480|  394253.857511088|
+----+------------------+
only showing top 20 rows



### 예측 모델 활용
1. 파이프라인 모델 저장 > 로컬 (data/output/house_machinelearning/model) > 모델 저장소에 저장
2. 회귀 모델 저장 > 로컬 (data/output/house_machinelearning/model) > 모델 저장소에 저장

In [68]:
# Model artifacts get their own directory, NOT a sub-directory of
# 'data/output/house_machinelearning.csv' (the directory the prediction CSV is written to):
# re-running that CSV write with mode='overwrite' would delete anything stored under it,
# and reading that directory back as CSV could pick up the model files.
pipeline_model_save_path = 'data/output/house_machinelearning/model/pipeline'
lr_model_save_path = 'data/output/house_machinelearning/model/lr'

# Persist the fitted pipeline model so new data can be transformed the same way later.
pipeline_model.write().overwrite().save(pipeline_model_save_path)

# Persist the fitted linear regression model so it can be reused for prediction.
lr_model.write().overwrite().save(lr_model_save_path)
print('model save success~~')

model save success~~


### 모델 파이프라인 로드

In [66]:
# Reload the saved pipeline model from its save path.
loaded_pipeline = PipelineModel.load(pipeline_model_save_path)

In [67]:
# Display the loaded pipeline model.
loaded_pipeline

PipelineModel_1e417ef787f4

In [71]:
# Reload the saved linear regression model from its save path and display it.
loaded_lr_model = LinearRegressionModel.load(lr_model_save_path)
loaded_lr_model

LinearRegressionModel: uid=LinearRegression_5268ffde4db7, numFeatures=40

### 새로운 데이터로 예측

1. 새로운 데이터 준비 (샘플 데이터 1건을 직접 생성)
2. 파이프라인 모델을 이용해서 변환
3. 모델에 넣어서 예측

In [75]:
# Build a new sample record: each key is a column name mapped to a one-element list,
# so the resulting table has exactly one row.
data = {
    "Id": [1461],
    "MSSubClass": [20],
    "MSZoning": ["RH"],
    "LotFrontage": [80],
    "LotArea": [11622],
    "Street": ["Pave"],
    "Alley": [None],  # NA is represented as None
    "LotShape": ["Reg"],
    "LandContour": ["Lvl"],
    "Utilities": ["AllPub"],
    "LotConfig": ["Inside"],
    "LandSlope": ["Gtl"],
    "Neighborhood": ["NAmes"],
    "Condition1": ["Feedr"],
    "Condition2": ["Norm"],
    "BldgType": ["1Fam"],
    "HouseStyle": ["1Story"],
    "OverallQual": [5],
    "OverallCond": [6],
    "YearBuilt": [1961],
    "YearRemodAdd": [1961],
    "RoofStyle": ["Gable"],
    "RoofMatl": ["CompShg"],
    "Exterior1st": ["VinylSd"],
    "Exterior2nd": ["VinylSd"],
    "MasVnrType": [None],  # None stands for NA
    "MasVnrArea": [0],
    "ExterQual": ["TA"],
    "ExterCond": ["TA"],
    "Foundation": ["CBlock"],
    "BsmtQual": ["TA"],
    "BsmtCond": ["TA"],
    "BsmtExposure": ["No"],
    "BsmtFinType1": ["Rec"],
    "BsmtFinSF1": [468],
    "BsmtFinType2": ["LwQ"],
    "BsmtFinSF2": [144],
    "BsmtUnfSF": [270],
    "TotalBsmtSF": [882],
    "Heating": ["GasA"],
    "HeatingQC": ["TA"],
    "CentralAir": ["Y"],
    "Electrical": ["SBrkr"],
    "1stFlrSF": [896],
    "2ndFlrSF": [0],
    "LowQualFinSF": [0],
    "GrLivArea": [896],
    "BsmtFullBath": [0],
    "BsmtHalfBath": [0],
    "FullBath": [1],
    "HalfBath": [0],
    "BedroomAbvGr": [2],
    "KitchenAbvGr": [1],
    "KitchenQual": ["TA"],
    "TotRmsAbvGrd": [5],
    "Functional": ["Typ"],
    "Fireplaces": [0],
    "FireplaceQu": [None],  # NA is represented as None
    "GarageType": ["Attchd"],
    "GarageYrBlt": [1961],
    "GarageFinish": ["Unf"],
    "GarageCars": [1],
    "GarageArea": [730],
    "GarageQual": ["TA"],
    "GarageCond": ["TA"],
    "PavedDrive": ["Y"],
    "WoodDeckSF": [140],
    "OpenPorchSF": [0],
    "EnclosedPorch": [0],
    "3SsnPorch": [0],
    "ScreenPorch": [120],
    "PoolArea": [0],
    "PoolQC": [None],  # NA is represented as None
    "Fence": ["MnPrv"],
    "MiscFeature": [None],  # NA is represented as None
    "MiscVal": [0],
    "MoSold": [6],
    "YrSold": [2010],
    "SaleType": ["WD"],
    "SaleCondition":["Normal"]
}

# Write the one-row sample to CSV, leaving out the pandas index column.
pd.DataFrame(data).to_csv('data/output/house_csv.csv', index = False)

In [90]:
# Read the sample CSV back through Spark, with a header row and inferred column types.
new_test_data = (
    ss.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('data/output/house_csv.csv')
)

In [92]:
## Features the pipeline needs as input: the numeric columns plus the categorical columns.
## Both categorical columns indexed by the pipeline ('Neighborhood' and 'GarageType') must be
## listed; a frame restricted to this list without 'GarageType' would fail in the
## StringIndexer stage.
selected_features = [
    "LotArea", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd",
    "1stFlrSF", "2ndFlrSF", "GrLivArea", "GarageCars", "GarageArea",
    "Neighborhood", "GarageType",
]

In [93]:
# Apply the same preprocessing that the train/test data received before the pipeline:
# 1) cast the garage columns to integer,
for garage_col in ('GarageCars', 'GarageArea'):
    new_test_data = new_test_data.withColumn(garage_col, new_test_data[garage_col].cast('integer'))

# 2) fill missing values with 0. Without this, a null in any numeric feature column
#    would make the VectorAssembler stage fail, since it rejects nulls by default.
new_test_data = new_test_data.fillna(0)

In [99]:
# Feed the new data through the loaded pipeline.
new_pipe = loaded_pipeline.transform(new_test_data)

In [100]:
# Feed the pipeline-transformed data into the loaded regression model.
new_pred = loaded_lr_model.transform(new_pipe) # one row

In [118]:
# Show the prediction for the new sample.
# Alternative using select: new_pred.select('id', 'feature', 'prediction').show()
new_pred.selectExpr('id', 'feature', 'prediction').show()

+----+--------------------+-----------------+
|  id|             feature|       prediction|
+----+--------------------+-----------------+
|1461|(40,[0,24,30,31,3...|117874.9926099244|
+----+--------------------+-----------------+



In [122]:
# Inspect the schema of the prediction DataFrame.
new_pred.schema

StructType(List(StructField(Id,IntegerType,true),StructField(MSSubClass,IntegerType,true),StructField(MSZoning,StringType,true),StructField(LotFrontage,IntegerType,true),StructField(LotArea,IntegerType,true),StructField(Street,StringType,true),StructField(Alley,StringType,true),StructField(LotShape,StringType,true),StructField(LandContour,StringType,true),StructField(Utilities,StringType,true),StructField(LotConfig,StringType,true),StructField(LandSlope,StringType,true),StructField(Neighborhood,StringType,true),StructField(Condition1,StringType,true),StructField(Condition2,StringType,true),StructField(BldgType,StringType,true),StructField(HouseStyle,StringType,true),StructField(OverallQual,IntegerType,true),StructField(OverallCond,IntegerType,true),StructField(YearBuilt,IntegerType,true),StructField(YearRemodAdd,IntegerType,true),StructField(RoofStyle,StringType,true),StructField(RoofMatl,StringType,true),StructField(Exterior1st,StringType,true),StructField(Exterior2nd,StringType,true)

In [123]:
# csv 저장
# 결과 저장
# new_pred.write\
#         .csv('data/output/house_machinelearning.csv/new_pred.csv', header = True, mode = 'overwrite')

In [None]:
# log를 남긴다 : csv, 데이터베이스, hadoop 분산파일시스템(hdfs), kafka(streaming) (외부 저장)

In [124]:
# Stop the Spark session now that the notebook is finished.
ss.stop()