In [14]:
from pyspark.ml.feature import (
    StringIndexer,        # 범주형 → 수치형 인코딩
    OneHotEncoder,        # 원-핫 인코딩
    VectorAssembler,      # 여러 컬럼 → 하나의 feature 벡터
    StandardScaler,       # 표준 정규화
    MinMaxScaler,         # 최소/최대 스케일링
    Bucketizer,           # 연속형 변수 → 구간화
    QuantileDiscretizer,  # 분위수 기반 구간화
    PCA,                  # 주성분 분석
    PolynomialExpansion,  # 다항 특성 생성
    ChiSqSelector         # 카이제곱 기반 피처 선택
)
from pyspark.ml.classification import (
    LogisticRegression,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GBTClassifier,
    NaiveBayes,
    MultilayerPerceptronClassifier
)

from pyspark.ml.regression import (
    LinearRegression,
    DecisionTreeRegressor,
    RandomForestRegressor,
    GBTRegressor
)
from pyspark.ml.clustering import (
    KMeans,
    GaussianMixture,
    BisectingKMeans,
    LDA  # Latent Dirichlet Allocation (토픽 모델링)
)
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
    RegressionEvaluator,
    ClusteringEvaluator
)
from pyspark.ml import Pipeline  # 전체 파이프라인 구성

from pyspark.ml.tuning import (   # 모델 튜닝
    ParamGridBuilder,
    CrossValidator,
    TrainValidationSplit
)
from pyspark.ml.linalg import Vectors, DenseVector, SparseVector  # 벡터 수동 생성
from pyspark.ml.stat import Correlation, ChiSquareTest            # 통계 테스트

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('taxi-fare-prediction').getOrCreate()
spark

In [2]:
import os
cwd =os.getcwd()
trip_data_path = os.path.join(cwd, 'learning_spark_data', 'trips', '*.csv' )
trip_data_path

'/home/jovyan/work/learning_spark_data/trips/*.csv'

In [3]:
file_path = f"file:///{trip_data_path.replace(os.sep, '/')}"

In [4]:
trip_df = spark.read.csv(file_path, inferSchema=True, header=True)
trip_df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [6]:
trip_df.createOrReplaceTempView('trips')

In [7]:
query = """
SELECT
    trip_distance,
    total_amount
FROM trips

WHERE total_amount < 5000
  AND total_amount > 0
  AND trip_distance > 0
  AND trip_distance < 500
  AND passenger_count < 4
  AND TO_DATE(tpep_pickup_datetime) >= "2021-01-01"
  AND TO_DATE(tpep_pickup_datetime) < "2021-08-01"
"""
trip_df = spark.sql(query)

In [8]:
trip_df.createOrReplaceTempView('data')

In [10]:
# data table 결과 확인하기
spark.sql('select * from data limit 5').show()

+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|         16.5|       70.07|
|         1.13|       11.16|
|         2.68|       18.59|
|         12.4|        43.8|
|          9.7|        32.3|
+-------------+------------+



In [13]:
# split 8:2
train_data, test_data = trip_df.randomSplit([0.8,0.2], seed = 12)
train_data.show(3), test_data.show(3) 

+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|         0.01|        3.05|
|         0.01|         3.3|
|         0.01|         3.3|
+-------------+------------+
only showing top 3 rows

+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|         0.01|         3.3|
|         0.01|         3.3|
|         0.01|         3.3|
+-------------+------------+
only showing top 3 rows



(None, None)

In [16]:
#vectorassembler > feature : trip_distance, target: total_amount

vassembler = VectorAssembler(
    inputCols=['trip_distance'],        
    outputCol='features'                
)
vtrain_df = vassembler.transform(train_data)
vtrain_df.select('features', 'total_amount').show(3)

+--------+------------+
|features|total_amount|
+--------+------------+
|  [0.01]|        3.05|
|  [0.01]|         3.3|
|  [0.01]|         3.3|
+--------+------------+
only showing top 3 rows



In [20]:
# linearRegression 생성 maxIter = 50, LabelCol='total_amount', featurescol='features'
lr = LinearRegression(maxIter=50, labelCol='total_amount', featuresCol='features')
# fit
lrModel = lr.fit(vtrain_df)
# vassem transform
vtest_df  =vassembler.transform(test_data)
# model.transform
pred = lrModel.transform(vtest_df)
pred.show(5)

+-------------+------------+--------+----------------+
|trip_distance|total_amount|features|      prediction|
+-------------+------------+--------+----------------+
|         0.01|         3.3|  [0.01]|9.38115212649648|
|         0.01|         3.3|  [0.01]|9.38115212649648|
|         0.01|         3.3|  [0.01]|9.38115212649648|
|         0.01|         3.3|  [0.01]|9.38115212649648|
|         0.01|         3.3|  [0.01]|9.38115212649648|
+-------------+------------+--------+----------------+
only showing top 5 rows



In [31]:
spark.stop()

In [22]:
lrModel.summary.rootMeanSquaredError

6.226191687602188

In [23]:
lrModel.summary.r2

0.770012630835246

In [24]:
# 새로운 데이터로 예측하기

In [29]:
from pyspark.sql.types import DoubleType
new_distance_list = [1.1, 5.4, 10.2, 30.0]
distance_df = spark.createDataFrame(new_distance_list, DoubleType()).toDF('trip_distance')
distance_df.show()

+-------------+
|trip_distance|
+-------------+
|          1.1|
|          5.4|
|         10.2|
|         30.0|
+-------------+



In [30]:
vassembler = VectorAssembler(
    inputCols=['trip_distance'],        
    outputCol='features'                
)
vdistance_df = vassembler.transform(distance_df)
lrModel.transform(vdistance_df).show()

+-------------+--------+------------------+
|trip_distance|features|        prediction|
+-------------+--------+------------------+
|          1.1|   [1.1]|12.642230524073724|
|          5.4|   [5.4]|25.507035211763775|
|         10.2|  [10.2]|39.867747421278246|
|         30.0|  [30.0]| 99.10568528552545|
+-------------+--------+------------------+

