# Step8-model & analysis

## 1. Import necessary modules & start a Spark session

In [None]:
# Import necessary modules
from pyspark.sql import SparkSession
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

In [None]:
# Create (or reuse) the Spark session used to load the parquet data.
# NOTE(review): the app name still refers to 'step_1-download_data.py', which
# looks copied from an earlier step; left unchanged here -- confirm and rename.
spark = (
    SparkSession.builder.appName('step_1-download_data.py')
    .config('spark.sql.repl.eagerEval.enabled', True)
    .config('spark.sql.parquet.cacheMetadata', 'true')
    .config('spark.sql.session.timeZone', 'Etc/UTC')
    .config('spark.driver.memory', '16g')
    # The key was misspelled as 'spark.executer.memory'; Spark does not
    # recognise that name, so the executor memory setting was never applied.
    .config('spark.executor.memory', '16g')
    .getOrCreate()
)

## 2. Preparation before models

### 2.1 Import `merged_data` & `merged_data_for_test` from the directory `data/merged_data/`

In [None]:
# Paths of the two parquet datasets under ../data/merged_data/
merged_data_path = '../data/merged_data/merged_data.parquet/'
merged_data_for_test_path = '../data/merged_data/merged_data_for_test.parquet/'

# Load both datasets as Spark DataFrames
merged_data = spark.read.parquet(merged_data_path)
merged_data_for_test = spark.read.parquet(merged_data_for_test_path)

### 2.2 Create `train_data` & `test_data`

Create `train_data` by sampling from `merged_data`, with sample_size = 0.004

In [None]:
# Fraction of `merged_data` to pull into pandas for training
sample_size = 0.004

# Sample with a fixed seed, convert to pandas, then discard rows with missing values
train_data = (
    merged_data
    .sample(fraction=sample_size, seed=1)
    .toPandas()
    .dropna()
)

Create `test_data` by sampling from `merged_data_for_test`, with sample_size = 0.01

In [None]:
# Fraction of `merged_data_for_test` to pull into pandas for testing
sample_size = 0.01

# Sample with a fixed seed, convert to pandas, then discard rows with missing values
test_data = (
    merged_data_for_test
    .sample(fraction=sample_size, seed=1)
    .toPandas()
    .dropna()
)

### 2.3 Change the continuous features of type `int` or `object` to float type

In [None]:
# Cast the target and the UV index to float.
train_data['trip_duration'] = train_data['trip_duration'].astype(float)
train_data['uv_index'] = train_data['uv_index'].astype(float)

# Parse temperature / visibility; entries that cannot be parsed become NaN
# because of errors='coerce'.
train_data['temperature'] = pd.to_numeric(train_data['temperature'], errors='coerce')
train_data['visibility'] = pd.to_numeric(train_data['visibility'], errors='coerce')

# The coercion above can introduce NaN *after* the dropna() applied when the
# sample was created. Remove those rows so no NaN 'temperature' reaches the
# models. 'visibility' is not checked because it is dropped in section 2.4.
train_data = train_data.dropna(subset=['temperature'])
train_data.head()

In [None]:
# Cast the target and the UV index to float.
test_data['trip_duration'] = test_data['trip_duration'].astype(float)
test_data['uv_index'] = test_data['uv_index'].astype(float)

# Parse temperature / visibility; entries that cannot be parsed become NaN
# because of errors='coerce'.
test_data['temperature'] = pd.to_numeric(test_data['temperature'], errors='coerce')
test_data['visibility'] = pd.to_numeric(test_data['visibility'], errors='coerce')

# The coercion above can introduce NaN *after* the dropna() applied when the
# sample was created. Remove those rows so no NaN 'temperature' reaches the
# models. 'visibility' is not checked because it is dropped in section 2.4.
test_data = test_data.dropna(subset=['temperature'])
test_data.head()

### 2.4 Remove the features which will not be used in the models

In [None]:
# Columns excluded from modelling; the same set is removed from both frames
unused_columns = ['date', 'average_speed', 'visibility']
train_data = train_data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)

### 2.5 Show data shapes of `train_data` & `test_data`

In [None]:
# Report the number of training rows, then preview the first rows
print('#rows of train_data: ', train_data.shape[0])
train_data.head()

In [None]:
# Report the number of test rows, then preview the first rows
print('#rows of test_data: ', test_data.shape[0])
test_data.head()

## 3. Linear regression model

### 3.1 Build linear regression model

In [None]:
# Define the feature lists.
continuous_features = ['#passenger', 'trip_distance', 'congestion_fee', 'toll_fee', 'temperature', 'uv_index']

# Flag columns are passed to the model unchanged.
# NOTE(review): assumes every `if_*` column is a 0/1 (or boolean) indicator -- confirm.
flag_features = [
    'if_weekend', 'if_peak_hour', 'if_overnight',
    'if_airport', 'if_rain', 'if_snow', 'if_overcast', 'if_cloudy', 'if_clear'
]

# Location identifiers are nominal labels, not quantities: a linear model must
# not treat them (or their product) as numeric magnitudes, so they are one-hot
# encoded below.
nominal_features = ['up_location_id', 'off_location_id', 'location_interaction_term']

# Full list of discrete features (same members and order as before).
discrete_features = ['up_location_id', 'off_location_id'] + flag_features + ['location_interaction_term']

# Create the interaction term for 'up_location_id' & 'off_location_id' as a
# pickup/drop-off pair label (e.g. '132_48'). Multiplying the two ids, as done
# previously, yields a number with no meaning and maps different pairs to the
# same value.
train_data['location_interaction_term'] = (
    train_data['up_location_id'].astype(str) + '_' + train_data['off_location_id'].astype(str)
)
test_data['location_interaction_term'] = (
    test_data['up_location_id'].astype(str) + '_' + test_data['off_location_id'].astype(str)
)

# Process continuous, flag and nominal features at the same time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_features),
        ('flag', 'passthrough', flag_features),
        # handle_unknown='ignore': ids / pairs that only occur in the test
        # sample are encoded as all-zero instead of raising at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), nominal_features),
    ])

# Split train_data to train_data_X & train_data_Y
train_data_X = train_data.drop('trip_duration', axis=1)
train_data_Y = train_data['trip_duration']

# Split test_data to test_data_X & test_data_Y
test_data_X = test_data.drop('trip_duration', axis=1)
test_data_Y = test_data['trip_duration']

# Create & train linear regression model.
# NOTE(review): the final step is named 'classifier' although the estimator is
# a regressor; the name is kept so existing lookups by step name keep working.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearRegression())
])

# Trailing semicolon suppresses the notebook's echo of the fitted pipeline.
model.fit(train_data_X, train_data_Y);

### 3.2 Use the linear regression model to predict & evaluate

In [None]:
# Predict on both the training sample and the test sample
train_prediction_Y = model.predict(train_data_X)
test_prediction_Y = model.predict(test_data_X)

# R^2 & MSE on the training sample
train_r2, train_mse = (
    r2_score(train_data_Y, train_prediction_Y),
    mean_squared_error(train_data_Y, train_prediction_Y),
)

# R^2 & MSE on the test sample
test_r2, test_mse = (
    r2_score(test_data_Y, test_prediction_Y),
    mean_squared_error(test_data_Y, test_prediction_Y),
)

# Report train metrics, a blank separator, then test metrics
print(f"Train R^2: {train_r2:.4f}", f"Train MSE: {train_mse:.4f}", sep='\n')
print('\n')
print(f"Test R^2: {test_r2:.4f}", f"Test MSE: {test_mse:.4f}", sep='\n')

## 4. Random forest regressor

### 4.1 Build random forest regressor

In [None]:
# Feature lists: numeric columns to be scaled, discrete columns to be one-hot encoded
continuous_features = ['#passenger', 'trip_distance', 'congestion_fee', 'toll_fee', 'temperature', 'uv_index']
discrete_features = [
    'up_location_id', 'off_location_id', 'if_weekend', 'if_peak_hour', 'if_overnight', 'if_airport',
    'if_rain', 'if_snow', 'if_overcast', 'if_cloudy', 'if_clear', 'location_interaction_term'
]

# Build the pickup/drop-off pair label '<up>_<off>' on both feature frames
for frame in (train_data_X, test_data_X):
    pickup_id = frame['up_location_id'].astype(str)
    dropoff_id = frame['off_location_id'].astype(str)
    frame['location_interaction_term'] = pickup_id + '_' + dropoff_id

# z-score normalisation for the continuous features
continuous_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# One-hot encoding for the discrete features; categories unseen during fit are ignored
discrete_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each feature group through its transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', continuous_transformer, continuous_features),
        ('cat', discrete_transformer, discrete_features),
    ]
)

# Preprocessing followed by the random forest regressor
forest = RandomForestRegressor(n_estimators=100, max_depth=8, random_state=42)
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', forest),
    ]
)

# Trailing semicolon suppresses the notebook's echo of the fitted pipeline
model.fit(train_data_X, train_data_Y);

### 4.2 Use the random forest regressor to predict & evaluate

In [None]:
# Predict on both the training sample and the test sample
train_prediction_Y = model.predict(train_data_X)
test_prediction_Y = model.predict(test_data_X)

# R^2 & MSE on the training sample
train_r2, train_mse = (
    r2_score(train_data_Y, train_prediction_Y),
    mean_squared_error(train_data_Y, train_prediction_Y),
)

# R^2 & MSE on the test sample
test_r2, test_mse = (
    r2_score(test_data_Y, test_prediction_Y),
    mean_squared_error(test_data_Y, test_prediction_Y),
)

# Report train metrics, a blank separator, then test metrics
print(f"Train R^2: {train_r2:.4f}", f"Train MSE: {train_mse:.4f}", sep='\n')
print('\n')
print(f"Test R^2: {test_r2:.4f}", f"Test MSE: {test_mse:.4f}", sep='\n')

## 5. Stop the Spark session

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()