#### GRU 모델 성능 평가
사용 데이터: od_uuid/2023/00c02071a7d249b8b528230e9d63ad1d

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Bidirectional, TimeDistributed
from tensorflow.keras.preprocessing.text import Tokenizer
import geopy.distance
import string
from sklearn.metrics import precision_score, recall_score, f1_score

# Define a function to convert numbers into corresponding letter labels
def num_to_letter(num):
    '''
    num         : number that we have to convert
    '''
    return string.ascii_uppercase[num]

def generate_initial_grids():
        """초기 그리드 생성"""
        south, west, north, east = south_korea_bounds
        lat_step = (north - south) / grid_size
        lon_step = (east - west) / grid_size
        grid_queue = []

        for i in range(grid_size):
            for j in range(grid_size):
                grid_south = south + i * lat_step
                grid_north = south + (i + 1) * lat_step
                grid_west = west + j * lon_step
                grid_east = west + (j + 1) * lon_step
                grid_queue.append((grid_south, grid_west, grid_north, grid_east, 
                                   num_to_letter(i) + num_to_letter(j)))

        return grid_queue

# Create a function to get the grid label of the coordinate point
def get_grid_label(lat, lng, final_grids):
    '''
    lat         : latitude
    lng         : longitude
    final_grids : all cells and their minimum/maximum latitude/longitude
    '''
    for south, west, north, east, grid_label in final_grids:
        if south <= lat <= north and west <= lng <= east:
            return grid_label
    return None

def is_path_in_grid(south, west, north, east, path_points):
        """경로가 그리드 안에 있는지 확인"""
        return any(south <= lat <= north and west <= lng <= east for lat, lng in path_points)

def subdivide_grids(grid_queue, path_points):
    """그리드 분할"""
    final_grids = []

    while grid_queue:
        south, west, north, east, grid_label = grid_queue.pop(0)
        grid_size_km = min(geopy.distance.distance((south, west), (south, east)).km,
                           geopy.distance.distance((south, west), (north, west)).km)
            
        if grid_size_km > min_size_km and is_path_in_grid(south, west, north, east, path_points):
            mid_lat = (south + north) / 2
            mid_lon = (west + east) / 2
            grid_queue.append((south, west, mid_lat, mid_lon, grid_label + 'C'))
            grid_queue.append((mid_lat, west, north, mid_lon, grid_label + 'A'))
            grid_queue.append((south, mid_lon, mid_lat, east, grid_label + 'D'))
            grid_queue.append((mid_lat, mid_lon, north, east, grid_label + 'B'))
        else:
            final_grids.append((south, west, north, east, grid_label))

    return final_grids

# Approximate border coordinates of South Korea
south_korea_bounds = [33.10, 124.57, 38.60, 131]
min_size_km = 0.76
grid_size = 13

In [14]:
#데이터셋 로드
#os.chdir('../')
data_path=os.getcwd()+'/data/od_uuid/2023'

#데이터셋 칼럼명 추가
column_name=['id','start_time','end_time','start_lat','start_lng','end_lat','end_lng','?1','?2','?3']

raw_data=pd.read_csv(data_path+'/00c02071a7d249b8b528230e9d63ad1d.csv')
data=raw_data.values.tolist()
dataframe=pd.DataFrame(data, columns=column_name)

#전체데이터 그리드 생성
start_points=dataframe[['start_lat','start_lng']].values.tolist()
start_grid_queue = generate_initial_grids()
start_final_grids = subdivide_grids(start_grid_queue, start_points)

end_points=dataframe[['end_lat','end_lng']].values.tolist()
end_grid_queue = generate_initial_grids()
end_final_grids = subdivide_grids(end_grid_queue, end_points)

dataframe['start_grid']=dataframe.apply(lambda row: get_grid_label(row['start_lat'], row['start_lng'], start_final_grids), axis=1)
dataframe['end_grid']=dataframe.apply(lambda row: get_grid_label(row['end_lat'], row['end_lng'], end_final_grids), axis=1)
dataframe=dataframe[['start_grid', 'end_grid']]
dataframe

Unnamed: 0,start_grid,end_grid
0,GEACAADB,EGCADDDC
1,EGCADDDC,GEACAADA
2,GEACAADA,GFACDBDA
3,GFACDBDA,GFAADCDC
4,GFAADCDC,GFACDCBA
...,...,...
2604,GFACBABC,GFCADABA
2605,GFCADABA,GECDBACA
2606,GECDBACA,GECCDDAB
2607,GECDCACC,GDDDDDAA


In [15]:
# 입력과 출력 데이터를 토크나이저로 변환
tokenizer = Tokenizer(char_level=True)  # 문자 단위 토크나이저
tokenizer.fit_on_texts(dataframe['start_grid'] + dataframe['end_grid'])  # 입력과 출력 데이터 모두에 적용

# 문자열 -> 정수 시퀀스 변환
input_sequences = tokenizer.texts_to_sequences(dataframe['start_grid'])
output_sequences = tokenizer.texts_to_sequences(dataframe['end_grid'])

In [16]:
# One-Hot Encoding
num_classes = len(tokenizer.word_index) + 1  # 토큰 개수 +1 (0은 패딩에 사용)
y_sequences = np.array([to_categorical(seq, num_classes=num_classes) for seq in output_sequences])

In [17]:
# 훈련/테스트 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    np.array(input_sequences), y_sequences, test_size=0.2, random_state=42
)

In [18]:
# 모델 생성
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64, input_length=8),  # 임베딩 레이어
    Bidirectional(GRU(128, return_sequences=True)),  # GRU 레이어
    Bidirectional(GRU(64, return_sequences=True)),  # GRU 레이어
    Bidirectional(GRU(32, return_sequences=True)),  # GRU 레이어
    TimeDistributed(Dense(num_classes, activation='softmax'))
])

model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])



In [19]:
# 모델 훈련
history = model.fit(X_train, y_train, epochs=15, validation_split=0.2, batch_size=4)

# 모델 평가
loss, accuracy = model.evaluate(X_test, y_test)

# Precision, Recall, F1-Score 계산
y_pred = model.predict(X_test)

# One-Hot Encoded -> 정수 인코딩으로 변환
y_test_flat = np.argmax(y_test, axis=-1).flatten()
y_pred_flat = np.argmax(y_pred, axis=-1).flatten()
precision = precision_score(y_test_flat, y_pred_flat, average="macro")
recall = recall_score(y_test_flat, y_pred_flat, average="macro")
f1 = f1_score(y_test_flat, y_pred_flat, average="macro")

print('----------------------------------------------')
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f'테스트 정확도: {accuracy}')

Epoch 1/15
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - accuracy: 0.3832 - loss: 0.0663 - val_accuracy: 0.5087 - val_loss: 0.0563
Epoch 2/15
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.5167 - loss: 0.0560 - val_accuracy: 0.5493 - val_loss: 0.0537
Epoch 3/15
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.5741 - loss: 0.0515 - val_accuracy: 0.5577 - val_loss: 0.0534
Epoch 4/15
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.5955 - loss: 0.0496 - val_accuracy: 0.5652 - val_loss: 0.0512
Epoch 5/15
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.6018 - loss: 0.0492 - val_accuracy: 0.5697 - val_loss: 0.0519
Epoch 6/15
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.6347 - loss: 0.0462 - val_accuracy: 0.5700 - val_loss: 0.0506
Epoch 7/15
[1m418/418[0m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
# 예측 수행
example_text = 'GFAADCDC'
example_sequence = tokenizer.texts_to_sequences([example_text])[0]
example_sequence = np.array([example_sequence])  # 모델 입력 형태로 변환

# 예측 결과
predictions = model.predict(example_sequence)
predicted_tokens = np.argmax(predictions[0], axis=1)
predicted_text = ''.join([tokenizer.index_word[token] for token in predicted_tokens])
predicted_text=predicted_text.upper()
print(f"입력 텍스트: {example_text}")
print(f"예측된 텍스트: {predicted_text}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
입력 텍스트: GFAADCDC
예측된 텍스트: GFACDCDA


|index|start_grid|end_grid|
|---|---|---|
|0|GEACAADB|EGCADDDC|
|1|EGCADDDC|GEACAADA|
|2|GEACAADA|GFACDBDA|
|3|GFACDBDA|GFAADCDC|
|4|GFAADCDC|GFACDCBA|
|...|...|...|
|2604|GFACBABC|GFCADABA|
|2605|GFCADABA|GECDBACA|
|2606|GECDBACA|GECCDDAB|
|2607|GECDCACC|GDDDDDAA|
|2608|GDDDDDAB|GFCABACD|