In [3]:
import obspy
import numpy as np
from obspy import read, UTCDateTime
import json
import os

In [14]:
# -------------------- 準備資料 --------------------
# 檔案路徑和時間設定
mseed_file_path = "/home/sysop/pytorch/Data/241122Tainan.mseed"
t1 = UTCDateTime(2024, 11, 22, 12, 40, 30)
t2 = UTCDateTime(2024, 11, 22, 12, 40, 35)
st = read(mseed_file_path)

# -------------------- 建立標籤映射 --------------------
# 這是你的分類規則
# positive_stations = ["ALS", "CHN8", "CHY", "SCL", "ELD"]
positive_stations = ["ALS", "ELD", "PNG", "SPT", "SSD", "WDL", "WGK", "WNT", "WTC", "WYL", "YUS"]

def get_label(station_code):
    if station_code in positive_stations:
        return 1  # 標籤 1
    else:
        return 0  # 標籤 0

# -------------------- 產生 JSON 格式數據 --------------------
# 建立一個空的列表來儲存每一筆數據
json_data_list = []

# 遍歷你的 st2 中的每一條波形 (trace)
for trace in st:
    # 獲取標籤
    station_code = trace.stats.station
    label = get_label(station_code)
    
    # 將波形數據轉換為 Python 列表
    waveform_data = trace.data.tolist()
       
    # 建立單筆數據的字典
    data_item = {
        "data": waveform_data,  # 這裡的 "data" 鍵值對應你的 X_train
        "label": label           # 這裡的 "label" 鍵值對應你的 Y_train
    }
        
    # 將這筆數據加入列表
    json_data_list.append(data_item)

# -------------------- 將列表儲存為 JSON 檔案 --------------------
output_json_file = "test_data.json"
with open(output_json_file, 'w') as f:
    json.dump(json_data_list, f, indent=4)

print(f"數據已成功轉換並儲存為 {output_json_file}")

數據已成功轉換並儲存為 test_data.json


In [10]:
from datasets import load_dataset
data_files = {"train": "train_data.json", "test": "test_data.json"}
dataset = load_dataset("json", data_files = data_files)
print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['data', 'label'],
        num_rows: 201
    })
    test: Dataset({
        features: ['data', 'label'],
        num_rows: 186
    })
})


In [12]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = (
    BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels = 2,
        id2label = {0: "negative", 1: "positive"},
        label2id = {"negative": 0, "positive": 1}
    ).to(device)
)
model_name = "sentiment_model"

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score

def preprocess_function(example):
  return tokenizer(example['text'], truncation = True, padding = True)

train_dataset = dataset["train"].map(preprocess_function, batched = True)
test_dataset = dataset["test"].map(preprocess_function, batched = True)

data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

def compute_metrics(pred):
  labels = pred.label_ids
  predictions = pred.predictions.argmax(-1)
  accuracy = accuracy_score(labels, predictions)
  return {"accuracy": accuracy}

Map:   0%|          | 0/201 [00:00<?, ? examples/s]

KeyError: 'text'