## 連結至 Google Drive 檔案位置

In [1]:
from google.colab import drive
# Mount Google Drive into the Colab runtime at /content/drive.
drive.mount('/content/drive')
# Change the working directory to the homework folder so that the relative
# paths used by later cells (e.g. 'similarity_estimation/combined.csv') resolve.
%cd "/content/drive/MyDrive/Colab Notebooks/自然語言處理與文件探勘/HW3"

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/自然語言處理與文件探勘/HW3


# (2) Using word embedding for word similarity estimation

### 定義以 word embedding 為輸入的相似度迴歸模型與訓練函式

In [2]:
import torch.nn as nn

class WordEmbeddingModel(nn.Module):
    """Single linear layer mapping an ``input_dim``-sized vector to one scalar.

    In this notebook the input is the concatenation of two word vectors and
    the scalar output is trained against the pair's similarity score.
    """

    def __init__(self, input_dim):
        super().__init__()
        # The attribute stays named ``fc`` so parameter names are unchanged.
        self.fc = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return its output."""
        return self.fc(x)

def train(model, train_data, num_epochs, learning_rate):
    """Fit ``model`` to similarity scores with per-sample Adam steps on MSE loss.

    Args:
        model: Module called on the concatenation of a sample's two vectors.
        train_data: Iterable of ``(vector1, vector2, score)`` triples; the two
            vectors are joined with ``torch.cat`` along dim 0.
        num_epochs: Number of passes over ``train_data``.
        learning_rate: Learning rate handed to the Adam optimizer.

    Raises:
        ValueError: If ``train_data`` contains no samples (previously this
            surfaced as a bare ``ZeroDivisionError`` when averaging the loss).

    The mean per-sample loss is printed after every epoch.
    """
    # Materialize once: generators can then be replayed on every epoch and
    # the sample count used for the average is always available.
    samples = list(train_data)
    if not samples:
        raise ValueError("train_data is empty; nothing to train on.")

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        total_loss = 0.0

        for vector1, vector2, score in samples:
            inputs = torch.cat((vector1, vector2), dim=0)

            optimizer.zero_grad()
            outputs = model(inputs)
            # Build the target in the output's dtype/device so that integer
            # scores do not produce a Long target against a Float prediction.
            labels = torch.tensor([score], dtype=outputs.dtype, device=outputs.device)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        average_loss = total_loss / len(samples)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss:.4f}')

print("Function defines success.")

Function defines success.


### 讀檔 (combined.csv)

In [3]:
import csv

word_pairs = []
similarity_scores = []

# newline='' lets the csv module handle line endings itself (as its
# documentation recommends); the explicit encoding avoids depending on the
# platform default.
with open('similarity_estimation/combined.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=',')
    next(reader, None)  # skip the header row; the default avoids StopIteration on an empty file
    for row in reader:
        if not row:
            continue  # csv.reader yields an empty list for blank lines
        # Columns: first word, second word, similarity score.
        word_pairs.append((row[0], row[1]))
        similarity_scores.append(float(row[2]))

print("Train & test data generates successfully.")

Train & test data generates successfully.


### 利用 word2Vec 取得訓練集特徵值

In [4]:
import torch
import gensim

# Every word pair is fed to Word2Vec as a two-token "sentence".
sentences = [[word1, word2] for word1, word2 in word_pairs]

# NOTE(review): no seed is set and several workers are used, so the learned
# vectors are expected to differ between runs — confirm whether that matters.
word2vec_model = gensim.models.Word2Vec(sentences, vector_size=200, window=5, min_count=1, workers=10)
word_vectors = word2vec_model.wv

# One (vector1, vector2, score) triple per word pair.
train_data = []
for (word1, word2), score in zip(word_pairs, similarity_scores):
    # torch.tensor copies the array. torch.from_numpy shares memory with the
    # embedding table and warns when the source array is not writable, which
    # appears to be the warning echoed in this cell's saved output.
    vector1 = torch.tensor(word_vectors[word1])
    vector2 = torch.tensor(word_vectors[word2])
    train_data.append((vector1, vector2, score))

print("Feature transform successfully.")

Feature transform successfully.


  vector1 = torch.from_numpy(word_vectors[word1])


### 訓練 word embedding 模型 並進行相似度預測

In [5]:
import torch.optim as optim

num_epochs = 100
learning_rate = 0.3

# The model consumes two concatenated word vectors, so its input size is
# derived from the embedding width instead of a hard-coded 400 that would
# silently go stale if the Word2Vec vector_size were changed.
model = WordEmbeddingModel(input_dim=2 * word_vectors.vector_size)
train(model, train_data, num_epochs, learning_rate)

# NOTE(review): predictions are made on the same word pairs the model was
# trained on, so they measure fit rather than generalization.
predictions = []
actual_scores = []
with torch.no_grad():  # inference only; no gradients needed
    for (word1, word2), score in zip(word_pairs, similarity_scores):
        vector1 = torch.tensor(word_vectors[word1])
        vector2 = torch.tensor(word_vectors[word2])
        inputs = torch.cat((vector1, vector2), dim=0)
        prediction = model(inputs)
        # Store the scalar prediction rounded to two decimals next to its gold score.
        predictions.append(round(prediction.item(), 2))
        actual_scores.append(score)

print("Classification model trained and predicted.")

Epoch 1/100, Loss: 6.4068
Epoch 2/100, Loss: 4.2303
Epoch 3/100, Loss: 3.4631
Epoch 4/100, Loss: 3.0155
Epoch 5/100, Loss: 2.6943
Epoch 6/100, Loss: 2.4447
Epoch 7/100, Loss: 2.2477
Epoch 8/100, Loss: 2.0894
Epoch 9/100, Loss: 1.9580
Epoch 10/100, Loss: 1.8455
Epoch 11/100, Loss: 1.7473
Epoch 12/100, Loss: 1.6609
Epoch 13/100, Loss: 1.5848
Epoch 14/100, Loss: 1.5175
Epoch 15/100, Loss: 1.4578
Epoch 16/100, Loss: 1.4047
Epoch 17/100, Loss: 1.3570
Epoch 18/100, Loss: 1.3138
Epoch 19/100, Loss: 1.2742
Epoch 20/100, Loss: 1.2375
Epoch 21/100, Loss: 1.2032
Epoch 22/100, Loss: 1.1707
Epoch 23/100, Loss: 1.1398
Epoch 24/100, Loss: 1.1102
Epoch 25/100, Loss: 1.0817
Epoch 26/100, Loss: 1.0541
Epoch 27/100, Loss: 1.0275
Epoch 28/100, Loss: 1.0017
Epoch 29/100, Loss: 0.9768
Epoch 30/100, Loss: 0.9527
Epoch 31/100, Loss: 0.9295
Epoch 32/100, Loss: 0.9071
Epoch 33/100, Loss: 0.8856
Epoch 34/100, Loss: 0.8649
Epoch 35/100, Loss: 0.8450
Epoch 36/100, Loss: 0.8260
Epoch 37/100, Loss: 0.8078
Epoch 38/1

### 匯出預測結果與人工標記相似度之檔案

In [6]:
import pandas as pd
import numpy as np

# One row per word pair: both words, the model's prediction and the gold score.
first_words = [first for first, _ in word_pairs]
second_words = [second for _, second in word_pairs]

similarity_columns = {
    'Word1': first_words,
    'Word2': second_words,
    'Prediction': predictions,
    'Actual': actual_scores,
}
df_similarity = pd.DataFrame(similarity_columns)

# Write the table without the index column.
output_file = "similarity_estimation/similarity_predictions.csv"
df_similarity.to_csv(output_file, index=False)
print(f"Sucessful, output result at ./{output_file}")

Sucessful, output result at ./similarity_estimation/similarity_predictions.csv


# (3) Using word embedding for analogy prediction

### 只需跑一次 生成GloVe轉Word2Vec檔案 (約需執行5分鐘)

In [7]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

import os  # local import: `os` is otherwise only imported by a later cell

glove_file = "glove.6B.300d.txt"
word2vec_file = "glove_word2vec.txt"

# The conversion is slow and its result is written to disk, so only run it
# when the converted file is missing. Delete the file to force a rebuild
# (e.g. after an interrupted conversion).
if not os.path.exists(word2vec_file):
    glove2word2vec(glove_file, word2vec_file)

# NOTE: this rebinds `word_vectors`, which earlier cells use for the Word2Vec
# vectors; re-running those cells after this one would use the GloVe vectors.
word_vectors = KeyedVectors.load_word2vec_format(word2vec_file)

print("Word vector loaded successfully.")

  glove2word2vec(glove_file, word2vec_file)


Word vector loaded successfully.


### 讀檔 分割訓練與測試資料集 (4:1)

In [8]:
import os
import re
import random

SPLIT_SEED = 42    # fixed seed so the train/test split is reproducible
TRAIN_RATIO = 0.8  # 4:1 train/test split


def parse_analogy_line(line):
    """Turn one ``word1<TAB>alt1/alt2/...`` line into ``[word1, word2]`` pairs.

    Alternatives that are empty or not made up solely of ASCII letters are
    dropped. A blank line yields an empty list.

    Raises:
        ValueError: If a non-blank line does not have exactly two
            tab-separated fields.
    """
    stripped = line.rstrip('\n')
    if not stripped.strip():
        return []
    word1, word2_list = stripped.split('\t')
    return [
        [word1, word2]
        for word2 in word2_list.split('/')
        if word2 and re.match('^[a-zA-Z]+$', word2)
    ]


train_label = []
train_word_pair = []
test_label = []
test_word_pair = []
main_folder = "/content/drive/MyDrive/Colab Notebooks/自然語言處理與文件探勘/HW3/analogy_dataset"

# A private generator keeps the split reproducible without touching the
# global `random` state.
split_rng = random.Random(SPLIT_SEED)

for root, dirs, files in os.walk(main_folder):
    dirs.sort()  # in-place sort fixes the order in which os.walk descends
    for file_name in sorted(files):
        file_path = os.path.join(root, file_name)
        # The label of every pair in a file is the file name without extension.
        folder_name = os.path.splitext(file_name)[0]

        candidate_word_pair = []
        with open(file_path, 'r', encoding='utf-8') as file_obj:
            for line in file_obj:
                candidate_word_pair.extend(parse_analogy_line(line))

        # Shuffle within the file, then cut so each category is split 4:1.
        split_rng.shuffle(candidate_word_pair)
        train_size = int(len(candidate_word_pair) * TRAIN_RATIO)
        train_lines = candidate_word_pair[:train_size]
        test_lines = candidate_word_pair[train_size:]

        train_label.extend([folder_name] * len(train_lines))
        train_word_pair.extend(train_lines)
        test_label.extend([folder_name] * len(test_lines))
        test_word_pair.extend(test_lines)

print("Train & test data generates successfully.")

Train & test data generates successfully.


### 將訓練與測試資料集 字詞組合透過 GloVe 轉換為向量特徵值、標籤透過 LabelEncoder 轉換

In [9]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

def embed_pairs(pairs, labels, vectors):
    """Build one concatenated feature vector per word pair.

    Args:
        pairs: Sequence of ``[word1, word2]`` items.
        labels: Encoded labels aligned with ``pairs``.
        vectors: Object exposing ``get_vector(word)`` that raises ``KeyError``
            for unknown words.

    Returns:
        ``(features, kept_labels)`` as two lists of equal length. A pair is
        dropped, together with its label, when either word has no vector.
    """
    features = []
    kept_labels = []
    for (word1, word2), label in zip(pairs, labels):
        try:
            vector1 = vectors.get_vector(word1)
            vector2 = vectors.get_vector(word2)
        except KeyError:
            continue  # out-of-vocabulary word: skip the whole pair
        features.append(np.concatenate((vector1, vector2)))
        kept_labels.append(label)
    return features, kept_labels


label_encoder = LabelEncoder()
# Fit on both splits: a small category whose pairs all landed in the test
# split would otherwise make transform() fail on an unseen label. When every
# test label also occurs in training, the encoding is the same as fitting on
# the training labels alone.
label_encoder.fit(list(train_label) + list(test_label))
encoded_train_labels = label_encoder.transform(train_label)
encoded_test_labels = label_encoder.transform(test_label)

train_features, train_labels = embed_pairs(train_word_pair, encoded_train_labels, word_vectors)
test_features, test_labels = embed_pairs(test_word_pair, encoded_test_labels, word_vectors)
print("Feature transform successfully.")

Feature transform successfully.


### 分類器訓練和預測

In [10]:
from sklearn.svm import SVC

# Fit a default-configured support vector classifier on the training
# features (fit returns the estimator itself), then label the test features.
svm = SVC().fit(train_features, train_labels)
predictions = svm.predict(test_features)

print("Classification model trained and predicted.")

Classification model trained and predicted.


### 輸出分類預測精準度

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Options shared by the per-class metrics: macro averaging, and a score of 1
# wherever a metric would otherwise be undefined (division by zero).
# NOTE(review): zero_division=1 can raise the macro averages for classes that
# are never predicted — confirm this is intended.
macro_options = {'average': 'macro', 'zero_division': 1}

accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions, **macro_options)
recall = recall_score(test_labels, predictions, **macro_options)
f1 = f1_score(test_labels, predictions, **macro_options)

print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
    sep="\n",
)

Accuracy: 0.8755020080321285
Precision: 0.8608785539934637
Recall: 0.734383275219562
F1-score: 0.7730748468401523
