In [2]:
import numpy as np
import pandas as pd
import re
'''
train_data = pd.read_csv('train.tsv', sep='\t')

train_phrases = train_data['Phrase']
train_sentiments = train_data['Sentiment']

dev_data = pd.read_csv('test.tsv', sep='\t')

dev_phrases = train_data['Phrase']
dev_sentiments = train_data['Sentiment']

'''

# Both splits are read as header-less, tab-separated files, so the two column
# names are supplied explicitly instead of being taken from a header row.
train_data = pd.read_csv(
    'new_train.tsv',
    sep='\t',
    header=None,
    names=['Phrase', 'Sentiment'],
)
# Split the training frame into its text column and its label column.
train_phrases, train_sentiments = train_data['Phrase'], train_data['Sentiment']

dev_data = pd.read_csv(
    'new_test.tsv',
    sep='\t',
    header=None,
    names=['Phrase', 'Sentiment'],
)
# Same split for the second file, which the later cells use for evaluation.
dev_phrases, dev_sentiments = dev_data['Phrase'], dev_data['Sentiment']

In [3]:
def tokenize(phrase):
    """Split a phrase into lower-cased, punctuation-free tokens.

    Every character that is neither a word character (``\\w``) nor whitespace
    is deleted (not replaced by a space), so ``"don't"`` becomes ``"dont"``.
    The remaining text is lower-cased and split on runs of whitespace.

    Args:
        phrase: The text to tokenize (a ``str``).

    Returns:
        A list of token strings; empty when nothing but punctuation or
        whitespace was supplied.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', phrase)
    lowered = without_punctuation.lower()
    return lowered.split()

In [4]:
'''
def build_bow_vectors(phrases, vocab=None):
    # 构建词汇表
    if vocab is None:
        all_words = set()
        for phrase in phrases:
            words = tokenize(phrase)
            all_words.update(words)
        vocab = list(all_words)
    
    vocab_size = len(vocab)
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    
    # 创建向量
    vectors = []
    for phrase in phrases:
        words = tokenize(phrase)
        vector = np.zeros(vocab_size)
        for word in words:
            if word in word_to_idx:
                vector[word_to_idx[word]] += 1
        vectors.append(vector)
    return np.array(vectors), vocab
    '''

'\ndef build_bow_vectors(phrases, vocab=None):\n    # 构建词汇表\n    if vocab is None:\n        all_words = set()\n        for phrase in phrases:\n            words = tokenize(phrase)\n            all_words.update(words)\n        vocab = list(all_words)\n    \n    vocab_size = len(vocab)\n    word_to_idx = {word: idx for idx, word in enumerate(vocab)}\n    \n    # 创建向量\n    vectors = []\n    for phrase in phrases:\n        words = tokenize(phrase)\n        vector = np.zeros(vocab_size)\n        for word in words:\n            if word in word_to_idx:\n                vector[word_to_idx[word]] += 1\n        vectors.append(vector)\n    return np.array(vectors), vocab\n    '

In [5]:
# Bag-of-Words向量化
from scipy import sparse

# Bag-of-Words vectorisation (sparse matrix)
def build_bow_vectors(phrases, vocab=None):
    """Turn phrases into a sparse bag-of-words count matrix.

    Args:
        phrases: Sized iterable of strings (``len(phrases)`` must work); each
            phrase is split into words with ``tokenize``.
        vocab: Optional list of words fixing the column order. When ``None``
            the vocabulary is built from ``phrases`` and sorted. Words that
            are not in the vocabulary are ignored.

    Returns:
        A ``(vectors, vocab)`` tuple. ``vectors`` is a
        ``scipy.sparse.csr_matrix`` of shape ``(len(phrases), len(vocab))``
        and dtype ``float64`` whose entry ``[i, j]`` counts how often
        ``vocab[j]`` occurs in phrase ``i``. ``vocab`` is the word list that
        defines the columns (the argument itself when one was passed in).
    """
    # Tokenise once; the same tokens serve vocabulary building and counting.
    tokenized = [tokenize(phrase) for phrase in phrases]

    if vocab is None:
        # Sort so the column order does not depend on set iteration order,
        # which changes between interpreter runs because str hashes are
        # randomised. A stable order keeps the seeded training reproducible.
        vocab = sorted({word for words in tokenized for word in words})

    vocab_size = len(vocab)
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}

    row = []
    col = []
    for i, words in enumerate(tokenized):
        for word in words:
            idx = word_to_idx.get(word)
            if idx is not None:
                row.append(i)
                col.append(idx)

    # One unit entry per occurrence: repeated (row, col) pairs are summed when
    # the CSR matrix is assembled, which yields the per-phrase counts.
    data = np.ones(len(row), dtype=np.float64)
    vectors = sparse.csr_matrix((data, (row, col)), shape=(len(phrases), vocab_size), dtype=np.float64)
    return vectors, vocab

In [6]:
'''
# N-gram向量化（以二元组为例）
def build_ngram_vectors(phrases, vocab=None, n=3):
    # 构建N-gram词汇表
    if vocab is None:
        all_ngrams = set()
        for phrase in phrases:
            words = tokenize(phrase)
            if(len(words)<n):
                continue
            for i in range(len(words) - n + 1):
                ngram = tuple(words[i:i+n])
                all_ngrams.add(ngram)
        vocab = list(all_ngrams)
    
    vocab_size = len(vocab)
    ngram_to_idx = {ngram: idx for idx, ngram in enumerate(vocab)}
    
    # 创建向量
    vectors = []
    for phrase in phrases:
        words = tokenize(phrase)
        vector = np.zeros(vocab_size)
        for i in range(len(words) - n + 1):
            ngram = tuple(words[i:i+n])
            if ngram in ngram_to_idx:
                vector[ngram_to_idx[ngram]] += 1
        vectors.append(vector)
    return np.array(vectors), vocab
    '''

'\n# N-gram向量化（以二元组为例）\ndef build_ngram_vectors(phrases, vocab=None, n=3):\n    # 构建N-gram词汇表\n    if vocab is None:\n        all_ngrams = set()\n        for phrase in phrases:\n            words = tokenize(phrase)\n            if(len(words)<n):\n                continue\n            for i in range(len(words) - n + 1):\n                ngram = tuple(words[i:i+n])\n                all_ngrams.add(ngram)\n        vocab = list(all_ngrams)\n    \n    vocab_size = len(vocab)\n    ngram_to_idx = {ngram: idx for idx, ngram in enumerate(vocab)}\n    \n    # 创建向量\n    vectors = []\n    for phrase in phrases:\n        words = tokenize(phrase)\n        vector = np.zeros(vocab_size)\n        for i in range(len(words) - n + 1):\n            ngram = tuple(words[i:i+n])\n            if ngram in ngram_to_idx:\n                vector[ngram_to_idx[ngram]] += 1\n        vectors.append(vector)\n    return np.array(vectors), vocab\n    '

In [7]:
# N-gram vectorisation (sparse matrix)
def _ngrams_up_to(words, max_n):
    """Yield every n-gram of ``words`` as a tuple, for n = 1 .. max_n, shortest n first."""
    for n in range(1, max_n + 1):
        # The range is empty when the phrase has fewer than n words.
        for start in range(len(words) - n + 1):
            yield tuple(words[start:start + n])


def build_ngram_vectors(phrases, vocab=None, max_n=2):
    """Turn phrases into a sparse count matrix over all 1..max_n-grams.

    Args:
        phrases: Sized iterable of strings (``len(phrases)`` must work); each
            phrase is split into words with ``tokenize``.
        vocab: Optional list of n-gram tuples fixing the column order. When
            ``None`` the vocabulary is built from ``phrases`` and sorted.
            N-grams that are not in the vocabulary are ignored.
        max_n: Largest n-gram length to extract; every length from 1 up to
            and including ``max_n`` contributes features.

    Returns:
        A ``(vectors, vocab)`` tuple. ``vectors`` is a
        ``scipy.sparse.csr_matrix`` of shape ``(len(phrases), len(vocab))``
        and dtype ``float64`` whose entry ``[i, j]`` counts how often
        ``vocab[j]`` occurs in phrase ``i``. ``vocab`` is the list of n-gram
        tuples that defines the columns (the argument itself when one was
        passed in).
    """
    # Extract the n-grams once; they serve vocabulary building and counting.
    phrase_ngrams = [list(_ngrams_up_to(tokenize(phrase), max_n)) for phrase in phrases]

    if vocab is None:
        # Sort so the column order does not depend on set iteration order,
        # which changes between interpreter runs because str hashes are
        # randomised. A stable order keeps the seeded training reproducible.
        vocab = sorted({ngram for ngrams in phrase_ngrams for ngram in ngrams})

    vocab_size = len(vocab)
    ngram_to_idx = {ngram: idx for idx, ngram in enumerate(vocab)}

    row = []
    col = []
    for i, ngrams in enumerate(phrase_ngrams):
        for ngram in ngrams:
            idx = ngram_to_idx.get(ngram)
            if idx is not None:
                row.append(i)
                col.append(idx)

    # One unit entry per occurrence: repeated (row, col) pairs are summed when
    # the CSR matrix is assembled, which yields the per-phrase counts.
    data = np.ones(len(row), dtype=np.float64)
    vectors = sparse.csr_matrix((data, (row, col)), shape=(len(phrases), vocab_size), dtype=np.float64)
    return vectors, vocab

In [8]:
# Softmax function
def softmax(z):
    """Apply a numerically stable softmax along axis 1.

    Args:
        z: 2-D array of scores with one row per sample.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    # Subtracting each row's maximum leaves the result unchanged but keeps
    # the exponentials from overflowing.
    row_max = np.max(z, axis=1, keepdims=True)
    numerators = np.exp(z - row_max)
    denominators = numerators.sum(axis=1, keepdims=True)
    return numerators / denominators


# Softmax (multinomial logistic) regression training
def train_softmax_regression(X, y, num_classes=5, learning_rate=0.1, epochs=100):
    """Fit a softmax regression classifier with full-batch gradient descent.

    Args:
        X: Feature matrix of shape ``(num_samples, num_features)``. It must
            provide ``.shape`` and support ``X @ W`` and ``X.T @ dZ``.
        y: Integer class indices, one per row of ``X``; each is used as a
            column index into a ``(num_samples, num_classes)`` one-hot matrix.
        num_classes: Number of output classes.
        learning_rate: Step size of every gradient-descent update.
        epochs: Number of full-batch updates to perform.

    Returns:
        ``(W, b)``: the weight matrix of shape ``(num_features, num_classes)``
        and the bias vector of shape ``(num_classes,)``.

    Side effects:
        Reseeds NumPy's global random generator with 42 and prints the
        cross-entropy loss on every 10th epoch.
    """
    num_samples, num_features = X.shape
    np.random.seed(42)
    W = np.random.randn(num_features, num_classes) * 0.01
    b = np.zeros(num_classes)

    # The one-hot targets never change, so build them once instead of
    # re-allocating a (num_samples, num_classes) array on every epoch.
    y_onehot = np.zeros((num_samples, num_classes))
    y_onehot[np.arange(num_samples), np.asarray(y)] = 1

    for epoch in range(epochs):
        # Forward pass
        P = softmax(X @ W + b)

        # The loss is only reported, never used by the update, so it is
        # computed only on the epochs that print it. It uses the
        # probabilities from before this epoch's parameter update.
        if epoch % 10 == 0:
            # 1e-9 guards against log(0).
            loss = -np.mean(np.sum(y_onehot * np.log(P + 1e-9), axis=1))
            print(f"Epoch {epoch}, Loss: {loss:.4f}")

        # Backward pass: gradient of the mean cross-entropy w.r.t. the logits
        dZ = P - y_onehot
        dW = X.T @ dZ / num_samples
        db = np.sum(dZ, axis=0) / num_samples

        # Parameter update
        W -= learning_rate * dW
        b -= learning_rate * db

    return W, b

# Prediction
def predict(X, W, b):
    """Return the most probable class index for every row of ``X``.

    Args:
        X: Feature matrix with one row per sample.
        W: Weight matrix, combined with ``X`` as ``X @ W``.
        b: Bias vector added to every row of ``X @ W``.

    Returns:
        1-D array holding, per row, the index of the largest softmax
        probability.
    """
    probabilities = softmax(X @ W + b)
    return np.argmax(probabilities, axis=1)

In [9]:
# Bag-of-Words features: the vocabulary is built from the training phrases only
train_bow, bow_vocab = build_bow_vectors(train_phrases)
# Reuse the training vocabulary so the dev columns line up with train_bow.
# The call hands the same vocabulary back, so it is bound to bow_vocab again
# rather than to a name that belongs to the n-gram experiment.
dev_bow, bow_vocab = build_bow_vectors(dev_phrases, vocab=bow_vocab)

# Train the BoW model and evaluate it on the dev split
W_bow, b_bow = train_softmax_regression(train_bow, train_sentiments, epochs=4000)
pred_bow = predict(dev_bow, W_bow, b_bow)
print(pred_bow)
# Accuracy = fraction of dev phrases whose predicted class equals the label.
accuracy_bow = np.mean(pred_bow == dev_sentiments)
print(f"Bag-of-Words Accuracy: {accuracy_bow:.4f}")

Epoch 0, Loss: 1.6107
Epoch 10, Loss: 1.5329
Epoch 20, Loss: 1.5085
Epoch 30, Loss: 1.4954
Epoch 40, Loss: 1.4860
Epoch 50, Loss: 1.4784
Epoch 60, Loss: 1.4717
Epoch 70, Loss: 1.4657
Epoch 80, Loss: 1.4602
Epoch 90, Loss: 1.4552
Epoch 100, Loss: 1.4504
Epoch 110, Loss: 1.4459
Epoch 120, Loss: 1.4417
Epoch 130, Loss: 1.4377
Epoch 140, Loss: 1.4338
Epoch 150, Loss: 1.4302
Epoch 160, Loss: 1.4266
Epoch 170, Loss: 1.4232
Epoch 180, Loss: 1.4199
Epoch 190, Loss: 1.4168
Epoch 200, Loss: 1.4137
Epoch 210, Loss: 1.4107
Epoch 220, Loss: 1.4078
Epoch 230, Loss: 1.4050
Epoch 240, Loss: 1.4023
Epoch 250, Loss: 1.3996
Epoch 260, Loss: 1.3970
Epoch 270, Loss: 1.3945
Epoch 280, Loss: 1.3920
Epoch 290, Loss: 1.3895
Epoch 300, Loss: 1.3871
Epoch 310, Loss: 1.3848
Epoch 320, Loss: 1.3825
Epoch 330, Loss: 1.3802
Epoch 340, Loss: 1.3780
Epoch 350, Loss: 1.3758
Epoch 360, Loss: 1.3736
Epoch 370, Loss: 1.3715
Epoch 380, Loss: 1.3694
Epoch 390, Loss: 1.3674
Epoch 400, Loss: 1.3653
Epoch 410, Loss: 1.3633
Epo

In [10]:
# N-gram features: with max_n=3 every 1-, 2- and 3-gram becomes a column.
# The vocabulary is built from the training phrases only.
train_ngram, ngram_vocab = build_ngram_vectors(
    train_phrases,
    max_n=3,
)
# Reuse the training vocabulary so the dev columns line up with train_ngram.
dev_ngram, ngram_vocab = build_ngram_vectors(
    dev_phrases,
    vocab=ngram_vocab,
    max_n=3,
)

# Train the n-gram model and evaluate it on the dev split
W_ngram, b_ngram = train_softmax_regression(
    train_ngram,
    train_sentiments,
    epochs=700,
)
pred_ngram = predict(dev_ngram, W_ngram, b_ngram)
print(pred_ngram)
# Accuracy = fraction of dev phrases whose predicted class equals the label.
accuracy_ngram = np.mean(pred_ngram == dev_sentiments)
print(f"N-gram Accuracy: {accuracy_ngram:.4f}")

Epoch 0, Loss: 1.6109
Epoch 10, Loss: 1.5297
Epoch 20, Loss: 1.5021
Epoch 30, Loss: 1.4857
Epoch 40, Loss: 1.4730
Epoch 50, Loss: 1.4621
Epoch 60, Loss: 1.4522
Epoch 70, Loss: 1.4431
Epoch 80, Loss: 1.4345
Epoch 90, Loss: 1.4264
Epoch 100, Loss: 1.4187
Epoch 110, Loss: 1.4112
Epoch 120, Loss: 1.4041
Epoch 130, Loss: 1.3972
Epoch 140, Loss: 1.3906
Epoch 150, Loss: 1.3841
Epoch 160, Loss: 1.3779
Epoch 170, Loss: 1.3718
Epoch 180, Loss: 1.3658
Epoch 190, Loss: 1.3600
Epoch 200, Loss: 1.3543
Epoch 210, Loss: 1.3488
Epoch 220, Loss: 1.3433
Epoch 230, Loss: 1.3379
Epoch 240, Loss: 1.3327
Epoch 250, Loss: 1.3275
Epoch 260, Loss: 1.3224
Epoch 270, Loss: 1.3174
Epoch 280, Loss: 1.3125
Epoch 290, Loss: 1.3077
Epoch 300, Loss: 1.3029
Epoch 310, Loss: 1.2982
Epoch 320, Loss: 1.2935
Epoch 330, Loss: 1.2889
Epoch 340, Loss: 1.2844
Epoch 350, Loss: 1.2799
Epoch 360, Loss: 1.2755
Epoch 370, Loss: 1.2711
Epoch 380, Loss: 1.2668
Epoch 390, Loss: 1.2625
Epoch 400, Loss: 1.2583
Epoch 410, Loss: 1.2541
Epo