# TREC

In [11]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder

df=pd.read_csv("/Users/zhanglikang/Desktop/SC4002_G06/datasets/TREC/train.csv")
df["text"] = df["text"].str.lower()
print(len(df))
print(df.shape)
df.head()

5452
(5452, 3)


Unnamed: 0,label-coarse,label-fine,text
0,0,0,how did serfdom develop in and then leave russ...
1,1,1,what films featured the character popeye doyle ?
2,0,0,how can i find a list of celebrities ' real na...
3,1,2,what fowl grabs the spotlight after the chines...
4,2,3,what is the full form of .com ?


In [2]:
from sklearn.model_selection import train_test_split

# Split the data to create a development set of 500 examples
train_data, dev_data = train_test_split(df, test_size=500, random_state=42)

# Display the size of the training and development sets
len(train_data), len(dev_data)

(4952, 500)

In [15]:
# Get unique coarse labels
unique_labels = train_data['label-coarse'].unique()

# Randomly select 4 classes
selected_labels = np.random.choice(unique_labels, size=4, replace=False)

# Update the labels in the train and dev sets
train_data = train_data.copy()
dev_data = dev_data.copy()

train_data['new_label'] = train_data['label-coarse'].apply(lambda x: x if x in selected_labels else 'OTHERS')
dev_data['new_label'] = dev_data['label-coarse'].apply(lambda x: x if x in selected_labels else 'OTHERS')

# Display the unique labels in the updated training set
train_data['new_label'].unique()


array(['OTHERS', 0, 5, 4, 3], dtype=object)

In [24]:
train_data['new_label'] = train_data['new_label'].astype(str)
dev_data['new_label'] = dev_data['new_label'].astype(str)

In [10]:
import gensim.downloader

print(list(gensim.downloader.info()['models'].keys()))

# Download the "glove-twitter-25" embeddings
glove_vectors = gensim.downloader.load('word2vec-google-news-300')

# retrieve the vector for 'computer'
# glove_vectors['computer'] 

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [16]:
train_data.head()

Unnamed: 0,label-coarse,label-fine,text,new_label
4943,2,34,what is mikhail gorbachev 's middle initial ?,OTHERS
2346,0,0,how does the tail affect the flight of a kite ?,0
1835,5,21,what were the first three cities to have a pop...,5
4047,1,1,what is the movie jonathan livingstone seagull ?,OTHERS
5097,1,23,what is a fear of home surroundings ?,OTHERS


In [19]:
dev_data.head()

Unnamed: 0,label-coarse,label-fine,text,new_label
3408,5,21,what city is served by tempelhol airport ?,5
371,1,2,what is dudley do-right 's horse 's name ?,OTHERS
453,0,9,what 's nature 's purpose for tornadoes ?,0
290,0,12,what is the history of valentine 's day cards ?,0
4457,3,4,what president became chief justice after his ...,3



## Use a simple linear layer + Averaging all word vectors


In [22]:
from sklearn.preprocessing import LabelEncoder

In [29]:
# Convert text to input vectors using glove_vectors
def text_to_vectors(text, glove_vectors):
    return [glove_vectors[w] for w in text.split() if w in glove_vectors]

X_train = np.array([text_to_vector(text, glove_vectors) for text in train_data['text']])
X_dev = np.array([text_to_vector(text, glove_vectors) for text in dev_data['text']])

# Convert labels to integers
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['new_label'])
y_dev = label_encoder.transform(dev_data['new_label'])

X_train, y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.int64)
X_dev, y_dev = torch.tensor(X_dev, dtype=torch.float32), torch.tensor(y_dev, dtype=torch.int64)


In [40]:
X_train.size()

torch.Size([4952, 300])

In [46]:
class QuestionClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(QuestionClassifier, self).__init__()
        # Hidden Layer
        self.hidden = nn.Linear(input_dim, hidden_dim)
        # Output Layer
        self.output = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

#     def forward(self, x):
#         # List to hold the outputs of the hidden layer for each word in a question
#         outputs = []
        
#         # Process each word through the hidden layer
#         for word_vec in x:
#             word_vec = torch.tensor(word_vec, dtype=torch.float32)
#             hidden_output = self.hidden(word_vec)
#             outputs.append(hidden_output)
        
#         # Aggregation Layer: Averaging outputs of the hidden layer
#         avg_output = torch.mean(torch.stack(outputs), dim=0)
        
#         # Output Layer
#         final_output = self.output(avg_output)
#         return self.softmax(final_output)
    
    
    def forward(self, x):
    # List to hold the outputs of the hidden layer for each word in a question
        outputs = []

        # Process each word through the hidden layer
        for word_vec in x:

            print(word_vec.shape)

            word_vec = torch.tensor(word_vec, dtype=torch.float32).view(1, -1)  # Reshape the word vector
            hidden_output = self.hidden(word_vec)
            outputs.append(hidden_output)

        # Aggregation Layer: Averaging outputs of the hidden layer
        avg_output = torch.mean(torch.stack(outputs), dim=0).squeeze()  # Make sure the tensor shape is consistent

        # Output Layer
        final_output = self.output(avg_output)
        return self.softmax(final_output)


input_dim = 300  # 300 for 'word2vec-google-news-300'
hidden_dim = 100
output_dim = len(train_data['new_label'].unique())
model = QuestionClassifier(input_dim, hidden_dim, output_dim)


In [47]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.NLLLoss()
epochs = 100

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    
    total_loss = 0
    for i, question in enumerate(X_train):
        predictions = model(question)
        loss = criterion(predictions.unsqueeze(0), y_train[i].unsqueeze(0))
        total_loss += loss.item()
        loss.backward(retain_graph=True)
    
    optimizer.step()
    
    # Evaluate on development set
    model.eval()
    total_dev_loss = 0
    correct = 0
    with torch.no_grad():
        for i, question in enumerate(X_dev):
            dev_predictions = model(question)
            dev_loss = criterion(dev_predictions.unsqueeze(0), y_dev[i].unsqueeze(0))
            total_dev_loss += dev_loss.item()
            
            _, predicted = torch.max(dev_predictions.data, 0)
            correct += (predicted == y_dev[i]).item()

    dev_accuracy = correct / len(y_dev)
    
    print(f"Epoch {epoch+1}/{epochs} => "
          f"Train Loss: {total_loss/len(X_train):.4f}, "
          f"Dev Loss: {total_dev_loss/len(X_dev):.4f}, "
          f"Dev Accuracy: {dev_accuracy:.4f}")

print("Training complete!")


torch.Size([])


  word_vec = torch.tensor(word_vec, dtype=torch.float32).view(1, -1)  # Reshape the word vector


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1 and 300x100)

In [None]:
# Function to get the average word vector for a sentence
def sentence_vector(sentence):
    words = sentence.split()
    # Split the sentence into words.

    vectors = [glove_vectors[word] for word in words if word in glove_vectors]
    # Get the word vector for each word in the sentence if it exists in glove_vectors.


    if len(vectors) == 0: # to avoid empty lists
        return np.zeros(300)
    # If no words from the sentence are in the word vectors, return a vector of zeros.

    return np.mean(vectors, axis=0)
# Return the average word vector for the sentence.

train_data['avg_vector'] = train_data['text'].apply(sentence_vector)
# Compute the average word vector for each sentence in the training data.

X_train = np.vstack(train_data['avg_vector'].values)
# Stack the average vectors to form the training data.

y_train = pd.get_dummies(train_data['new_label']).values
# Convert the new labels to one-hot encoded vectors.

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
# Convert the training data to a PyTorch tensor.

y_train_tensor = torch.tensor(np.argmax(y_train, axis=1), dtype=torch.int64)
# Convert the one-hot encoded labels to their corresponding class indices.


In [None]:
# Neural Network Model with a simple linear layer
class SimpleClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SimpleClassifier, self).__init__()
        # Initialize the parent class.
        
        self.fc = nn.Linear(input_dim, output_dim)
        # Define a fully connected layer.

    def forward(self, x):
        return self.fc(x)
    # Define the forward pass to return the output of the linear layer.

input_dim = 300  # as we're using word2vec-google-news-300
# Define the input dimension based on the word vector size.

output_dim = 5  # for our 5 new classes
# Define the output dimension based on the number of new classes.

model = SimpleClassifier(input_dim, output_dim)
# Initialize the model.

criterion = nn.CrossEntropyLoss()
# Define the loss function (cross entropy).

optimizer = optim.Adam(model.parameters(), lr=0.001)
# Define the optimizer (Adam) with a learning rate.

# Training
epochs = 5000
# Set the number of epochs.

for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    _, predicted = torch.max(outputs, 1)
    accuracy = (predicted == y_train_tensor).float().mean().item()
  


    # Store metrics for visualization
    all_metrics['A']['epochs'].append(epoch)
    all_metrics['A']['accuracy'].append(accuracy)
    all_metrics['A']['loss'].append(loss.item())

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}, Accuracy: {accuracy}")
    # Print the loss and accuracy for each epoch.


In [None]:
visualize_metrics(all_metrics['A'])

## Use a feedforward network which is a combination of a linear transformation and a nonlinear activation function

## Max pooling over the word vectors

In [None]:
def sentence_vector(sentence):
    words = sentence.split()
    vectors = [glove_vectors[word] for word in words if word in glove_vectors]
    if len(vectors) == 0: # if no words in the sentence have embeddings
        return np.zeros(300)
    return np.max(vectors, axis=0) # max pooling across the words

train_data['maxpooled_vector'] = train_data['text'].apply(sentence_vector)

X_train = np.vstack(train_data['maxpooled_vector'].values)
y_train = pd.get_dummies(train_data['new_label']).values

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(np.argmax(y_train, axis=1), dtype=torch.int64)

In [None]:
train_data.head()

In [None]:
# Define the neural network model with feedforward layers
class FeedForwardClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedForwardClassifier, self).__init__()
        
        # First linear layer
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        
        # Second linear layer that outputs class probabilities
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        # Apply first linear transformation
        x = self.fc1(x)
        
        # Apply ReLU activation function
        x = nn.ReLU()(x)
        
        # Apply second linear transformation
        return self.fc2(x)

input_dim = 300  # as we're using word2vec-google-news-300
hidden_dim = 1000  # can be adjusted based on performance
output_dim = 5   # for our 5 new classes

model = FeedForwardClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 100

for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    _, predicted = torch.max(outputs, 1)
    accuracy = (predicted == y_train_tensor).float().mean().item()

    # Store metrics for visualization
    all_metrics['B']['epochs'].append(epoch)
    all_metrics['B']['accuracy'].append(accuracy)
    all_metrics['B']['loss'].append(loss.item())
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}, Accuracy: {accuracy}")


In [None]:
visualize_metrics(all_metrics['B'])

##  Recurrent neural network
##  Aggregation Layer: Taking the representation of the last word (useful if using RNNs)

In [None]:
def sentence_matrix(sentence, max_len=30):
    words = sentence.split()[:max_len]  # truncate if necessary
    vectors = [glove_vectors[word] for word in words if word in glove_vectors]
    while len(vectors) < max_len:  # pad if necessary
        vectors.append(np.zeros(300))
    return np.array(vectors)

train_data['vector_matrix'] = train_data['text'].apply(sentence_matrix)

X_train = np.stack(train_data['vector_matrix'].values)
y_train = pd.get_dummies(train_data['new_label']).values

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(np.argmax(y_train, axis=1), dtype=torch.int64)

In [None]:
train_data.head()

In [None]:
# Define the neural network model with an RNN layer
class RNNClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(RNNClassifier, self).__init__()
        
        # RNN layer
        self.rnn = nn.RNN(input_dim, hidden_dim, batch_first=True)
        
        # Linear layer that outputs class probabilities
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        # Pass the input through the RNN layer
        out, _ = self.rnn(x)
        
        # Only take the output from the final timestep
        out = out[:, -1, :]
        
        # Pass the final output through the linear layer
        return self.fc(out)

input_dim = 300  # as we're using word2vec-google-news-300
hidden_dim = 100  # can be adjusted based on performance
output_dim = 5   # for our 5 new classes

model = RNNClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 100

for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    _, predicted = torch.max(outputs, 1)
    accuracy = (predicted == y_train_tensor).float().mean().item()

    # Store metrics for visualization
    all_metrics['C']['epochs'].append(epoch)
    all_metrics['C']['accuracy'].append(accuracy)
    all_metrics['C']['loss'].append(loss.item())
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}, Accuracy: {accuracy}")


In [None]:
visualize_metrics(all_metrics['C'])