In [1]:
import pandas as pd
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader

In [3]:
# Setting seed for reproducibility (seeds torch's global RNG, which drives
# parameter initialisation and DataLoader shuffling)
torch.manual_seed(42)

# Setting device: prefer CUDA, then Apple-silicon MPS, and fall back to CPU.
# The getattr guard keeps this working on torch builds that predate the MPS backend.
_mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():  # for nvidia GPUs etc.
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():  # for Apple-silicon GPUs
    device = torch.device('mps')
else:
    device = torch.device('cpu')

device

device(type='mps')

# Prepare data

In [4]:
# Build a long-format cue/response table: one row per (cue, resp) pair.
response_cols = ['R1', 'R2', 'R3']
swow = pd.read_csv('../../data/free_assoc/SWOW-EN.R100.20180827.csv', usecols=['cue'] + response_cols)

# Stack the three response columns under a single 'resp' column; the
# 'variable' column (which response slot a row came from) is not needed.
swow = swow.melt(id_vars='cue', value_vars=response_cols, value_name='resp')
swow = swow.drop(columns=['variable'])

# Drop rows with a missing value and make every remaining entry a string.
swow = swow.dropna(axis=0).astype(str)

# Shuffle the rows with a fixed seed and renumber them from zero.
swow = swow.sample(frac=1, random_state=42).reset_index(drop=True)
swow

Unnamed: 0,cue,resp
0,control,repression
1,versus,opposition
2,fourteen,teen
3,reddish,hat
4,clarify,statement
...,...,...
68062,splendid,grandmother
68063,engine,fire engine
68064,assurance,confidence
68065,water,wave


In [5]:
# Dropping resps with <5 occurrences
print(len(swow.resp.unique()))  # distinct responses before filtering
resp_counts = swow.resp.value_counts().to_dict()

# Look up each row's response frequency via the dict (no per-row Python lambda)
# and keep rows whose response occurs at least 5 times.
# .copy() makes the result an independent frame, so later column assignments
# on `swow` do not act on a slice of the unfiltered data (SettingWithCopy).
swow = swow[swow.resp.map(resp_counts) >= 5].copy()
print(len(swow.resp.unique()))  # distinct responses after filtering

16548
3048


In [None]:
class SWOWDat(Dataset):
    """Map-style dataset of (cue index, response index) pairs.

    Parameters
    ----------
    swow : pandas.DataFrame
        Frame with a 'cue' and a 'resp' column, one association per row.
        The frame is only read; it is never modified.

    Attributes
    ----------
    cue_idxs, resp_idxs : dict
        Word -> integer id, numbered in order of first appearance in `swow`.
    n_cues, n_resps : int
        Number of distinct cues / responses.
    x, y : torch.Tensor
        Integer (torch.long) cue ids and response ids, aligned row-for-row
        with `swow`.
    """

    def __init__(self, swow):

        # Converting words to indices
        self.cue_idxs = {cue: idx for idx, cue in enumerate(swow['cue'].unique())}
        self.resp_idxs = {resp: idx for idx, resp in enumerate(swow['resp'].unique())}
        self.n_cues, self.n_resps = len(self.cue_idxs), len(self.resp_idxs)

        # Encode into fresh arrays rather than writing the ids back into `swow`:
        # overwriting the caller's columns would destroy the words, so building
        # a second dataset from the same frame would yield integer-keyed vocabularies.
        cue_codes = swow['cue'].map(self.cue_idxs).to_numpy(dtype=int)
        resp_codes = swow['resp'].map(self.resp_idxs).to_numpy(dtype=int)

        # Pin the dtype to long: class-index targets for CrossEntropyLoss must be
        # int64, and the platform default for `int` arrays is not always 64-bit.
        self.x = torch.tensor(cue_codes, dtype=torch.long)
        self.y = torch.tensor(resp_codes, dtype=torch.long)

    def __len__(self):
        """Return the number of (cue, resp) pairs."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the (cue id, response id) tensors at position `idx`."""
        return self.x[idx], self.y[idx] # CrossEntropyLoss is more efficient with target with class indices


# Count of distinct response words in the cue/response frame.
n_resps = len(swow['resp'].unique())

# Wrap the pairs in a Dataset and serve them in shuffled mini-batches of 64.
swow_dat = SWOWDat(swow)
swow_dataloader = DataLoader(dataset=swow_dat, batch_size=64, shuffle=True)

# Training

In [None]:
class Word2Vec(torch.nn.Module):
    """Cue-embedding lookup followed by a bias-free linear map to response logits.

    Parameters
    ----------
    n_cues : int
        Number of rows in the cue embedding table.
    n_resps : int
        Number of output logits (one per response).
    n_dims : int
        Width of the embedding vectors.
    """

    def __init__(self, n_cues, n_resps, n_dims):
        super().__init__()
        # Input side: one n_dims-wide vector per cue id.
        self.cue_embeds = torch.nn.Embedding(num_embeddings=n_cues, embedding_dim=n_dims)
        # Output side: weight matrix of shape (n_resps, n_dims), no bias term.
        self.resp_embeds = torch.nn.Linear(in_features=n_dims, out_features=n_resps, bias=False)

    def forward(self, x):
        """Return unnormalised response scores for the cue ids in `x`."""
        return self.resp_embeds(self.cue_embeds(x))

# Build the model with 300-dimensional embeddings, sized from the dataset's
# vocabularies, and move it onto the selected device.
w2v = Word2Vec(n_cues=swow_dat.n_cues, n_resps=swow_dat.n_resps, n_dims=300)
w2v = w2v.to(device)
print(w2v)

In [None]:
# Softmax cross-entropy over the response logits; label smoothing is off.
loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=0.0)

# Adam over every model parameter, using the optimizer's default settings.
optimizer = torch.optim.Adam(params=w2v.parameters())

def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one pass over `dataloader`, updating `model` after every batch.

    Parameters
    ----------
    dataloader : iterable yielding (inputs, targets) batches; must expose
        `.dataset` and support `len()`.
    model : callable mapping a batch of inputs to predictions.
    loss_fn : callable taking (predictions, targets) and returning a scalar loss.
    optimizer : optimizer holding the model's parameters.

    Batches are moved to the module-level `device` before the forward pass.
    The current loss is printed on every 10000th batch (including batch 0).
    Returns None.
    """
    size = len(dataloader.dataset)
    batches = tqdm(enumerate(dataloader), total=len(dataloader))

    for step, (inputs, targets) in batches:
        # Forward pass on the selected device
        inputs = inputs.to(device)
        targets = targets.to(device)
        batch_loss = loss_fn(model(inputs), targets)

        # Backpropagation: clear stale gradients, compute new ones, update weights
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Periodic progress report
        if step % 10000 != 0:
            continue
        loss, current = batch_loss.item(), step * len(inputs)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

# Number of full passes over the cue/response pairs.
epochs = 10

for t in range(epochs):
    # Banner separating each epoch's progress bar and loss printout.
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(dataloader=swow_dataloader, model=w2v, loss_fn=loss_fn, optimizer=optimizer)

In [None]:
# Input embeddings: rows of the cue embedding table, labelled by cue word.
# The vocabulary dicts were built in id order, so their key order matches the row order.
cue_words = list(swow_dat.cue_idxs.keys())
input_embeds = pd.DataFrame(w2v.cue_embeds.weight.detach().cpu().numpy(), index=cue_words)

# Output embeddings: rows of the linear layer's weight matrix, labelled by response word.
resp_words = list(swow_dat.resp_idxs.keys())
output_embeds = pd.DataFrame(w2v.resp_embeds.weight.detach().cpu().numpy(), index=resp_words)

# Keep only words that appear in the index of the psychNorms table.
psych_norms = pd.read_csv('../../data/psychNorms/psychNorms.zip', index_col=0, low_memory=False, compression='zip')
to_pull = set(psych_norms.index)

keep_inputs = input_embeds.index.isin(to_pull)
keep_outputs = output_embeds.index.isin(to_pull)
input_embeds = input_embeds.loc[keep_inputs].astype(float)
output_embeds = output_embeds.loc[keep_outputs].astype(float)

# Write both embedding tables to CSV.
input_embeds.to_csv('../../data/embeds/SGSoftMaxInput_SWOW.csv')
output_embeds.to_csv('../../data/embeds/SGSoftMaxOutput_SWOW.csv')