In [1]:
import pandas as pd
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader



In [2]:
# Setting seed for reproducibility
torch.manual_seed(42)

# Setting device 
if torch.cuda.is_available():  # for nvidia GPUs etc.
    device = torch.device('cuda')
elif torch.backends.mps.is_available(): # for Apple Metal Performance Sharder (mps) GPUs
    device = torch.device('mps')
else:
    device = torch.device('cpu')

device

device(type='mps')

# Prepare data

In [3]:
# Melting into cue-resp df
swow = pd.read_csv('../../data/embeds_train/SWOW-EN.R100.csv', usecols=['cue', 'R1', 'R2', 'R3'])
swow = (
    swow.melt(id_vars='cue', value_vars=['R1', 'R2', 'R3'], value_name='resp')
    .drop(columns=['variable']).dropna(axis=0).astype(str)
    .sample(frac=1, random_state=42).reset_index(drop=True)
)

swow

Unnamed: 0,cue,resp
0,control,repression
1,versus,opposition
2,fourteen,teen
3,reddish,hat
4,clarify,statement
...,...,...
3403393,facility,mental capacity
3403394,light,tunnel
3403395,Santa Claus,gifts
3403396,illusion,life


In [4]:
# Dropping resps with <5 occurrences
print(len(swow.resp.unique()))
resp_counts = swow.resp.value_counts().to_dict()
swow = swow[swow.resp.map(lambda x: resp_counts[x] >= 5)]
print(len(swow.resp.unique()))

134213
32312


In [5]:
class SWOWDat(Dataset):

    def __init__(self, swow):

        # Converting words to indices 
        self.cue_idxs = {cue: idx for idx, cue in enumerate(swow['cue'].unique())}
        self.resp_idxs = {resp: idx for idx, resp in enumerate(swow['resp'].unique())}
        self.n_cues, self.n_resps = len(self.cue_idxs), len(self.resp_idxs)
        swow['cue'] = swow['cue'].map(self.cue_idxs)
        swow['resp'] = swow['resp'].map(self.resp_idxs)

        self.x = torch.tensor(swow['cue'].to_numpy())
        self.y = torch.tensor(swow['resp'].to_numpy())

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx] # CrossEntropyLoss is more efficient with target with class indices


n_resps = len(swow['resp'].unique())
swow_dat = SWOWDat(swow)
swow_dataloader = DataLoader(swow_dat, batch_size=64, shuffle=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  swow['cue'] = swow['cue'].map(self.cue_idxs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  swow['resp'] = swow['resp'].map(self.resp_idxs)


# Training

In [7]:
class Word2Vec(torch.nn.Module):
    def __init__(self, n_cues, n_resps, n_dims):
        super(Word2Vec, self).__init__()
        self.cue_embeds = torch.nn.Embedding(n_cues, n_dims)
        self.resp_embeds = torch.nn.Linear(n_dims, n_resps, bias=False)

    def forward(self, x):
        cue_embed = self.cue_embeds(x)
        logits = self.resp_embeds(cue_embed)
        return logits

w2v = Word2Vec(swow_dat.n_cues, swow_dat.n_resps, 300).to(device)
print(w2v)

Word2Vec(
  (cue_embeds): Embedding(12281, 300)
  (resp_embeds): Linear(in_features=300, out_features=32312, bias=False)
)


In [8]:
loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=0.0)
optimizer = torch.optim.Adam(w2v.parameters())

def train_loop(dataloader, model, loss_fn, optimizer):
  size = len(dataloader.dataset)

  for batch_idx, (X, y) in tqdm(enumerate(dataloader), total=len(dataloader)):
    
    # Compute prediction and loss
    X, y = X.to(device), y.to(device)
    pred = model(X)
    loss = loss_fn(pred, y)

    # Backpropogation 
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if batch_idx % 10000 == 0:
        loss, current = loss.item(), batch_idx * len(X)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(swow_dataloader, w2v, loss_fn, optimizer)

Epoch 1
-------------------------------


  0%|          | 0/50956 [00:00<?, ?it/s]

loss: 10.460329  [    0/3261134]
loss: 6.803672  [640000/3261134]
loss: 7.478433  [1280000/3261134]
loss: 6.353588  [1920000/3261134]
loss: 6.180823  [2560000/3261134]
loss: 7.259861  [3200000/3261134]
Epoch 2
-------------------------------


  0%|          | 0/50956 [00:00<?, ?it/s]

loss: 5.688827  [    0/3261134]
loss: 6.911697  [640000/3261134]
loss: 5.778646  [1280000/3261134]
loss: 6.206161  [1920000/3261134]
loss: 5.476091  [2560000/3261134]
loss: 5.799956  [3200000/3261134]
Epoch 3
-------------------------------


  0%|          | 0/50956 [00:00<?, ?it/s]

loss: 5.844108  [    0/3261134]
loss: 5.442574  [640000/3261134]
loss: 6.094653  [1280000/3261134]
loss: 5.714053  [1920000/3261134]
loss: 6.747736  [2560000/3261134]
loss: 5.882218  [3200000/3261134]
Epoch 4
-------------------------------


  0%|          | 0/50956 [00:00<?, ?it/s]

loss: 5.180154  [    0/3261134]
loss: 5.671618  [640000/3261134]
loss: 6.022789  [1280000/3261134]
loss: 5.851384  [1920000/3261134]
loss: 6.210794  [2560000/3261134]
loss: 6.194073  [3200000/3261134]
Epoch 5
-------------------------------


  0%|          | 0/50956 [00:00<?, ?it/s]

loss: 4.980208  [    0/3261134]
loss: 5.800477  [640000/3261134]
loss: 5.624717  [1280000/3261134]
loss: 6.720362  [1920000/3261134]
loss: 5.718768  [2560000/3261134]
loss: 5.650105  [3200000/3261134]
Epoch 6
-------------------------------


  0%|          | 0/50956 [00:00<?, ?it/s]

loss: 5.465259  [    0/3261134]
loss: 5.136424  [640000/3261134]
loss: 5.732962  [1280000/3261134]
loss: 6.392437  [1920000/3261134]
loss: 5.784365  [2560000/3261134]
loss: 5.268448  [3200000/3261134]
Epoch 7
-------------------------------


  0%|          | 0/50956 [00:00<?, ?it/s]

loss: 5.423585  [    0/3261134]
loss: 5.774007  [640000/3261134]
loss: 5.412861  [1280000/3261134]
loss: 5.890086  [1920000/3261134]
loss: 5.946595  [2560000/3261134]
loss: 5.767954  [3200000/3261134]
Epoch 8
-------------------------------


  0%|          | 0/50956 [00:00<?, ?it/s]

loss: 5.019010  [    0/3261134]
loss: 5.495601  [640000/3261134]
loss: 5.391966  [1280000/3261134]
loss: 5.516641  [1920000/3261134]
loss: 5.411960  [2560000/3261134]
loss: 5.489806  [3200000/3261134]
Epoch 9
-------------------------------


  0%|          | 0/50956 [00:00<?, ?it/s]

loss: 5.842157  [    0/3261134]
loss: 5.501446  [640000/3261134]
loss: 5.690555  [1280000/3261134]
loss: 5.125530  [1920000/3261134]
loss: 5.378654  [2560000/3261134]
loss: 5.282742  [3200000/3261134]
Epoch 10
-------------------------------


  0%|          | 0/50956 [00:00<?, ?it/s]

loss: 5.284626  [    0/3261134]
loss: 5.856609  [640000/3261134]
loss: 4.927034  [1280000/3261134]
loss: 5.807247  [1920000/3261134]
loss: 5.725550  [2560000/3261134]
loss: 5.462958  [3200000/3261134]


In [9]:
# extract input embeddings
input_embeds = w2v.cue_embeds.weight.cpu().detach().numpy()
input_embeds = pd.DataFrame(input_embeds, index=swow_dat.cue_idxs.keys())

# extract output embeddings
output_embeds = w2v.resp_embeds.weight.cpu().detach().numpy()
output_embeds = pd.DataFrame(output_embeds, index=swow_dat.resp_idxs.keys())

# Subsetting to only the words in psychNorms norms
to_pull = set(
    pd.read_csv('../../data/psychNorms/psychNorms.zip', index_col=0, low_memory=False, compression='zip').index
)
input_embeds = input_embeds.loc[input_embeds.index.isin(to_pull)].astype(float)
output_embeds = output_embeds.loc[output_embeds.index.isin(to_pull)].astype(float)

# Saving the embeddings
input_embeds.to_csv('../../data/embeds/SGSoftMaxInput_SWOW.csv')
output_embeds.to_csv('../../data/embeds/SGSoftMaxOutput_SWOW.csv')