In [8]:
#pytorch imports
import torch
import torch.optim as optim
import torch.nn as nn

#python standard
import time, os

#python data science
import numpy as np
import pandas as pd

#cs224u imports
import utils
import vsm

def devdf_generator(df, scoring=str) -> pd.DataFrame:
    '''
    Removes repeated pairs with distinct scores from dev_df based on scoring removal string.
    '''
    repeats = df.groupby(['word1', 'word2']).apply(lambda x: x.score.var())
    repeats = repeats[repeats > 0].sort_values(ascending=False)
    repeats.name = 'score variance'
    repeat_list = repeats.index.tolist()
    
    def repeat_words(first_word, second_word):
        return df[(df['word1']==first_word)&(df['word2']==second_word)]

    temp_df = df.copy()
    for pair in repeat_list:
        repeat = repeat_words(pair[0], pair[1])
        
        if scoring == 'highest':
            while len(repeat.index) > 1:
                min_score = repeat.score.min()
                index = repeat[repeat.score == min_score].index.values[0]
                temp_df = temp_df.drop(index=index, axis=0)
                repeat = repeat.drop(index=index, axis=0)
                
        elif scoring == 'lowest':
            while len(repeat.index) > 1:
                max_score = repeat.score.max()
                index = repeat[repeat.score == max_score].index.values[0]
                temp_df = temp_df.drop(index=index, axis=0)
                repeat = repeat.drop(index=index, axis=0)
                
        elif scoring == 'mean':
                mean_score = repeat.score.mean()
                index = repeat.index[0]
                drops = repeat.index[1:]
                temp_df.loc[index, 'score'] = mean_score
                temp_df = temp_df.drop(index=drops, axis=0)
                
    #check to see that variant pairs are dropped
    repeats = temp_df.groupby(['word1', 'word2']).apply(lambda x: x.score.var())
    repeats = repeats[repeats > 0.06].sort_values(ascending=False)
    repeats.name = 'score variance'
    answer = repeats[repeats > 0].sort_values(ascending=False)
    if len(answer) < 1:
        print('All problematic word pairs are removed')
    
    return temp_df


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class AE(nn.Module):
    def __init__(self, input_dim, hidden_dim, hidden_activation=nn.Tanh(), tol=0.001, verbose=True):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = input_dim
        self.hidden_activation = hidden_activation
        self.fc1 = nn.Linear(self.input_dim, self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim, 128)
        self.fc3 = nn.Linear(128, self.hidden_dim)
        self.fc4 = nn.Linear(self.hidden_dim, self.output_dim)
        self.loss = nn.MSELoss(reduction="mean")
        self.tol = tol
        self.verbose = verbose
        
    def forward(self, features):
        encode1 = self.fc1(features)
        act1 = self.hidden_activation(encode1)
        encode2 = self.fc2(act1)
        code = self.hidden_activation(encode2)
        decode = self.fc3(code)
        act3 = self.hidden_activation(decode)
        y = self.fc4(act3)
        return code, y


In [None]:
nn.Sequential(
            nn.Linear(self.input_dim, self.hidden_dim),
            self.hidden_activation,
            nn.Linear(self.hidden_dim, self.output_dim))

#### Preprocessing work

In [2]:
from vsm import observed_over_expected, pmi

VSM_HOME = os.path.join('data', 'vsmdata')
DATA_HOME = os.path.join('data', 'wordrelatedness')
giga5 = pd.read_csv(os.path.join(VSM_HOME, 'giga_window5-scaled.csv.gz'), index_col=0)
dev = pd.read_csv(os.path.join(DATA_HOME, "cs224u-wordrelatedness-dev.csv"))
highest = devdf_generator(dev, scoring='highest')

def run_ppmi_lsa_pipeline(count_df, k):
    #reweights count matrix with PPMI
    ppmi_df = pmi(count_df)
    
    #reduce dimensions to k
    lsa_df = vsm.lsa(ppmi_df, k=k)
    
    #evaluate matrices and return rho value
    return lsa_df


All problematic word pairs are removed


In [3]:
lsa_pmi_df = run_ppmi_lsa_pipeline(giga5, 200)

In [4]:
dataset = lsa_pmi_df.values.astype(np.float32)
N = dataset.shape[0]

In [9]:
model = AE(200,275).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fx = nn.MSELoss()

In [10]:
epochs = 200
batch_size = 256
frames = []

start = time.perf_counter()

for x in range(0,N,batch_size):
    
    subset = dataset[x:x+batch_size]
    print(f'Training on batch {x} - {x+batch_size}')
    train_loader = torch.utils.data.DataLoader(subset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
    
    for epoch in range(epochs):

        loss = 0
        previous_loss = 100
        
        for batch in train_loader:

            # load tensors to GPU device, size of feature matrix is [batch_size, hidden_dim]
            batch_features = batch.to(device)

            # reset the gradients to zero
            optimizer.zero_grad()

            # grab hidden layer for conversion to df and output for loss calculations
            code, outputs = model(batch_features)
            
            # compute training loss
            train_loss = loss_fx(outputs, batch_features)
                
            # compute gradients
            train_loss.backward()

            # update weights
            optimizer.step()

            # add training loss to epoch loss
            loss += train_loss.item()

        # compute the epoch training loss
        loss = loss / len(train_loader)
        
        #early stop training if tolerance is reached
        if loss < previous_loss:
            if abs(previous_loss - loss) < model.tol:
                break
            else:
                previous_loss = loss
                
        # display the epoch training loss
        if model.verbose:
            if epoch % 100 == 0:
                print(f"epoch : {epoch}/{epochs}, loss = {loss:.6f}")
            
    with torch.no_grad():
        weights = code.cpu().numpy()
    frame = pd.DataFrame(weights)
    frames.append(frame)

end = time.perf_counter() - start
print(f'Total time for {epochs} epochs: {end/60} minutes')

Training on batch 0 - 256
epoch : 0/200, loss = 3.660520
epoch : 100/200, loss = 1.016900
Training on batch 256 - 512
epoch : 0/200, loss = 2.037327
epoch : 100/200, loss = 0.343519
Training on batch 512 - 768
epoch : 0/200, loss = 2.293734
epoch : 100/200, loss = 0.239240
Training on batch 768 - 1024
epoch : 0/200, loss = 2.246789
epoch : 100/200, loss = 0.171374
Training on batch 1024 - 1280
epoch : 0/200, loss = 2.116736
epoch : 100/200, loss = 0.147670
Training on batch 1280 - 1536
epoch : 0/200, loss = 2.148747
epoch : 100/200, loss = 0.142657
Training on batch 1536 - 1792
epoch : 0/200, loss = 2.135506
epoch : 100/200, loss = 0.129096
Training on batch 1792 - 2048
epoch : 0/200, loss = 2.011657
epoch : 100/200, loss = 0.122687
Training on batch 2048 - 2304
epoch : 0/200, loss = 2.304222
epoch : 100/200, loss = 0.128825
Training on batch 2304 - 2560
epoch : 0/200, loss = 2.128673
epoch : 100/200, loss = 0.123068
Training on batch 2560 - 2816
epoch : 0/200, loss = 1.975822
epoch : 

In [11]:
ae_df = pd.concat(frames)

ae_df.index = lsa_pmi_df.index

In [12]:
df, baseline_rho = vsm.word_relatedness_evaluation(highest, lsa_pmi_df)
df, output_rho = vsm.word_relatedness_evaluation(highest, ae_df)
if baseline_rho > output_rho:
    print('Your model sucks')
else:
    print('Good job')
print(baseline_rho, output_rho)

Your model sucks
0.6803743112316648 0.5934984669463605


In [13]:
output_rho

0.5934984669463605

In [85]:
test_df = pd.read_csv('data/wordrelatedness/cs224u-wordrelatedness-test-unlabeled.csv')

In [86]:
test_df

Unnamed: 0,word1,word2
0,abandon,frost
1,abandon,railroad
2,abortion,religion
3,abstract,candle
4,abstract,frog
...,...,...
1495,water,water
1496,ways,ways
1497,weather,winter
1498,wednesday,weekday


In [87]:
df = vsm.word_relatedness_evaluation(test_df, ae_df)

In [88]:
df

(          word1     word2  prediction
 0       abandon     frost   -0.994743
 1       abandon  railroad   -0.938083
 2      abortion  religion   -0.668551
 3      abstract    candle   -0.734666
 4      abstract      frog   -0.759742
 ...         ...       ...         ...
 1495      water     water    0.000000
 1496       ways      ways    0.000000
 1497    weather    winter   -0.472126
 1498  wednesday   weekday   -0.751630
 1499       word      word    0.000000
 
 [1500 rows x 3 columns],
 None)

In [91]:
def create_bakeoff_submission(
        vsm_df,
        distfunc,
        output_filename="cs224u-wordrelatedness-bakeoff-entry.csv"):

    test_df = pd.read_csv('data/wordrelatedness/cs224u-wordrelatedness-test-unlabeled.csv')

    pred_df, _ = vsm.word_relatedness_evaluation(test_df, vsm_df, distfunc=distfunc)

    pred_df.to_csv(output_filename)

In [92]:
create_bakeoff_submission(ae_df, vsm.cosine)

In [93]:
check = pd.read_csv('cs224u-wordrelatedness-bakeoff-entry.csv')

In [4]:
test = {'path':'somehwereonyourharddrive', 'caption':'This is a blue truck'}
test2 = test
hits = [test, test2]

In [10]:
list(hits[0].values())[0]

'somehwereonyourharddrive'

In [12]:
hits[0]['path']

'somehwereonyourharddrive'

In [13]:
hits

[{'path': 'somehwereonyourharddrive', 'caption': 'This is a blue truck'},
 {'path': 'somehwereonyourharddrive', 'caption': 'This is a blue truck'}]

In [15]:
hits[:9000]

[{'path': 'somehwereonyourharddrive', 'caption': 'This is a blue truck'},
 {'path': 'somehwereonyourharddrive', 'caption': 'This is a blue truck'}]