In [2]:
%matplotlib inline
from collections import defaultdict
import nltk
from nltk.corpus import wordnet as wn
import numpy as np
import os
import pandas as pd
import retrofitting
from retrofitting import Retrofitter
import utils
import vsm
import csv
import itertools
from scipy.stats import spearmanr
from IPython.display import display
from sklearn.decomposition import PCA

In [3]:
def get_reader_name(reader):
    """
    Derive a short display name from a dataset reader function.

    Parameters
    ----------
    reader : function
        A dataset reader. Only its `__name__` attribute is used.

    Returns
    -------
    str
        `reader.__name__` with every occurrence of "_reader" removed.

    """
    suffix = "_reader"
    # Splitting on the suffix and re-joining drops each occurrence of it.
    return "".join(reader.__name__.split(suffix))

In [4]:
def wordsim_dataset_reader(
        src_filename,
        header=False,
        delimiter=',',
        score_col_index=2):
    """
    Basic reader that works for all similarity datasets. They are
    all tabular-style releases where the first two columns give the
    words and a later column (`score_col_index`) gives the score.

    Parameters
    ----------
    src_filename : str
        Full path to the source file.

    header : bool
        Whether `src_filename` has a header row to skip.

    delimiter : str
        Field delimiter in `src_filename`.

    score_col_index : int
        Column containing the similarity scores. Default: 2

    Yields
    ------
    (str, str, float)
       (w1, w2, score) where `score` is the negative of the similarity
       score in the file so that we are intuitively aligned with our
       distance-based code. To align with our VSMs, all the words are
       stripped of surrounding whitespace and downcased.

    """
    # `newline=''` is what the csv module asks for when it is handed a
    # file object; the explicit encoding keeps reads independent of the
    # platform's locale default.
    with open(src_filename, encoding='utf8', newline='') as f:
        reader = csv.reader(f, delimiter=delimiter)
        if header:
            # The default of None avoids StopIteration on an empty file.
            next(reader, None)
        for row in reader:
            # csv.reader yields [] for blank lines (e.g. a trailing
            # newline); skip them instead of failing on row[1].
            if not row:
                continue
            w1 = row[0].strip().lower()
            w2 = row[1].strip().lower()
            # Negative of scores to align intuitively with distance functions:
            score = -float(row[score_col_index])
            yield (w1, w2, score)

def wordsim353_reader():
    """
    Reader for WordSim-353
    (http://www.gabrilovich.com/resources/data/wordsim353/).

    Returns the generator built by `wordsim_dataset_reader` for
    'combined.csv' in the 'wordsim353' folder under `WORDSIM_HOME`,
    with the header row skipped.
    """
    path = os.path.join(WORDSIM_HOME, 'wordsim353', 'combined.csv')
    return wordsim_dataset_reader(path, header=True)

def mturk771_reader():
    """
    Reader for MTURK-771 (http://www2.mta.ac.il/~gideon/mturk771.html).

    Returns the generator built by `wordsim_dataset_reader` for
    'MTURK-771.csv' directly under `WORDSIM_HOME`; the file is read
    without skipping a header row.
    """
    path = os.path.join(WORDSIM_HOME, 'MTURK-771.csv')
    return wordsim_dataset_reader(path, header=False)

def simverb3500dev_reader():
    """
    Reader for the 'SimVerb-500-dev.txt' file of SimVerb-3500
    (https://www.aclweb.org/anthology/D16-1235/).

    The file is read as tab-separated with no header row, taking the
    score from column index 3.
    """
    path = os.path.join(WORDSIM_HOME, 'SimVerb-3500', 'SimVerb-500-dev.txt')
    return wordsim_dataset_reader(
        path, header=False, delimiter="\t", score_col_index=3)

def simverb3500test_reader():
    """
    Reader for the 'SimVerb-3000-test.txt' file of SimVerb-3500
    (https://www.aclweb.org/anthology/D16-1235/).

    The file is read as tab-separated with no header row, taking the
    score from column index 3.
    """
    path = os.path.join(WORDSIM_HOME, 'SimVerb-3500', 'SimVerb-3000-test.txt')
    return wordsim_dataset_reader(
        path, header=False, delimiter="\t", score_col_index=3)

def men_reader():
    """
    Reader for MEN (https://staff.fnwi.uva.nl/e.bruni/MEN).

    Returns the generator built by `wordsim_dataset_reader` for
    'MEN_dataset_natural_form_full' in the 'MEN' folder under
    `WORDSIM_HOME`, read as space-separated with no header row.
    """
    path = os.path.join(WORDSIM_HOME, 'MEN', 'MEN_dataset_natural_form_full')
    return wordsim_dataset_reader(path, delimiter=' ', header=False)

In [5]:
# Every word-similarity dataset reader defined above; this is the default
# set (and order) used by `full_word_similarity_evaluation`.
READERS = (
    wordsim353_reader,
    mturk771_reader,
    simverb3500dev_reader,
    simverb3500test_reader,
    men_reader,
)

In [6]:
def word_similarity_evaluation(reader, df, distfunc=vsm.cosine):
    """
    Word-similarity evaluation framework.

    Parameters
    ----------
    reader : function
        A zero-argument reader for a word-similarity dataset. Calling
        it just has to return an iterable of tuples
        (word1, word2, score).

    df : pd.DataFrame
        The VSM being evaluated. Rows are looked up by word via
        `df.loc`.

    distfunc : function mapping vector pairs to floats.
        The measure of distance between vectors. Can also be
        `vsm.euclidean`, `vsm.matching`, `vsm.jaccard`, as well as
        any other float-valued function on pairs of vectors.

    Raises
    ------
    ValueError
        If a word in the dataset is missing from `df.index`, or if
        `reader` yields no examples at all.

    Returns
    -------
    float, data
        `float` is the Spearman rank correlation coefficient between
        the dataset scores and the similarity values obtained from
        `df` using  `distfunc`. This evaluation is sensitive only to
        rankings, not to absolute values.  `data` is a `pd.DataFrame`
        with columns['word1', 'word2', 'score', 'distance'].

    """
    rows = []
    for w1, w2, score in reader():
        # Fail fast with an explicit message rather than a KeyError from
        # `df.loc` when the vocabulary does not cover the dataset.
        for w in (w1, w2):
            if w not in df.index:
                raise ValueError(
                    "Word '{}' is in the similarity dataset {} but not in the "
                    "DataFrame, making this evaluation ill-defined. Please "
                    "switch to a DataFrame with an appropriate vocabulary.".
                    format(w, get_reader_name(reader)))
        rows.append({
            'word1': w1,
            'word2': w2,
            'score': score,
            'distance': distfunc(df.loc[w1], df.loc[w2])})
    if not rows:
        # Without this guard an empty dataset surfaces as an opaque
        # `KeyError: 'score'` from the column access below.
        raise ValueError(
            "The similarity dataset {} yielded no examples, so there is "
            "nothing to evaluate.".format(get_reader_name(reader)))
    data = pd.DataFrame(
        rows, columns=['word1', 'word2', 'score', 'distance'])
    # Only the correlation is reported; the p-value is discarded.
    rho, _ = spearmanr(data['score'].values, data['distance'].values)
    return rho, data

In [7]:
def full_word_similarity_evaluation(df, readers=READERS, distfunc=vsm.cosine):
    """
    Run `word_similarity_evaluation` for each reader in `readers` and
    collect the correlations.

    Parameters
    ----------
    df : pd.DataFrame
        The VSM to evaluate; passed unchanged to
        `word_similarity_evaluation`.

    readers : tuple
        The similarity dataset readers on which to evaluate.

    distfunc : function mapping vector pairs to floats.
        The measure of distance between vectors. Can also be
        `vsm.euclidean`, `vsm.matching`, `vsm.jaccard`, as well as
        any other float-valued function on pairs of vectors.

    Returns
    -------
    pd.Series
        Maps dataset names to Spearman r values, plus a final
        'Macro-average' entry holding the `mean()` of the series.
        A dataset whose evaluation raises has its error printed and
        is recorded as NaN. Note that pandas' `Series.mean` skips NaN
        by default, so such datasets do not enter the average.

    """
    rho_by_dataset = {}
    for reader in readers:
        try:
            rho, _ = word_similarity_evaluation(reader, df, distfunc=distfunc)
            rho_by_dataset[get_reader_name(reader)] = rho
        except Exception as err:
            # Best effort: report the problem and keep evaluating the
            # remaining datasets.
            print(err)
            rho_by_dataset[get_reader_name(reader)] = np.nan
    result = pd.Series(rho_by_dataset, name='Spearman r')
    macro_average = result.mean()
    result['Macro-average'] = macro_average
    return result

In [8]:
DATA_HOME = os.path.join('data')

# Extra directory for NLTK to search when loading corpora.
# NOTE(review): this is an absolute, machine-specific path; replace it with
# a location valid in your environment before running.
NLTK_DATA_DIR = '/Users/antongochev/ai-program/cs224u/nltk_data'

# Guarded so that re-running this cell does not register the same
# directory more than once.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

In [9]:
# Display the directories NLTK searches, to confirm the extra data
# directory was registered.
nltk.data.path

['/Users/antongochev/nltk_data',
 '/Users/antongochev/opt/miniconda3/envs/nlu/nltk_data',
 '/Users/antongochev/opt/miniconda3/envs/nlu/share/nltk_data',
 '/Users/antongochev/opt/miniconda3/envs/nlu/lib/nltk_data',
 '/usr/share/nltk_data',
 '/usr/local/share/nltk_data',
 '/usr/lib/nltk_data',
 '/usr/local/lib/nltk_data',
 '/Users/antongochev/ai-program/cs224u/nltk_data']

In [10]:
# Locations of the VSM matrices and of the word-similarity datasets,
# both relative to the working directory.
VSM_HOME, WORDSIM_HOME = (
    os.path.join('data', subdir) for subdir in ('vsmdata', 'wordsim'))

In [11]:
def get_wordnet_edges():
    """
    Build a lemma-level graph from all WordNet synsets.

    Returns
    -------
    defaultdict(set)
        Maps each lemma name to the set of lemma names that share at
        least one synset with it. Each lemma's set includes the lemma
        itself.

    """
    graph = defaultdict(set)
    for synset in wn.all_synsets():
        names = set(lemma.name() for lemma in synset.lemmas())
        # Every member of a synset is linked to every member of it.
        for name in names:
            graph[name].update(names)
    return graph

In [12]:
def convert_edges_to_indices(edges, Q):
    """
    Translate a word-keyed edge dictionary into one keyed by row
    positions in `Q`.

    Parameters
    ----------
    edges : dict
        Maps a start word to a collection of neighbor words.

    Q : pd.DataFrame
        Its `index` supplies the vocabulary; a word's integer id is
        its row position in `Q`.

    Returns
    -------
    defaultdict(set)
        Maps the row position of each start word found in `Q.index`
        to the set of row positions of its neighbors found in
        `Q.index`. Start words absent from `Q.index`, and start words
        left with no known neighbors, are omitted.

    """
    lookup = {word: i for i, word in enumerate(Q.index)}
    index_edges = defaultdict(set)
    for start, finish_nodes in edges.items():
        s = lookup.get(start)
        # Compare against None explicitly: row position 0 is a valid id
        # but is falsy, so a plain truthiness test would silently drop
        # the first vocabulary item.
        if s is None:
            continue
        f = {lookup[n] for n in finish_nodes if n in lookup}
        if f:
            index_edges[s] = f
    return index_edges

In [13]:
def ttest(df):
    """
    Reweight a matrix with the t-test association measure.

    Each cell becomes (P(i, j) - P(i) * P(j)) / sqrt(P(i) * P(j)),
    where P(i, j) is the cell divided by the grand total, and P(i) and
    P(j) are the row and column sums divided by the grand total.

    Parameters
    ----------
    df : pd.DataFrame
        The matrix to reweight.

    Returns
    -------
    pd.DataFrame
        Same shape and labels as `df`.

    """
    total = df.sum().sum()
    row_probs = df.sum(axis=1) / total
    col_probs = df.sum(axis=0) / total
    # Probability expected for each cell if rows and columns were
    # independent.
    expected = np.outer(row_probs, col_probs)
    observed = df / total
    return (observed - expected) / np.sqrt(expected)

In [14]:
# Build the WordNet lemma graph once; it is reused for every
# `convert_edges_to_indices` call in the cells that follow.
wn_edges = get_wordnet_edges()

In [15]:
# Load the IMDB matrix ('imdb_window5-scaled.csv.gz') from VSM_HOME,
# using the first CSV column as the row labels.
imdb5 = pd.read_csv(
    os.path.join(VSM_HOME, 'imdb_window5-scaled.csv.gz'), index_col=0)

In [40]:
# Reweight the IMDB matrix with `vsm.pmi`, then reduce the result to
# k=150 dimensions with `vsm.lsa`.
imdb5_ppmi = vsm.pmi(imdb5)
imdb5_ppmi_lsa = vsm.lsa(imdb5_ppmi, k=150)

In [24]:
# Preview the first rows of the reweighted IMDB matrix.
# NOTE(review): the saved output for this cell contains negative values,
# which a positive-PMI matrix should not have; presumably it is stale
# output from a run where `imdb5_ppmi` held a different reweighting
# (e.g. `ttest`). Re-run on a fresh kernel to confirm.
imdb5_ppmi.head()

Unnamed: 0,!,);,.,..,...,:),?,abandoned,abdomen,abduct,....1,zebra,zero,zinc,zombie,zombies,zone,zoo,,,´
!,0.551536,0.000413,-0.025943,0.000255,0.000153,0.003777,0.006555,-0.000612,-6.9e-05,-0.000109,...,-3.4e-05,-0.000133,-2.4e-05,-0.000639,-0.000166,-0.000414,-0.000136,-0.001352,-0.00065,-5.9e-05
);,0.000413,0.503995,-0.002881,-0.000194,-0.00039,-2.2e-05,0.000186,-3e-05,-9e-06,1.8e-05,...,5e-05,-4e-05,-4e-06,-8.2e-05,-1.9e-05,-1.8e-05,8e-06,3e-06,-5e-05,-8.5e-05
.,-0.025943,-0.002881,0.375808,-0.00723,-0.017306,-0.000921,-0.019409,-0.001347,-6.7e-05,-0.000292,...,-0.000274,-0.000796,-9e-05,-0.002336,-0.001317,-0.000906,-0.000353,-0.006129,-0.002785,-0.001723
..,0.000255,-0.000194,-0.00723,0.470064,0.001017,0.000936,0.000238,-0.000137,-2.5e-05,-1.8e-05,...,-2.8e-05,-6.8e-05,-1e-05,-0.000119,-6.4e-05,-0.000128,-8e-06,-0.000505,-0.000245,-5.6e-05
...,0.000153,-0.00039,-0.017306,0.001017,0.472124,0.001072,0.000504,-0.000331,1.3e-05,-6.9e-05,...,-4.8e-05,-0.00016,5e-05,-0.00039,-2.7e-05,-0.000206,-6.3e-05,-0.001386,-0.000632,-1e-05


In [26]:
# Preview the first rows of the reweighted IMDB matrix.
# NOTE(review): this repeats the preceding preview cell; keep only one.
imdb5_ppmi.head()

Unnamed: 0,!,);,.,..,...,:),?,abandoned,abdomen,abduct,....1,zebra,zero,zinc,zombie,zombies,zone,zoo,,,´
!,4.138301,0.376882,0.0,0.099717,0.024329,1.657124,0.655127,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
);,0.376882,8.617159,0.0,0.0,0.0,0.0,0.230616,0.0,0.0,0.873651,...,1.429102,0.0,0.0,0.0,0.0,0.0,0.223181,0.016123,0.0,0.0
.,0.0,0.0,1.520218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
..,0.099717,0.0,0.0,6.567876,0.472386,1.582273,0.116486,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,0.024329,0.0,0.0,0.472386,4.710048,1.007327,0.097791,0.0,0.185895,0.0,...,0.0,0.0,1.105405,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Load the Gigaword matrix ('giga_window5-scaled.csv.gz') from VSM_HOME,
# using the first CSV column as the row labels.
giga5 = pd.read_csv(
    os.path.join(VSM_HOME, 'giga_window5-scaled.csv.gz'), index_col=0)

In [62]:
# Reweight the Gigaword matrix before LSA. `giga5_ppmi` must be assigned
# here: with both candidate assignments commented out, this cell only
# worked off leftover kernel state and raised NameError on a fresh run.
# `vsm.pmi` matches the variable name and the IMDB pipeline;
# `ttest(giga5)` is a drop-in alternative reweighting.
# NOTE(review): confirm which reweighting produced the reported scores.
giga5_ppmi = vsm.pmi(giga5)
giga5_ppmi_lsa = vsm.lsa(giga5_ppmi, k=300)

In [35]:
# Index the WordNet graph against the IMDB LSA vocabulary.
# NOTE(review): `wn_index_edges` is reassigned by the next cell (against
# `giga5_ppmi_lsa`) before anything reads it, so this cell looks like a
# leftover; confirm and remove.
wn_index_edges = convert_edges_to_indices(wn_edges, imdb5_ppmi_lsa)

In [58]:
# Re-express the WordNet graph as row positions in the Gigaword LSA
# matrix; this is the edge structure passed to the retrofitter below.
wn_index_edges = convert_edges_to_indices(wn_edges, giga5_ppmi_lsa)

In [59]:
# Retrofitting model for the Gigaword run; with verbose=True the saved
# output of the following `fit` shows a convergence message.
wn_retro = Retrofitter(verbose=True)

In [60]:
# Retrofit the Gigaword LSA vectors using the WordNet edges.
# NOTE(review): `X_retro` is rebound by the IMDB/PCA run later in the
# notebook, so the name always refers to the most recent fit.
X_retro = wn_retro.fit(giga5_ppmi_lsa, wn_index_edges)

Converged at iteration 4; change was 0.0063 

In [61]:
# Score the retrofitted Gigaword vectors on every dataset in READERS
# (Spearman correlation per dataset plus the macro-average).
full_word_similarity_evaluation(X_retro)

wordsim353         0.509454
mturk771           0.532307
simverb3500dev     0.338965
simverb3500test    0.325530
men                0.721617
Macro-average      0.485575
Name: Spearman r, dtype: float64

In [132]:
# Preview the first rows of the 150-dimensional IMDB LSA matrix.
imdb5_ppmi_lsa.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
!,-0.953478,0.336821,-0.158547,0.495488,-0.255167,-0.058961,0.144886,-0.136274,0.108274,-0.864362,...,-0.056899,0.000684,0.177645,-0.066055,-0.326783,-0.049701,0.116179,0.296209,-0.124177,-0.280193
);,-10.518781,1.27999,6.060256,4.770679,2.220207,-1.150071,-0.468161,-1.268059,-0.706758,0.676599,...,-0.27587,-1.178435,-0.960152,-0.803735,0.182258,0.370425,-0.47284,0.330447,0.826931,0.791772
.,-0.128465,-0.034167,0.228751,0.172249,0.054493,-0.004645,0.012694,-0.098767,-0.10724,-0.016764,...,-0.085989,0.001533,-0.11994,-0.099726,0.087249,0.081652,-0.071868,0.199602,0.056104,0.183868
..,-1.463028,0.502411,-0.40526,0.692142,-0.281529,-0.060485,0.141905,0.187798,0.306594,-0.716057,...,-0.251533,0.234417,0.020146,-0.206666,-0.062545,0.08564,0.226323,-0.061522,0.22617,0.007233
...,-0.731753,0.265058,-0.251195,0.413633,-0.268264,-0.134888,0.12174,-0.238754,0.184138,-0.700074,...,-0.130304,0.198935,0.010841,-0.151216,-0.190882,0.16608,0.012572,0.186777,-0.115343,0.075697


In [248]:
# A fractional `n_components` (here 0.99) asks scikit-learn's PCA to keep
# as many components as are needed to explain that share of the variance.
pca = PCA(.99)
pca.fit(imdb5_ppmi)

PCA(n_components=0.99)

In [249]:
# Project the reweighted IMDB matrix onto the fitted principal components.
imdb5_ppmi_pca = pca.transform(imdb5_ppmi)

In [250]:
# Wrap the PCA output in a DataFrame that carries the original row
# labels, which the `.index`-based lookups in the evaluation and
# edge-conversion helpers rely on. This rebinds `imdb5_ppmi_pca`, which
# previously held the raw `transform` output.
imdb5_ppmi_pca = pd.DataFrame(imdb5_ppmi_pca, index=imdb5_ppmi.index)

In [251]:
# Check the reduced matrix size: (rows, retained components).
imdb5_ppmi_pca.shape

(5000, 3440)

In [252]:
# Re-index the WordNet graph against the IMDB PCA matrix; this replaces
# the Gigaword-based `wn_index_edges` from earlier in the notebook.
wn_index_edges = convert_edges_to_indices(wn_edges, imdb5_ppmi_pca)

In [253]:
# New retrofitting model for the IMDB/PCA run (rebinds `wn_retro`).
wn_retro = Retrofitter(verbose=True)

In [254]:
# Retrofit the IMDB PCA vectors using the WordNet edges; this overwrites
# the `X_retro` produced by the earlier Gigaword run.
X_retro = wn_retro.fit(imdb5_ppmi_pca, wn_index_edges)

Converged at iteration 9; change was 0.0058 

In [255]:
# Score the retrofitted IMDB/PCA vectors on every dataset in READERS
# (Spearman correlation per dataset plus the macro-average).
full_word_similarity_evaluation(X_retro)

wordsim353         0.645669
mturk771           0.602424
simverb3500dev     0.407016
simverb3500test    0.396003
men                0.722915
Macro-average      0.554805
Name: Spearman r, dtype: float64