# Project Code for ST446 Final Project
***By Aleksander Brynjulf Hubert***

***May 2020***


This code implements my experiment comparing dimensionality reduction of word vectors. It was run on a GCP machine to produce all of the word vectors that were then evaluated with QVEC. The file contains the code that cleans and tokenizes the text and builds the word vectors using the Spark implementation of word2vec. The final chunks take the word vectors and use a Singular Value Decomposition to reduce their dimensionality from 150 to 75, 50, 25, and 10 dimensions. Note: wherever '{BUCKET_NAME}' appears, replace it with the name of your bucket. For more information, see the writeup.

In [None]:
import numpy as np
import re
import pyspark
from pyspark import SparkContext
from pyspark import SparkConf
import itertools
import collections
import nltk

from pyspark.mllib.feature import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

from pyspark.mllib.linalg import DenseMatrix
from pyspark.mllib.linalg.distributed import RowMatrix

## Data Cleaning Functions
These functions clean and tokenize the data using NLTK and remove English stopwords.

In [None]:
# English stopword lookup set, plus a translation table that maps every
# ASCII punctuation character to None so str.translate deletes it.
stop_words = {*stopwords.words('english')}
table = str.maketrans(dict.fromkeys(string.punctuation))

def get_tokens(line):
    """Tokenize a line of text into lowercase alphabetic, non-stopword tokens.

    The line is split with NLTK's ``word_tokenize``; every token is
    lowercased and has its punctuation deleted through the module-level
    ``table``. Tokens that are not purely alphabetic afterwards (numbers,
    empty strings, mixed tokens) are dropped, as are English stopwords found
    in the module-level ``stop_words`` set.

    Args:
        line: Text to tokenize.

    Returns:
        List of cleaned tokens in their original order.
    """
    cleaned = (tok.lower().translate(table) for tok in word_tokenize(line))
    # Keep alphabetic tokens only and drop English stopwords.
    return [w for w in cleaned if w.isalpha() and w not in stop_words]

def get_page(content):
    """Split raw page content into three parts and tokenize the third.

    The content is split on its first two newlines; the third piece
    (index 2) is replaced by the token list produced by ``get_tokens``.

    Args:
        content: Raw page text, or ``None`` / ``[]`` for a missing page.

    Returns:
        ``[first_part, second_part, tokens]`` on success, ``[]`` when
        ``content`` is ``None`` or ``[]``, and the placeholder ``[1, 2, 3]``
        when the content cannot be processed (for example when it has fewer
        than three newline-separated parts).
    """
    page = []
    try:
        if content is not None and content != []:
            # At most two splits: everything after the second newline stays
            # together as the body.
            parts = content.split("\n", 2)
            body = parts[2]
            parts[2] = get_tokens(body)
            page = parts
    except Exception:
        # Best-effort parsing: malformed pages yield the placeholder rather
        # than failing the job. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        page = [1, 2, 3]
    return page


## Functions for Handling Word Vectors
This chunk sets up the helper functions for running the project: a utility function that converts a list to a whitespace-separated string, a function that creates an RDD to hold the word vectors, a function that reduces their dimensionality, and finally a function that writes the word vectors to a txt file in the correct format.

In [None]:
def list_to_srting(lst):
    """Return the items of ``lst`` converted to ``str`` and joined by single spaces."""
    return ' '.join(str(item) for item in lst)

## Splits the mapping from word2vec model.getVectors() into an RDD of vectors
## and a parallel list of words.
def format_wordvectors(vectors):
    """Separate a word -> vector mapping into an RDD of vectors and a word list.

    Args:
        vectors: Mapping from word to its vector (iterated by key and
            indexed by key).

    Returns:
        Tuple ``(vec_rdd, words)``: ``vec_rdd`` is built with the global
        ``sc.parallelize`` from one list per vector, and ``words`` holds the
        keys in the same order the vector lists were built.
    """
    # Fix the key order once so that vec_nums[i] is the vector of words[i].
    words = list(vectors)
    vec_nums = [list(vectors[w]) for w in words]

    # Creates an RDD of the vectors where each row represents one vector.
    vec_rdd = sc.parallelize(vec_nums)
    return vec_rdd, words

## Reduces the dimension of the vector RDD using the SVD from pyspark.mllib
def dim_reduction(vec_rdd, words, k):
    """Project the word vectors onto their ``k`` leading singular directions.

    Args:
        vec_rdd: RDD of vectors, one row per word.
        words: Words in the same order as the rows of ``vec_rdd``.
        k: Number of leading singular values requested from the SVD.

    Returns:
        Dictionary mapping each word to its row of ``U * S``.
    """
    # SVD of the row matrix; U is requested because it is multiplied below.
    svd = RowMatrix(vec_rdd).computeSVD(k, computeU=True)

    # Square diagonal matrix of the singular values, flattened with
    # ravel("F") for the DenseMatrix constructor.
    n_singular = len(svd.s)
    sigma = DenseMatrix(n_singular, n_singular, np.diag(svd.s).ravel("F"))

    # Multiplies U * S and brings the resulting rows back to the driver.
    reduced_rows = svd.U.multiply(sigma).rows.collect()

    # Pair each word with the row at the same position.
    return {term: reduced_rows[idx] for idx, term in enumerate(words)}

## Writes word vectors as text, one "word v1 v2 ... vn" line per word,
## and saves them as a .txt in the bucket
import datalab.storage as gcs
def doc_output(vec_dict, bucket, filename, output_folder):
    """Serialize a word -> vector mapping and write it to a bucket item.

    Each entry becomes one line: the word, a space, then the vector
    components separated by spaces (via ``list_to_srting``), ending with a
    newline.

    Args:
        vec_dict: Mapping from word to its vector.
        bucket: Name of the bucket to write to.
        filename: Item key (path) inside the bucket.
        output_folder: Forwarded unchanged as the second argument of
            ``write_to``.
    """
    # Iterate the mapping that was passed in, not the notebook-global
    # `vectors`, so the function also works for dictionaries with other keys.
    lines = [
        k + ' ' + list_to_srting(list(vec_dict[k])) + '\n'
        for k in vec_dict
    ]
    # NOTE(review): in datalab.storage the second argument of write_to
    # appears to be a content type rather than a folder — confirm what
    # should be passed here.
    gcs.Bucket(bucket).item(filename).write_to(''.join(lines), output_folder)

## Experiment
This next section uses the DBLP bibliography to test the code. Due to the limitations of compute credits I chose to use the DBLP data since it was smaller and would not use up all the credits during testing. The instructions to set up the environment are in the appendix of the writeup.


### File Reading
This next chunk reads the files in from the data folder in the bucket. Then it extracts and tokenizes the title of each paper from the DBLP database and stores them in an RDD.

In [None]:
## Read in the author data as an RDD of text lines, requesting a minimum of 4 partitions
author_file = sc.textFile("gs://{BUCKET_NAME}/data/author.txt", minPartitions=4)

# Extracts the tokenized title from one tab-separated row
def extract_titles(row):
    """Return the tokens of the third tab-separated field of ``row``.

    Args:
        row: One line of the input file, with fields separated by tabs.

    Returns:
        Token list produced by ``get_tokens`` for the field at index 2, or
        an empty list when the row has fewer than three fields.
    """
    fields = row.strip().split("\t")
    if len(fields) < 3:
        # Malformed or blank row: contribute no tokens instead of raising,
        # which would fail the whole Spark job.
        return []
    return get_tokens(fields[2])

# Apply extract_titles to every row of the input file.
author_rdd = author_file.map(extract_titles)
# Preview the first five results (shown as the cell output).
author_rdd.take(5)

### Word2Vec Vectors
This chunk trains the word2vec model using pyspark.

In [None]:
## Trains the word2vec model with a fixed seed (42) on the tokenized titles.
## The vector size is not set here, so the library default applies
## (100 for pyspark.mllib Word2Vec — TODO confirm against the installed version).
## NOTE(review): the notebook introduction speaks of 150-dimensional vectors,
## which this cell does not configure — confirm which is intended.
word2vec = Word2Vec().setSeed(42)
model = word2vec.fit(author_rdd)
# Word -> vector mapping used by the output and reduction cells below.
vectors = model.getVectors()

#### File Output For Baseline Word Vectors
This chunk writes the baseline (unreduced) word vectors to 'vectors/baseline.txt' in the 'vectors/' folder within the assigned bucket.

In [None]:
# Write the word2vec vectors, before any reduction, as the baseline file.
doc_output(vectors, '{BUCKET_NAME}', 'vectors/baseline.txt', 'vectors/')

### Dimensionality Reduction

In [None]:
# Split the trained vectors into an RDD of vectors and a parallel word list
formatted = format_wordvectors(vectors)
vec_rdd, word = formatted
# Preview the first 20 vectors
vec_rdd.take(20)

In [None]:
# Target dimensionalities to test
dimensions = [75,50,25,10]
# One reduced word -> vector dictionary per entry of `dimensions`, in the same order
dim_models = [dim_reduction(vec_rdd, word, k) for k in dimensions]
    

#### File Output For Reduced Word Vectors
Finally, this chunk outputs all the word vectors into 'txt' files that are formatted for use with the QVEC software.

In [None]:
# Destination file for each reduced model, in the same order as dim_models
output_filenames = ['vectors/k75.txt','vectors/k50.txt','vectors/k25.txt','vectors/k10.txt']

# Write every reduced vector set for testing with QVEC
for idx, target in enumerate(output_filenames):
    doc_output(dim_models[idx], '{BUCKET_NAME}', target, 'vectors/')
