In [19]:
import json
import multiprocessing
import os
import re
from typing import Any, Dict, Generator, List, Optional, Union

import numpy as np
import spacy
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from google.cloud import bigquery
from scipy import spatial
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

# Shared spaCy pipeline (`en_core_web_lg`) used by the annotation code in this notebook.
nlp = spacy.load("en_core_web_lg")

# Point the Google client libraries at a service-account key file.
# setdefault keeps any value already exported in the environment, so the
# notebook is not tied to this developer-specific absolute path.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/adenletchworth/Downloads/studious-sign-417501-b1d1c2fe9312.json",
)

## BigQuery Queries

In [None]:
def query_bigquery_batched(
    query: str,
    parameters: Optional[List[bigquery.ScalarQueryParameter]] = None,
    batch_size: int = 1000,
) -> Generator[List[Union[tuple, Dict[str, Any]]], None, None]:
    """
    Streams results from a BigQuery SQL query in batches.

    Every result page is converted to a list of ``(title, body)`` tuples, so
    the query must select columns named ``title`` and ``body``.

    :param query: SQL query string to execute.
    :param parameters: List of bigquery.ScalarQueryParameter objects for query
        parameterization; ignored when None or empty.
    :param batch_size: Page size requested from BigQuery; one batch is yielded
        per result page.
    :yield: One list of ``(row.title, row.body)`` tuples per result page. If
        the query or the paging fails, the error is printed and one empty
        list is yielded before the generator finishes (best-effort behavior).
    """
    client = bigquery.Client()

    # Only set query_parameters in job_config if parameters were supplied
    job_config = bigquery.QueryJobConfig()
    if parameters:
        job_config.query_parameters = parameters

    try:
        # Execute the query
        query_job = client.query(query, job_config=job_config)

        # Iterate over pages of the query results
        for page in query_job.result(page_size=batch_size).pages:
            yield [(row.title, row.body) for row in page]
    except Exception as e:
        # Deliberately non-fatal: report the problem and hand consumers an
        # empty batch so downstream loops terminate cleanly.
        print(f"An error occurred during query execution: {e}")
        yield []

## Annotated Data for Training spaCy

In [None]:
def normalize_and_annotate_with_phrase_matcher_batched(batches: Generator[List[Union[tuple, Dict[str, Any]]], None, None]):
    """
    Annotate batches of (title, body) rows with TECH_TERM entity spans.

    Each title is lower-cased and registered once as a phrase pattern matched
    on the LOWER token attribute. Each row's text is built as
    "<title>. <body>" (both lower-cased) and scanned with the patterns
    collected so far.

    Note: patterns accumulate batch by batch, so a row is matched against the
    titles of its own batch and of earlier batches, never of later ones.

    :param batches: Iterable of batches, each a list of (title, body) pairs.
    :yield: For every input batch, a list of
        (text, {"entities": [(start_char, end_char, "TECH_TERM"), ...]})
        tuples. Overlapping matches are reduced to the longest
        non-overlapping spans.
    """
    from spacy.util import filter_spans

    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    titles_seen = set()

    for batch in batches:
        # Register every title not seen before as a phrase pattern.
        for title, _body in batch:
            normalized_title = title.lower()
            if normalized_title not in titles_seen:
                matcher.add("TECH_TERM", [nlp.make_doc(normalized_title)])
                titles_seen.add(normalized_title)

        annotated_batch = []
        for title, body in batch:
            modified_body = title.lower() + ". " + body.lower()
            # The matcher only inspects token text (LOWER), so tokenizing is
            # enough; running the tagger/parser/NER components here would be
            # wasted work. Patterns above are built the same way.
            doc = nlp.make_doc(modified_body)

            spans = [doc[start:end] for _match_id, start, end in matcher(doc)]
            # A term nested inside a longer term yields overlapping hits,
            # which entity-offset training data must not contain; keep the
            # longest non-overlapping spans only.
            entities = [
                (span.start_char, span.end_char, "TECH_TERM")
                for span in filter_spans(spans)
            ]

            annotated_batch.append((modified_body, {"entities": entities}))

        yield annotated_batch
        
def save_annotated_data_as_json(annotated_data, file_path):
    """
    Serialize ``annotated_data`` to ``file_path`` as indented UTF-8 JSON.

    Missing parent directories of ``file_path`` are created first, so writing
    to e.g. "./NER/out.json" works on a fresh checkout.

    :param annotated_data: Any JSON-serializable object.
    :param file_path: Destination path; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(annotated_data, f, ensure_ascii=False, indent=4)

## Data for Word2Vec Training

In [None]:
def create_data_for_word2vec(batches):
    """
    Flatten an iterable of batches into one list.

    :param batches: Iterable whose items are themselves iterables of rows.
    :return: List holding every row of every batch, in iteration order.
    """
    return [row for batch in batches for row in batch]
    
        
def create_json_for_spacy(file_path="./NER/CS_ENTITIES.json", batch_size=10000):
    """
    Build the annotated entity data set from BigQuery and save it as JSON.

    Queries title/body pairs from `stack_overflow.posts_tag_wiki_excerpt`,
    annotates them with normalize_and_annotate_with_phrase_matcher_batched and
    writes the combined result with save_annotated_data_as_json.

    :param file_path: Destination JSON file. Defaults to the path this
        function previously hard-coded.
    :param batch_size: Page size passed to query_bigquery_batched. Defaults to
        the value this function previously hard-coded.
    """
    query = """
    SELECT title, body FROM `stack_overflow.posts_tag_wiki_excerpt`
    WHERE title IS NOT NULL AND body IS NOT NULL
    """

    batches = query_bigquery_batched(query, batch_size=batch_size)

    # Collect every annotated batch; the whole data set is written in one go.
    all_annotated_data = []
    for annotated_batch in normalize_and_annotate_with_phrase_matcher_batched(batches):
        all_annotated_data.extend(annotated_batch)

    save_annotated_data_as_json(all_annotated_data, file_path)


def create_txt_for_word2vec(file_path):
    """
    Write the body of every queried tag-wiki excerpt to a UTF-8 text file.

    Rows are fetched through query_bigquery_batched, flattened with
    create_data_for_word2vec, and each body is written followed by a newline.
    The title column is selected but not written.

    :param file_path: Destination text file; an existing file is overwritten.
    """
    sql = """
    SELECT title,body FROM `stack_overflow.posts_tag_wiki_excerpt`
    WHERE body IS NOT NULL
    """

    rows = create_data_for_word2vec(query_bigquery_batched(sql, batch_size=10000))

    with open(file_path, 'w', encoding='utf-8') as out:
        out.writelines(body + '\n' for _title, body in rows)



In [None]:
def training(model_name):
    """
    Train a Word2Vec model on pre-tokenized sentences and save it under ./models.

    Reads './data/word2vec.json', which must be a JSON list of objects that
    each carry a 'tokenized' entry, and trains 300-dimensional vectors for
    30 epochs.

    Two artifacts are written, with the output directory created if missing:
    ``./models/<model_name>.model`` (full gensim model) and
    ``./models/<model_name>.txt`` (word2vec text format).

    :param model_name: Base file name, without extension, for both artifacts.
    """
    with open('./data/word2vec.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extracting just the tokenized sentences from the JSON data
    sentences = [item['tokenized'] for item in data]

    # Leave one core free, but never request fewer than one worker:
    # on a single-CPU host cpu_count() - 1 would be 0.
    worker_count = max(1, multiprocessing.cpu_count() - 1)

    # Initialize the Word2Vec model
    word2vec_model = Word2Vec(vector_size=300, window=5, min_count=1, workers=worker_count)

    # Building vocabulary from the sentences
    word2vec_model.build_vocab(sentences)

    # Training the Word2Vec model
    word2vec_model.train(sentences, total_examples=word2vec_model.corpus_count, epochs=30)

    # Saving the trained model (create the target directory on first use)
    model_path = f"./models/{model_name}.model"
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    word2vec_model.save(model_path)
    word2vec_model.wv.save_word2vec_format(f'./models/{model_name}.txt')


# Train the embeddings; writes ./models/word2vec_model.model and ./models/word2vec_model.txt.
training("word2vec_model")

In [None]:
def get_similarity(word):
    """
    Query the saved word2vec vectors for the entries most similar to ``word``.

    The vectors are read from './models/word2vec_model.txt' on every call.

    :param word: Query token.
    :return: The result of ``KeyedVectors.most_similar`` for that token.
    """
    vectors = KeyedVectors.load_word2vec_format('./models/word2vec_model.txt')
    neighbours = vectors.most_similar(positive=[word])
    return neighbours


In [None]:
# Sanity check: show what the trained vectors consider closest to "pytorch".
print(get_similarity("pytorch"))

[('tensorflow', 0.7090263962745667), ('chainer', 0.6475980877876282), ('keras', 0.646369218826294), ('mxnet', 0.6445105671882629), ('tensor', 0.5889434814453125), ('xgboost', 0.563758909702301), ('tf', 0.5615367293357849), ('numpy', 0.5572689175605774), ('caffe', 0.5471710562705994), ('torch', 0.5371536612510681)]


In [18]:
# Load the text-format vectors that training() exported to ./models/word2vec_model.txt.
word2vec_model = KeyedVectors.load_word2vec_format('./models/word2vec_model.txt', binary=False)  

# Copy every word2vec vector into the spaCy vocab, one key at a time.
# NOTE(review): assumes these vectors have the same width as the vector table
# already present in `nlp` — TODO confirm.
# NOTE(review): only the words listed here receive a new vector; any other
# entries of the table keep whatever vector they had, so the table may mix two
# embedding spaces — confirm this is intended.
for word in word2vec_model.key_to_index.keys():
    vector = word2vec_model.get_vector(word)
    nlp.vocab.set_vector(word, vector)


# Persist the pipeline, including the vectors set above, to disk.
nlp.to_disk('./models/spacy_word2vec_model')

In [20]:
# Load the updated spaCy model from ./models/spacy_word2vec_model; this rebinds
# the notebook-wide `nlp` name.
nlp = spacy.load('./models/spacy_word2vec_model')

def most_similar(word, topn=10):
    """
    Return the ``topn`` entries of ``nlp.vocab.vectors`` most similar to ``word``.

    Similarity is ``1 - scipy.spatial.distance.cosine`` between the queried
    lexeme's vector and each stored vector. All-zero vectors are skipped
    because cosine similarity is undefined for them and would otherwise put
    NaN values into the ranking.

    :param word: Query string, looked up in ``nlp.vocab``.
    :param topn: Maximum number of neighbours to return.
    :return: List of (text, similarity) tuples sorted by descending
        similarity; empty when the word has no vector or its vector is all
        zeros (a message is printed in both cases).
    """
    queried_token = nlp.vocab[word]

    # Ensure the word exists in the vocabulary
    if not queried_token.has_vector:
        print(f"The word '{word}' does not exist in the model's vocabulary.")
        return []

    # Fetch the query vector once instead of on every loop iteration.
    query_vector = queried_token.vector
    if not np.any(query_vector):
        print(f"The word '{word}' has an all-zero vector; similarity is undefined.")
        return []

    # Calculate cosine similarity between the queried word's vector and all other vectors
    similarities = []
    for key, vector in nlp.vocab.vectors.items():
        candidate = nlp.vocab.strings[key]
        if candidate == word:  # exclude the queried word itself
            continue
        if not np.any(vector):  # zero vector: cosine would be NaN
            continue
        similarity = 1 - spatial.distance.cosine(query_vector, vector)
        similarities.append((candidate, similarity))

    # Sort by similarity, best first (local name kept distinct from the function's)
    ranked = sorted(similarities, key=lambda item: item[1], reverse=True)

    return ranked[:topn]

# Demo: print the five nearest neighbours of 'pytorch' according to the spaCy vector table.
word = 'pytorch'  
similar_words = most_similar(word, topn=5)
for similar_word, similarity in similar_words:
    print(f"{similar_word}: {similarity}")


tensorflow: 0.7090263712129975
chainer: 0.6475980879456193
keras: 0.6463692944339569
mxnet: 0.644510513292877
tensor: 0.5889435285062421
