### Constraint 1

#### Library

In [None]:
pip install wikipedia-api nltk

Collecting wikipedia-api
  Downloading Wikipedia_API-0.6.0-py3-none-any.whl (14 kB)
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.6.0


In [None]:
import wikipediaapi
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, spearmanr
import pandas as pd

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#### Scrape Data

In [None]:
user_agent = "MyWikipediaScraper/1.0 (https://mywebsite.com)"
wiki_wiki = wikipediaapi.Wikipedia('en', headers={'User-Agent': user_agent})

def get_wikipedia_article(article_name):
    page = wiki_wiki.page(article_name)
    if page.exists():
        return page.text
    return ""

articles = ["Natural language processing", "Machine learning", "Artificial intelligence",
            "Data science", "Computer science", "Mathematics", "Physics", "Chemistry",
            "Biology", "Economics"]

#### Tokenisation

In [None]:
corpus = ""
for article in articles:
    corpus += get_wikipedia_article(article)

tokens = word_tokenize(corpus.lower())
tokens = tokens[:1000000]  # Limit to the first 1,000,000 tokens
print(f"Total tokens collected: {len(tokens)}")

unique_tokens = set(tokens)
print(f"Total unique tokens: {len(unique_tokens)}")
print(f"Unique tokens: {unique_tokens}")

Total tokens collected: 80365
Total unique tokens: 9024


#### Functions

In [None]:
def build_co_occurrence_matrix(corpus, vocab_size, window_size):
    co_occurrence_matrix = np.zeros((vocab_size, vocab_size), dtype=np.float32)
    word_to_index = {word: idx for idx, word in enumerate(set(corpus))}
    for i, word in enumerate(corpus):
        word_idx = word_to_index[word]
        start = max(0, i - window_size)
        end = min(len(corpus), i + window_size + 1)
        context = [corpus[j] for j in range(start, end) if j != i]
        for context_word in context:
            context_word_idx = word_to_index[context_word]
            co_occurrence_matrix[word_idx, context_word_idx] += 1
    return co_occurrence_matrix, word_to_index

def calculate_similarity(word1, word2, word_embeddings, word_to_index):
    if word1 in word_to_index and word2 in word_to_index:
        vec1 = word_embeddings[word_to_index[word1]].reshape(1, -1)
        vec2 = word_embeddings[word_to_index[word2]].reshape(1, -1)
        return cosine_similarity(vec1, vec2)[0][0]
    else:
        return None

#### Result and Training

In [None]:
df = pd.read_csv('/content/SimLex-999.txt', delimiter='\t')

results = {}

window_sizes = [2, 5, 10]

for window_size in window_sizes:
    co_occurrence_matrix, word_to_index = build_co_occurrence_matrix(tokens, len(set(tokens)), window_size)

    total_count = np.sum(co_occurrence_matrix)
    sum_over_words = np.sum(co_occurrence_matrix, axis=0)
    expected_counts = np.outer(sum_over_words, sum_over_words) / total_count
    ppmi_matrix = np.maximum(np.log(co_occurrence_matrix / expected_counts), 0)

    svd = TruncatedSVD(n_components=100)
    word_embeddings = svd.fit_transform(ppmi_matrix)

    df['predicted_similarity'] = df.apply(lambda row: calculate_similarity(row['word1'], row['word2'], word_embeddings, word_to_index), axis=1)
    df_filtered = df.dropna(subset=['predicted_similarity'])
    pearson_corr = pearsonr(df_filtered['SimLex999'], df_filtered['predicted_similarity'])[0]
    spearman_corr = spearmanr(df_filtered['SimLex999'], df_filtered['predicted_similarity'])[0]
    results[window_size] = {
        'pearson': pearson_corr,
        'spearman': spearman_corr
    }

for window_size, result in results.items():
    print(f'Window Size: {window_size}')
    print(f'Pearson Correlation: {result["pearson"]}')
    print(f'Spearman Correlation: {result["spearman"]}')
    print('---')


  ppmi_matrix = np.maximum(np.log(co_occurrence_matrix / expected_counts), 0)
  ppmi_matrix = np.maximum(np.log(co_occurrence_matrix / expected_counts), 0)
  ppmi_matrix = np.maximum(np.log(co_occurrence_matrix / expected_counts), 0)


Window Size: 2
Pearson Correlation: 0.06962361055102537
Spearman Correlation: 0.09138243472587387
---
Window Size: 5
Pearson Correlation: 0.08966630813361028
Spearman Correlation: 0.09368264218944367
---
Window Size: 10
Pearson Correlation: 0.05128387522219853
Spearman Correlation: 0.031106934819627688
---


### Constraint 2

In [1]:
pip install transformers datasets torch scipy pandas

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 

In [2]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments
import torch
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr

#### Load Dataset

In [None]:
dataset = load_dataset('wikipedia', '20220301.en', split='train[:1%]')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4)
tokenized_datasets = tokenized_datasets.remove_columns(['text'])
tokenized_datasets.set_format('torch')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/36.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0/41 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/6458670 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/64587 [00:00<?, ? examples/s]

#### Model

In [None]:
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

trainer.train()

#### Word Embeddings

In [None]:
def get_finetuned_word_embedding(word):
    inputs = tokenizer(word, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings

#### Process the Test Data

In [None]:
df = pd.read_csv('SimLex-999.txt', delimiter='\t')
similarity_scores = []
for _, row in df.iterrows():
    word1, word2 = row['word1'], row['word2']
    try:
        vec1 = get_finetuned_word_embedding(word1)
        vec2 = get_finetuned_word_embedding(word2)
        similarity = cosine_similarity([vec1], [vec2])[0][0]
    except KeyError:
        similarity = None
    similarity_scores.append(similarity)

df['predicted_similarity'] = similarity_scores
df_filtered = df.dropna(subset=['predicted_similarity'])
spearman_corr = spearmanr(df_filtered['SimLex999'], df_filtered['predicted_similarity'])[0]
print(f'Spearman Rank Correlation: {spearman_corr:.4f}')