In [None]:
import spacy
import gensim
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import numpy as np

In [None]:
# Colab-only: mount Google Drive at /content/drive so the data files referenced
# by the paths below (under /content/drive/MyDrive/...) can be opened.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Step 1: Generate Count Vectors

In [None]:
# spaCy English pipeline shared by the helper functions below; in the cells
# visible here it is only used to split text into tokens (token.text / token.is_alpha).
nlp = spacy.load('en_core_web_sm')

In [None]:
# Load the vocabulary from vocab.txt
def load_vocab(vocab_file):
    """Read a ``word:index`` vocabulary file into a dict.

    Each non-blank line is expected to look like ``word:index``. The line is
    split on the LAST colon, so a word that itself contains ``:`` is kept
    intact, and blank lines (e.g. a trailing newline at end of file) are
    skipped instead of raising ``ValueError``.

    Args:
        vocab_file: Path to the vocabulary text file (read as UTF-8).

    Returns:
        dict mapping each word (str) to its integer index.

    Raises:
        ValueError: if a non-blank line has no colon or a non-integer index.
    """
    vocab = {}
    with open(vocab_file, 'r', encoding='utf-8') as file:
        for line in file:
            entry = line.strip()
            if not entry:
                # Tolerate empty lines rather than failing on tuple unpacking.
                continue
            # rsplit with maxsplit=1: only the final colon separates the index.
            word, index = entry.rsplit(':', 1)
            vocab[word] = int(index)
    return vocab

In [None]:
# Location of the vocabulary file on the mounted Drive; load_vocab parses each
# of its lines as "word:index".
vocab_file = '/content/drive/MyDrive/Natural Language Processing/Assignment5_Solution/vocab.txt'
# word -> integer index mapping, reused by the count-vector, TF-IDF and
# embedding steps below.
vocabulary = load_vocab(vocab_file)

In [None]:
def generate_count_vector(description, vocab):
    """Count how often each vocabulary word occurs in a description.

    The description is lower-cased and tokenised with the module-level ``nlp``
    pipeline; tokens whose text is not a key of ``vocab`` are ignored.

    Args:
        description: Text of one advert.
        vocab: Mapping from word to integer index.

    Returns:
        ``defaultdict(int)`` mapping vocabulary index -> occurrence count,
        containing only indices that occurred at least once.
    """
    counts = defaultdict(int)
    # Lazily pick out the token texts that are known vocabulary words.
    known_words = (tok.text for tok in nlp(description.lower()) if tok.text in vocab)
    for text in known_words:
        counts[vocab[text]] += 1
    return counts

In [None]:
# Load the advert table from JSON; the cells below read its 'Webindex' and
# 'Description' columns.
descriptions = pd.read_json('/content/drive/MyDrive/Natural Language Processing/Assignment5_Solution/preprocessed_ads.json')
# Show the first rows as a quick sanity check of the loaded columns.
descriptions.head()

Unnamed: 0,Category,Title,Webindex,Company,Description
0,Engineering,Site Maintenance Engineer (ElectroMechanical),72635560,Rise Technical Recruitment,site maintenance electromechanical birmingham ...
1,Engineering,Inspector Sheet Metal,69145960,,absolute acting behalf established contract sh...
2,Engineering,Graduate Recruitment Consultant,69267760,Akton Recruitment Ltd,graduate consultant location altrincham compet...
3,Engineering,Electronics Project Engineer,69265319,Progressive Recruitment,electronics project technology permanent oxfor...
4,Engineering,Signalling Design Engineer,69198249,Hays TCE Jobs,largest consultancies world signalling growing...


In [None]:
# Build one text line per advert: "#<webindex>, idx:freq, idx:freq, ..."
# with the vocabulary indices in ascending order.
count_vectors = []
for webindex, description in zip(descriptions['Webindex'], descriptions['Description']):
    word_freq = generate_count_vector(description, vocabulary)
    index_freq_pairs = [f"{idx}:{freq}" for idx, freq in sorted(word_freq.items())]
    count_vectors.append(f"#{webindex}, " + ", ".join(index_freq_pairs))
# Display the first line to eyeball the format.
count_vectors[0]

'#72635560, 12:2, 200:1, 260:1, 482:1, 504:2, 645:1, 681:1, 755:2, 1045:1, 1052:1, 1176:2, 1199:1, 1203:2, 1316:1, 1448:4, 1504:1, 1527:3, 1532:1, 1533:1, 1534:2, 1711:1, 1735:1, 1768:1, 1809:1, 1844:1, 1980:1, 2018:1, 2079:1, 2201:1, 2202:1, 2239:1, 2368:1, 2370:1, 2415:2, 2457:1, 2641:1, 2782:1, 2784:1, 2799:3, 2823:1, 2828:2, 2884:2, 2976:1, 3034:5, 3187:1, 3213:1, 3315:1, 3354:1, 3435:1, 3482:1, 3492:1, 3510:1, 3700:3, 3843:1, 3948:1, 3962:1, 3970:2, 3987:1, 4115:1, 4250:2, 4266:2, 4338:3, 4431:2, 4451:1, 4541:1, 4680:2, 4681:1, 4682:1, 4918:1, 4948:1, 5000:1, 5057:1'

In [None]:
# Persist the count vectors to Drive, one advert per line.
with open('/content/drive/MyDrive/Natural Language Processing/Assignment5_Solution/count_vectors.txt', 'w') as file:
    file.writelines(f"{vector_line}\n" for vector_line in count_vectors)

Step 2: Generate Weighted and Unweighted Word Embeddings using a Word2Vec model

In [None]:
import gensim.downloader as api

# The pretrained 'word2vec-google-news-300' vectors are deliberately NOT loaded
# into `word2vec_model` here: that name is assigned a Word2Vec model trained on
# the advert descriptions a few cells below, before anything reads it, so the
# large pretrained download would be discarded unused. The embedding helper
# below also reads `model.wv`, which only the trained model provides.
# To experiment with the pretrained vectors, load them under their own name,
# e.g. api.load('word2vec-google-news-300'), so they do not get overwritten.



In [None]:
def preprocess(text):
    """Tokenise text and keep only alphabetic tokens.

    The text is lower-cased, run through the module-level ``nlp`` pipeline,
    and every token whose ``is_alpha`` flag is false is dropped.

    Args:
        text: Input string.

    Returns:
        List of token strings, in document order.
    """
    alpha_tokens = []
    for tok in nlp(text.lower()):
        if tok.is_alpha:
            alpha_tokens.append(tok.text)
    return alpha_tokens

In [None]:
# Tokenise every description; each token list is one training "sentence".
sentences = list(map(preprocess, descriptions['Description']))

# Train Word2Vec on the adverts themselves: 100-dimensional vectors, context
# window of 5, and min_count=1 so that no word is dropped for being rare.
# NOTE(review): per the gensim docs, training with workers > 1 is not exactly
# reproducible between runs; confirm whether that matters for this assignment.
word2vec_model = gensim.models.Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Fit TF-IDF over the raw description strings, restricted to the fixed vocabulary.
tfidf = TfidfVectorizer(vocabulary=vocabulary)
tfidf.fit(descriptions['Description'])

In [None]:
def get_word2vec_vector(description, model, vocab, tfidf=None):
    """Embed a description as the mean of its in-vocabulary word vectors.

    The description is tokenised with ``preprocess``. A token contributes only
    if it is in ``vocab`` AND the model has a vector for it; tokens the model
    does not know are skipped instead of raising ``KeyError``.

    Args:
        description: Text of one advert.
        model: Either a trained ``gensim.models.Word2Vec`` (vectors are read
            from ``model.wv``) or a bare keyed-vectors object that supports
            ``word in kv`` and ``kv[word]``. Must expose ``vector_size``.
        vocab: Container of allowed words (only membership is tested).
        tfidf: Optional mapping word -> weight. When provided (even if empty),
            each word vector is multiplied by its weight before averaging;
            a word with no entry is given weight 0.0.

    Returns:
        1-D numpy array of length ``model.vector_size``: the mean of the
        (optionally weighted) word vectors, or all zeros if no token qualified.
    """
    # Accept both a full Word2Vec model and its KeyedVectors.
    keyed_vectors = getattr(model, 'wv', model)

    word_vecs = []
    for word in preprocess(description):
        if word not in vocab or word not in keyed_vectors:
            continue
        vec = keyed_vectors[word]
        # `is not None` so that an explicitly passed empty mapping still means
        # "weighted" (all weights zero) rather than silently "unweighted".
        if tfidf is not None:
            vec = vec * tfidf.get(word, 0.0)
        word_vecs.append(vec)

    if word_vecs:
        # NOTE(review): this divides by the number of words, not by the sum of
        # the weights, so the weighted result is a scaled mean rather than a
        # normalised weighted average. Kept as-is to preserve existing outputs.
        return np.mean(word_vecs, axis=0)
    return np.zeros(model.vector_size)

In [None]:
# Lists of (webindex, embedding) pairs, in DataFrame row order.
unweighted_vectors = []
weighted_vectors = []

# Loop-invariant work hoisted out of the per-row loop: the feature-name array
# is identical for every row, and all descriptions can be TF-IDF transformed
# in a single call (row i of the matrix corresponds to description i).
feature_names = tfidf.get_feature_names_out()
tfidf_matrix = tfidf.transform(descriptions['Description'])

for row_pos, (webindex, description) in enumerate(
    zip(descriptions['Webindex'], descriptions['Description'])
):
    # Dense word -> TF-IDF weight mapping for this description (zeros included,
    # so every vocabulary word has an entry).
    tfidf_weights = dict(zip(feature_names, tfidf_matrix[row_pos].toarray().ravel()))

    unweighted_vector = get_word2vec_vector(description, word2vec_model, vocabulary)
    weighted_vector = get_word2vec_vector(description, word2vec_model, vocabulary, tfidf_weights)

    unweighted_vectors.append((webindex, unweighted_vector))
    weighted_vectors.append((webindex, weighted_vector))

In [None]:
# Inspect the first (webindex, TF-IDF-weighted embedding) pair.
weighted_vectors[0]

(72635560,
 array([-0.01938217,  0.02888812,  0.0147196 ,  0.01519088,  0.02063204,
        -0.05110238,  0.00954569,  0.05143294, -0.01195538,  0.00425034,
        -0.02993809, -0.0383443 , -0.00118716,  0.02601862,  0.02108623,
        -0.01818527,  0.00172959, -0.05671785, -0.00125092, -0.05510456,
         0.01397065,  0.01433711,  0.01429488, -0.01812237, -0.0024076 ,
        -0.01230301, -0.02580633, -0.01791485, -0.038394  ,  0.00482546,
         0.05204249,  0.00064438, -0.0160452 , -0.01460527,  0.01343647,
         0.02544964,  0.00657236, -0.02436119, -0.00518897, -0.08859646,
        -0.02826651, -0.02573142, -0.00913762,  0.01628468,  0.01090103,
        -0.00531892, -0.02941119,  0.01011622, -0.00370406,  0.01583062,
         0.02570469, -0.01824846, -0.01634352,  0.01394244, -0.01642079,
         0.00385342,  0.03158551, -0.00428701, -0.05154535, -0.00316447,
        -0.00442164,  0.01844074, -0.04193947,  0.00875109, -0.03944056,
         0.02268142,  0.0019424 ,  0.020

In [None]:
# Inspect the first (webindex, unweighted embedding) pair for comparison.
unweighted_vectors[0]

(72635560,
 array([-0.19557409,  0.2963686 ,  0.14507405,  0.15054792,  0.2149704 ,
        -0.51802593,  0.09807508,  0.53277284, -0.11302419,  0.03250095,
        -0.29068333, -0.38897678, -0.014274  ,  0.26512623,  0.21947691,
        -0.18154569,  0.02229173, -0.57235825, -0.0184817 , -0.5610179 ,
         0.14199452,  0.14119534,  0.14818226, -0.18308073, -0.02961441,
        -0.12412178, -0.256904  , -0.18588604, -0.38870522,  0.05013752,
         0.5203427 , -0.00170244, -0.15800828, -0.15386477,  0.13448286,
         0.26267576,  0.0709855 , -0.25161543, -0.05869814, -0.8890644 ,
        -0.28155988, -0.26170054, -0.08959787,  0.1584724 ,  0.12599689,
        -0.05821666, -0.29773456,  0.10016868, -0.03220515,  0.16988215,
         0.25919285, -0.19287044, -0.1674825 ,  0.13462926, -0.16617745,
         0.03486444,  0.3236793 , -0.0331192 , -0.5220134 , -0.03141237,
        -0.04559215,  0.18895474, -0.42297697,  0.09346086, -0.39689425,
         0.22818363,  0.01521933,  0.218

In [None]:
def save_vectors(vectors, filename):
    """Write embeddings to a text file, one line per advert.

    Each line has the form ``#<webindex>, i:v, i:v, ...`` where ``i`` is the
    component position and ``v`` its value; components equal to zero are
    omitted. The file is overwritten if it already exists.

    Args:
        vectors: Iterable of ``(webindex, vector)`` pairs; ``vector`` is any
            iterable of numbers.
        filename: Destination path.
    """
    with open(filename, 'w') as file:
        for webindex, vector in vectors:
            nonzero_entries = []
            for position, value in enumerate(vector):
                if value != 0:
                    nonzero_entries.append(f"{position}:{value}")
            sparse_representation = ", ".join(nonzero_entries)
            file.write(f"#{webindex}, {sparse_representation}\n")

In [None]:
# Save the unweighted (plain mean) embeddings, one "#webindex, i:v, ..." line per advert
save_vectors(unweighted_vectors, '/content/drive/MyDrive/Natural Language Processing/Assignment5_Solution/unweighted_vectors.txt')
# Save the TF-IDF-weighted embeddings in the same format
save_vectors(weighted_vectors, '/content/drive/MyDrive/Natural Language Processing/Assignment5_Solution/weighted_vectors.txt')