<a href="https://colab.research.google.com/github/agrigoridou/Word-embeddings-and-Recurrent-Neural-Networks-/blob/main/%CE%91_Word_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install gensim



In [5]:
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Φόρτωση των προ-εκπαιδευμένων μοντέλων Word2Vec και GloVe

In [6]:
# Φόρτωση Word2Vec (Google News 300)
word2vec_model = api.load("word2vec-google-news-300")

# Φόρτωση GloVe (Wiki-Gigaword 300)
glove_model = api.load("glove-wiki-gigaword-300")



# Συνάρτηση για εύρεση κοντινών λέξεων

In [7]:
def find_closest_words(model, word, topn=10):
    try:
        return model.most_similar(word, topn=topn)
    except KeyError:
        return f"The word '{word}' is not in the vocabulary."

# Ερώτημα 1: 10 πιο κοντινές λέξεις για προκαθορισμένες λέξεις

In [14]:
# Ορισμός των λέξεων
words = ['car', 'jaguar', 'Jaguar', 'facebook']

In [15]:
# Αποθήκευση αποτελεσμάτων για το Word2Vec και το GloVe
results = {}
for word in words:
    results[word] = {
        "word2vec": find_closest_words(word2vec_model, word),
        "glove": find_closest_words(glove_model, word),
    }

In [16]:
# Εκτύπωση των αποτελεσμάτων
for word in words:
    print(f"Results for word: {word}")
    print("Word2Vec:", results[word]["word2vec"])
    print("GloVe:", results[word]["glove"])
    print()

Results for word: car
Word2Vec: [('vehicle', 0.7821096181869507), ('cars', 0.7423831224441528), ('SUV', 0.7160962224006653), ('minivan', 0.6907036900520325), ('truck', 0.6735789775848389), ('Car', 0.6677608489990234), ('Ford_Focus', 0.667320191860199), ('Honda_Civic', 0.6626849174499512), ('Jeep', 0.651133120059967), ('pickup_truck', 0.6441438794136047)]
GloVe: [('cars', 0.7827162146568298), ('vehicle', 0.7655367851257324), ('truck', 0.7350621819496155), ('driver', 0.7114784717559814), ('driving', 0.6442225575447083), ('vehicles', 0.6328005194664001), ('motorcycle', 0.6022513508796692), ('automobile', 0.595572829246521), ('parked', 0.5910030603408813), ('drivers', 0.5778359770774841)]

Results for word: jaguar
Word2Vec: [('jaguars', 0.6738404631614685), ('Macho_B', 0.6313095688819885), ('panther', 0.608633816242218), ('lynx', 0.5814595818519592), ('rhino', 0.5754255056381226), ('lizard', 0.560685396194458), ('tapir', 0.5563079118728638), ('tiger', 0.5528684854507446), ('leopard', 0.547

## Εύρεση κοινών λέξεων

In [17]:
common_words = {}
for word in words:
    w2v_words = [item[0] for item in results[word]["word2vec"]]
    glove_words = [item[0] for item in results[word]["glove"]]
    common_words[word] = set(w2v_words).intersection(set(glove_words))


In [18]:
# Εκτύπωση κοινών λέξεων
for word in words:
    print(f"Common words for '{word}': {common_words[word]}")
    print(f"Number of common words: {len(common_words[word])}")

Common words for 'car': {'truck', 'cars', 'vehicle'}
Number of common words: 3
Common words for 'jaguar': set()
Number of common words: 0
Common words for 'Jaguar': set()
Number of common words: 0
Common words for 'facebook': {'linkedin', 'myspace', 'twitter'}
Number of common words: 3


# Ερώτημα 2: Επαναλήψεις για λέξεις επιλογής σας

In [19]:
# Ορισμός των νέων λέξεων
new_words = ['computer', 'dog', 'school', 'king']

In [20]:
# Αποθήκευση αποτελεσμάτων για το Word2Vec και το GloVe για τις νέες λέξεις
new_results = {}
for word in new_words:
    new_results[word] = {
        "word2vec": find_closest_words(word2vec_model, word),
        "glove": find_closest_words(glove_model, word),
    }

In [21]:
# Εκτύπωση των αποτελεσμάτων
for word in new_words:
    print(f"Results for word: {word}")
    print("Word2Vec:", new_results[word]["word2vec"])
    print("GloVe:", new_results[word]["glove"])
    print()

Results for word: computer
Word2Vec: [('computers', 0.7979379892349243), ('laptop', 0.6640493273735046), ('laptop_computer', 0.6548868417739868), ('Computer', 0.647333562374115), ('com_puter', 0.6082080006599426), ('technician_Leonard_Luchko', 0.5662748217582703), ('mainframes_minicomputers', 0.5617720484733582), ('laptop_computers', 0.5585449934005737), ('PC', 0.5539618730545044), ('maker_Dell_DELL.O', 0.5519254207611084)]
GloVe: [('computers', 0.8248152732849121), ('software', 0.7334420084953308), ('pc', 0.6240139603614807), ('technology', 0.6198545098304749), ('computing', 0.6178765296936035), ('laptop', 0.5955509543418884), ('internet', 0.5857782363891602), ('ibm', 0.5825320482254028), ('systems', 0.5744993686676025), ('hardware', 0.5728795528411865)]

Results for word: dog
Word2Vec: [('dogs', 0.8680489659309387), ('puppy', 0.8106428384780884), ('pit_bull', 0.780396044254303), ('pooch', 0.7627376914024353), ('cat', 0.7609457969665527), ('golden_retriever', 0.7500901818275452), ('Ge

In [22]:
# Εύρεση κοινών λέξεων για τις νέες λέξεις
new_common_words = {}
for word in new_words:
    w2v_words = [item[0] for item in new_results[word]["word2vec"]]
    glove_words = [item[0] for item in new_results[word]["glove"]]
    new_common_words[word] = set(w2v_words).intersection(set(glove_words))

In [23]:
# Εκτύπωση κοινών λέξεων για τις νέες λέξεις
for word in new_words:
    print(f"Common words for '{word}': {new_common_words[word]}")
    print(f"Number of common words: {len(new_common_words[word])}")

Common words for 'computer': {'laptop', 'computers'}
Number of common words: 2
Common words for 'dog': {'cat', 'dogs', 'puppy'}
Number of common words: 3
Common words for 'school': {'schools', 'teacher', 'students', 'elementary', 'kindergarten'}
Number of common words: 5
Common words for 'king': {'throne', 'queen', 'monarch', 'prince', 'kings'}
Number of common words: 5


# Ερώτημα 3: Φιλτράρισμα λέξεων για 'student'

##10 πιο κοντινές λέξεις για τη λέξη "student":

In [24]:
# Λέξη που θα εξετάσουμε
word = 'student'

In [25]:
# Βρίσκουμε τις 10 πιο κοντινές λέξεις στο Word2Vec και στο GloVe
word2vec_similar_words = find_closest_words(word2vec_model, word)
glove_similar_words = find_closest_words(glove_model, word)

In [26]:
# Εκτύπωση των αποτελεσμάτων
print(f"10 most similar words for 'student' according to Word2Vec:")
print(word2vec_similar_words)

print(f"\n10 most similar words for 'student' according to GloVe:")
print(glove_similar_words)

10 most similar words for 'student' according to Word2Vec:
[('students', 0.7294867038726807), ('Student', 0.6706662774085999), ('teacher', 0.6301366090774536), ('stu_dent', 0.6240993142127991), ('faculty', 0.6087332963943481), ('school', 0.6055627465248108), ('undergraduate', 0.6020305752754211), ('university', 0.600540041923523), ('undergraduates', 0.5755698680877686), ('semester', 0.573759913444519)]

10 most similar words for 'student' according to GloVe:
[('students', 0.7690913677215576), ('teacher', 0.6873654723167419), ('graduate', 0.6737601161003113), ('school', 0.6130647659301758), ('college', 0.6090279221534729), ('undergraduate', 0.6043776273727417), ('faculty', 0.599898636341095), ('university', 0.5970513224601746), ('academic', 0.5810065865516663), ('campus', 0.5767688155174255)]


##Αποκλεισμός λέξεων που σχετίζονται με φοιτητές πανεπιστημίου και μαθητές Δημοτικού-Γυμνασίου-Λυκείου:

Για να αφαιρέσουμε τις λέξεις που σχετίζονται με φοιτητές πανεπιστημίου ή μαθητές σχολείου, μπορούμε να δημιουργήσουμε δύο λίστες από λέξεις που συσχετίζονται με αυτές τις κατηγορίες και να τις αφαιρέσουμε από τα αποτελέσματα. Αυτές οι λέξεις μπορεί να περιλαμβάνουν "university", "college", "professor", "highschool", "school", "teacher", κ.λπ.

In [27]:
# Λίστες λέξεων που σχετίζονται με φοιτητές πανεπιστημίου και μαθητές
university_related_words = ['university', 'college', 'professor', 'campus', 'graduate', 'academic', 'degree']
school_related_words = ['school', 'teacher', 'highschool', 'classroom', 'curriculum', 'exam', 'homework']


In [28]:
# Συνδυασμός των δύο λιστών
exclude_words = university_related_words + school_related_words

In [29]:
# Αφαίρεση των λέξεων που βρίσκονται στις παραπάνω λίστες
filtered_word2vec = [word for word in word2vec_similar_words if word[0] not in exclude_words]
filtered_glove = [word for word in glove_similar_words if word[0] not in exclude_words]


In [30]:
# Εκτύπωση των φιλτραρισμένων αποτελεσμάτων
print(f"\nFiltered Word2Vec (excluding university and school-related words):")
print(filtered_word2vec)

print(f"\nFiltered GloVe (excluding university and school-related words):")
print(filtered_glove)


Filtered Word2Vec (excluding university and school-related words):
[('students', 0.7294867038726807), ('Student', 0.6706662774085999), ('stu_dent', 0.6240993142127991), ('faculty', 0.6087332963943481), ('undergraduate', 0.6020305752754211), ('undergraduates', 0.5755698680877686), ('semester', 0.573759913444519)]

Filtered GloVe (excluding university and school-related words):
[('students', 0.7690913677215576), ('undergraduate', 0.6043776273727417), ('faculty', 0.599898636341095)]


# Ερώτημα 4: Αναλογίες

In [31]:
# Αναλογίες
analogies = [
    ('king', 'man', 'woman'),
    ('France', 'Paris', 'Tokyo'),
    ('trees', 'apples', 'grapes'),
    ('swimming', 'walking', 'walked'),
    ('doctor', 'father', 'mother')
]

In [34]:
# Συνάρτηση για την αναλογία με έλεγχο αν οι λέξεις υπάρχουν στο λεξικό του μοντέλου
def find_analogy(model, word1, word2, word3):
    # Ελέγχει αν όλες οι λέξεις υπάρχουν στο λεξικό του μοντέλου
    if word1 in model.key_to_index and word2 in model.key_to_index and word3 in model.key_to_index:
        result = model.most_similar(positive=[word3, word2], negative=[word1], topn=2)
        return result
    else:
        return f"One or more words not in vocabulary: {word1}, {word2}, {word3}"

In [35]:
# Εκτέλεση για κάθε αναλογία
for analogy in analogies:
    word1, word2, word3 = analogy
    word2vec_result = find_analogy(word2vec_model, word1, word2, word3)
    glove_result = find_analogy(glove_model, word1, word2, word3)

    print(f"Analogy: {word1} - {word2} + {word3}")

    print(f"Word2Vec closest words: {word2vec_result}")
    print(f"GloVe closest words: {glove_result}")
    print("-" * 50)

Analogy: king - man + woman
Word2Vec closest words: [('teenage_girl', 0.626004159450531), ('girl', 0.598484456539154)]
GloVe closest words: [('girl', 0.6010250449180603), ('person', 0.5670419931411743)]
--------------------------------------------------
Analogy: France - Paris + Tokyo
Word2Vec closest words: [('Toyko', 0.626667857170105), ('Osaka', 0.6135361194610596)]
GloVe closest words: One or more words not in vocabulary: France, Paris, Tokyo
--------------------------------------------------
Analogy: trees - apples + grapes
Word2Vec closest words: [('grape', 0.670628547668457), ('peaches', 0.610050618648529)]
GloVe closest words: [('oranges', 0.5917646288871765), ('peaches', 0.5814932584762573)]
--------------------------------------------------
Analogy: swimming - walking + walked
Word2Vec closest words: [('walk', 0.6318311095237732), ('strode', 0.5944611430168152)]
GloVe closest words: [('walk', 0.6345222592353821), ('walks', 0.6176889538764954)]
--------------------------------

# Ερώτημα 5: Custom αναλογίες

In [36]:
# Ορισμός νέων αναλογιών
new_analogies = [
    ('doctor', 'hospital', 'nurse'),
    ('king', 'queen', 'prince'),
    ('teacher', 'school', 'classroom')
]

In [37]:
# Συνάρτηση για την αναλογία με έλεγχο αν οι λέξεις υπάρχουν στο λεξικό του μοντέλου
def find_analogy(model, word1, word2, word3):
    # Ελέγχει αν όλες οι λέξεις υπάρχουν στο λεξικό του μοντέλου
    if word1 in model.key_to_index and word2 in model.key_to_index and word3 in model.key_to_index:
        result = model.most_similar(positive=[word3, word2], negative=[word1], topn=2)
        return result
    else:
        return f"One or more words not in vocabulary: {word1}, {word2}, {word3}"


In [38]:
# Εκτέλεση για κάθε αναλογία
for analogy in new_analogies:
    word1, word2, word3 = analogy
    word2vec_result = find_analogy(word2vec_model, word1, word2, word3)
    glove_result = find_analogy(glove_model, word1, word2, word3)

    print(f"Analogy: {word1} - {word2} + {word3}")

    print(f"Word2Vec closest words: {word2vec_result}")
    print(f"GloVe closest words: {glove_result}")
    print("-" * 50)

Analogy: doctor - hospital + nurse
Word2Vec closest words: [('Hospital', 0.6594350337982178), ('intensive_care', 0.6123327612876892)]
GloVe closest words: [('nursing', 0.5339133739471436), ('nurses', 0.5213382840156555)]
--------------------------------------------------
Analogy: king - queen + prince
Word2Vec closest words: [('princess', 0.7093082070350647), ('duchess', 0.6415467858314514)]
GloVe closest words: [('princess', 0.6948407292366028), ('duchess', 0.5315698385238647)]
--------------------------------------------------
Analogy: teacher - school + classroom
Word2Vec closest words: [('classrooms', 0.6912325024604797), ('schools', 0.5725624561309814)]
GloVe closest words: [('classrooms', 0.678383469581604), ('campus', 0.5905386805534363)]
--------------------------------------------------
