In [None]:
from pathlib import Path

# Set the path of the local_data directory (hard-coded absolute Windows path).
local_data_path = Path(r"C:\Users\Administrator\Desktop\MAN7916\local_data")

# Make sure the local_data directory exists: missing parent directories are
# created and an already-existing directory is not treated as an error.
local_data_path.mkdir(parents=True, exist_ok=True)

print(f"Local data path: {local_data_path}")

import requests

# Remote archive and the local file it is stored in.
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
glove_zip_path = local_data_path / "glove.6B.zip"

if not glove_zip_path.exists():
    print("Downloading GloVe embeddings... This may take a while.")
    # Stream into a temporary ".part" file and rename it only after the
    # transfer finished. An interrupted download therefore never leaves a
    # truncated archive at glove_zip_path, which the exists() check above
    # would otherwise accept as complete on the next run.
    partial_zip_path = glove_zip_path.with_name(glove_zip_path.name + ".part")
    try:
        # timeout=(connect, read) in seconds; without it a stalled server
        # blocks the cell forever. The context manager releases the streamed
        # connection even when an error occurs.
        with requests.get(glove_url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()
            with open(partial_zip_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        partial_zip_path.replace(glove_zip_path)
    finally:
        # Still present only when the download failed part-way; discard it.
        if partial_zip_path.exists():
            partial_zip_path.unlink()

    print("Download complete.")
else:
    print("GloVe embeddings already downloaded.")



Local data path: C:\Users\Administrator\Desktop\MAN7916\local_data
Downloading GloVe embeddings... This may take a while.
Download complete.


In [5]:
import zipfile

# Extract the archive next to the zip file. Members that already exist on disk
# with the size recorded in the archive are skipped, so re-running the cell
# does not rewrite the embedding files every time.
if glove_zip_path.exists():
    print("Unzipping GloVe embeddings...")
    with zipfile.ZipFile(glove_zip_path, "r") as zip_ref:
        for member in zip_ref.infolist():
            target = local_data_path / member.filename
            # A missing or size-mismatched file (e.g. from an interrupted
            # earlier extraction) is extracted again; directories are cheap.
            if (
                member.is_dir()
                or not target.exists()
                or target.stat().st_size != member.file_size
            ):
                zip_ref.extract(member, local_data_path)
    print("Unzipped GloVe embeddings.")


Unzipping GloVe embeddings...
Unzipped GloVe embeddings.


In [6]:
# List every GloVe-related file in the data directory. glob() makes no
# ordering guarantee, so sort to keep the printed listing reproducible.
glove_files = sorted(local_data_path.glob("*glove*"))
print("Downloaded GloVe files:")
for file in glove_files:
    print(file)


Downloaded GloVe files:
C:\Users\Administrator\Desktop\MAN7916\local_data\glove.6B.100d.txt
C:\Users\Administrator\Desktop\MAN7916\local_data\glove.6B.200d.txt
C:\Users\Administrator\Desktop\MAN7916\local_data\glove.6B.300d.txt
C:\Users\Administrator\Desktop\MAN7916\local_data\glove.6B.50d.txt
C:\Users\Administrator\Desktop\MAN7916\local_data\glove.6B.zip


In [5]:
import gensim
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from collections import Counter
from docx import Document

# File paths (hard-coded absolute Windows locations)
# Input: GloVe vectors in plain-text format, parsed by load_glove_model.
glove_path = "C:/Users/Administrator/Desktop/MAN7916/local_data/glove.6B.100d.txt"
# Input: article text, read by get_words_from_text.
article_path = "C:/Users/Administrator/Desktop/MAN7916/assignments/materials/week_4/article_preprint.txt"
# Output: CSV word list written with DataFrame.to_csv.
csv_output_path = "C:/Users/Administrator/Desktop/MAN7916/assignments/submissions/assignment_5/word_list_for_evaluation.csv"
# Output: Word report written with doc.save.
docx_output_path = "C:/Users/Administrator/Desktop/MAN7916/assignments/submissions/assignment_5/word_embedding_results.docx"

# Load GloVe model
def load_glove_model(file_path):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line holds a token followed by its space-separated vector
    components.

    Args:
        file_path: Path of the UTF-8 encoded GloVe ``.txt`` file.

    Returns:
        dict mapping each token (str) to a 1-D ``np.float32`` array.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    model = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) carry no entry.
            if not line.strip():
                continue
            # Split on the literal space only. A bare split() also splits on
            # Unicode whitespace such as U+00A0, which silently corrupts any
            # entry whose token is, or contains, such a character.
            word, *values = line.rstrip("\r\n").split(" ")
            # `if val` tolerates doubled or trailing spaces between components.
            model[word] = np.array([float(val) for val in values if val], dtype=np.float32)
    return model

# Parse the whole GloVe file at glove_path into an in-memory {word: vector} dict.
glove_model = load_glove_model(glove_path)

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    SciPy's ``cosine`` yields a distance, so the similarity is one minus it.
    """
    distance = cosine(vec1, vec2)
    return 1 - distance

# Seed terms; only those present in the GloVe vocabulary receive a vector entry.
root_words = ["entrepreneurial", "creative", "innovative", "trailblazing"]
word_vectors = dict(
    (word, glove_model[word])
    for word in root_words
    if word in glove_model
)

In [6]:
# Compute pairwise cosine similarities
# Work from word_vectors, which still holds every root word that has an
# embedding, rather than from root_words. Re-running this cell then gives the
# same result instead of dropping one more word each time, and a root word
# missing from the GloVe vocabulary cannot cause a NaN mean or a KeyError below.
candidate_words = list(word_vectors)
if len(candidate_words) < 2:
    raise ValueError("Need at least two root words with GloVe vectors to compare.")

similarities = {}
for i, word1 in enumerate(candidate_words):
    for word2 in candidate_words[i + 1:]:
        similarities[(word1, word2)] = cosine_similarity(word_vectors[word1], word_vectors[word2])

# Identify the word with the lowest average similarity
avg_similarities = {
    word: np.mean([sim for pair, sim in similarities.items() if word in pair])
    for word in candidate_words
}
worst_word = min(avg_similarities, key=avg_similarities.get)

# Drop the least similar word (in place, so root_words keeps its identity)
root_words[:] = [word for word in candidate_words if word != worst_word]

# Compute average vector
average_vector = np.mean([word_vectors[word] for word in root_words], axis=0)

# Save results to Word file
def _add_section(document, title, lines):
    """Append a level-2 heading followed by one paragraph per entry in ``lines``."""
    document.add_heading(title, level=2)
    for text in lines:
        document.add_paragraph(text)


doc = Document()
doc.add_heading("Word Embedding Results", level=1)

# Sections are appended in the order they appear in the finished report.
_add_section(
    doc,
    "Pairwise Cosine Similarities",
    (f"{w1} - {w2}: {sim:.4f}" for (w1, w2), sim in similarities.items()),
)
_add_section(doc, "Removed Root Word", [f"Removed word: {worst_word}"])
_add_section(doc, "Average Vector (First 5 Dimensions)", [str(average_vector[:5])])
_add_section(
    doc,
    "Cosine Similarities to Average Vector",
    (
        f"{word}: {cosine_similarity(average_vector, word_vectors[word]):.4f}"
        for word in root_words
    ),
)

doc.save(docx_output_path)

# Find 50 closest words to average vector
def find_similar_words(glove_model, vector, top_n=50):
    """Rank the words of ``glove_model`` by cosine similarity to ``vector``.

    Args:
        glove_model: Mapping from word to embedding vector.
        vector: Query vector compared against every embedding.
        top_n: Maximum number of matches to return (default 50).

    Returns:
        List of ``(word, similarity)`` tuples, highest similarity first.
    """
    scored = [(word, cosine_similarity(vector, vec)) for word, vec in glove_model.items()]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:top_n]

# Nearest neighbours of the averaged root-word vector across the whole GloVe
# vocabulary (find_similar_words' default top_n of 50 applies).
deductive_words = find_similar_words(glove_model, average_vector)

def get_words_from_text(file_path, vocabulary=None):
    """Return the distinct words of a text file that exist in a vocabulary.

    The text is lower-cased and split on whitespace. Each token is considered
    both as-is and with surrounding punctuation removed, so that "ideas." or
    "(innovation)" match the entries "ideas" and "innovation".

    Args:
        file_path: Path of the UTF-8 encoded text file.
        vocabulary: Container of known words (anything supporting ``in``).
            Defaults to the module-level ``glove_model``.

    Returns:
        set of tokens from the file that are contained in ``vocabulary``.
    """
    import string  # local import keeps the function self-contained in the notebook

    if vocabulary is None:
        vocabulary = glove_model

    with open(file_path, "r", encoding="utf-8") as f:
        tokens = set(f.read().lower().split())

    # ASCII punctuation plus the typographic quotes and dashes common in articles.
    strip_chars = string.punctuation + "“”‘’—–…"
    tokens |= {token.strip(strip_chars) for token in tokens}

    # Membership tests avoid building a throwaway set of the whole vocabulary.
    return {token for token in tokens if token in vocabulary}

# Inductive list: rank only the words that actually occur in the article.
# Searching the full vocabulary here would just reproduce deductive_words and
# leave text_words unused.
text_words = get_words_from_text(article_path)
article_vectors = {word: glove_model[word] for word in text_words}
inductive_words = find_similar_words(article_vectors, average_vector, top_n=50)

# Merge lists and remove duplicates (a word found by both keeps a single row)
word_list = dict(deductive_words)
word_list.update(inductive_words)

# Save to CSV; the empty "eval" column is left for manual evaluation
df = pd.DataFrame(list(word_list.items()), columns=["word", "score"])
df["eval"] = ""
df.to_csv(csv_output_path, index=False)

print(f"Results saved to {docx_output_path} and {csv_output_path}")


Results saved to C:/Users/Administrator/Desktop/MAN7916/assignments/submissions/assignment_5/word_embedding_results.docx and C:/Users/Administrator/Desktop/MAN7916/assignments/submissions/assignment_5/word_list_for_evaluation.csv
