## Read the data


In [1]:
import pandas as pd
import numpy as np
import sqlite3
import regex as re
from tqdm.auto import tqdm
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
con = sqlite3.connect("wiki_articles_hw1_extended.db")

df = pd.read_sql_query("SELECT * from wiki_articles_hw1_extended", con)
con.close()
#df 

In [3]:
def single_words(df, field):
    return [w.lower() for words in df[field] for w in re.split(r'\||\#', words) if len(w)>1 and w.lower() not in STOP_WORDS]

In [4]:
nouns = Counter(single_words(df, "nouns"))
    
voc = [w[0] for w in nouns.most_common(7000)]
#voc.index("information")

### Tasks 1a and 1b

In [5]:
print(df.columns)

Index(['title', 'text', 'name', 'url', 'datePublished', 'dateModified',
       'headline', 'nouns', 'adjectives', 'verbs', 'lemmas', 'nav', 'entities',
       'noun_chunks', 'no_tokens', 'no_sentences', 'no_noun_chunks'],
      dtype='object')


In [6]:
len(df)

85

In [7]:
print(len(voc) == 7000)

True


In [8]:
print(nouns[voc[-1]] > 0)

True


In [11]:
print(df.head())

                  title                                               text  \
0            Abuse case  From Wikipedia, the free encyclopedia\n\n\nAbu...   
1   Access-control list  From Wikipedia, the free encyclopedia\n\n\nLis...   
2    Antivirus software  From Wikipedia, the free encyclopedia\n\n\nCom...   
3  Application security  From Wikipedia, the free encyclopedia\n\n\nMea...   
4  Application firewall  From Wikipedia, the free encyclopedia\n\n\nLay...   

                   name                                                url  \
0            Abuse case           https://en.wikipedia.org/wiki/Abuse_case   
1   Access-control list  https://en.wikipedia.org/wiki/Access-control_list   
2    Antivirus software   https://en.wikipedia.org/wiki/Antivirus_software   
3  Application security  https://en.wikipedia.org/wiki/Application_secu...   
4  Application firewall  https://en.wikipedia.org/wiki/Application_fire...   

          datePublished          dateModified  \
0  2010-03-19

# Co-occurrence analysis (main homework)


In [10]:
coocc_matrix = np.zeros((7000, 7000), dtype=int)

window_size = 5

# Extract all tokens (nouns) from the dataframe
tokens = single_words(df, "nouns")

# Process the tokens to fill the co-occurrence matrix
for i in range(len(tokens)):
    token_i = tokens[i]
    if token_i in voc:
        idx_i = voc.index(token_i)

        # Define the window for co-occurrence
        window_start = max(0, i - window_size)
        window_end = min(len(tokens), i + window_size + 1)

        for j in range(window_start, window_end):
            token_j = tokens[j]
            if token_j in voc:
                idx_j = voc.index(token_j)

                coocc_matrix[idx_i, idx_j] += 1
                coocc_matrix[idx_j, idx_i] += 1

# Example of checking the co-occurrence of 'information' and 'security'
index_info = voc.index("information")
index_security = voc.index("security")
print(f"Co-occurrence of 'information' and 'security': {coocc_matrix[index_info][index_security]}")

# Optional: Output the shape of the co-occurrence matrix and some matrix stats
print("Shape of co-occurrence matrix:", coocc_matrix.shape)
print("Sample co-occurrence matrix values:", coocc_matrix)


Co-occurrence of 'information' and 'security': 1350
Shape of co-occurrence matrix: (7000, 7000)
Sample co-occurrence matrix values: [[6764 1066  906 ...    2    0    0]
 [1066 5116  644 ...    0    0    0]
 [ 906  644 4526 ...    2    0    2]
 ...
 [   2    0    2 ...    2    0    0]
 [   0    0    0 ...    0    2    2]
 [   0    0    2 ...    0    2    2]]


In [12]:
def is_symmetric(matrix):
    rows, cols = matrix.shape
    for i in range(rows):
        for j in range(i, cols):
            if matrix[i, j] != matrix[j, i]:
                return False
    return True

# Check if the co-occurrence matrix is symmetric
symmetric_check = is_symmetric(coocc_matrix)
print(f"Is the co-occurrence matrix symmetric? {symmetric_check}")


Is the co-occurrence matrix symmetric? True


In [13]:
stochastic_matrix = coocc_matrix / (coocc_matrix.sum(axis=0, keepdims=True) + 0.001)

def pagerank(M, num_iterations=100, d=0.85):
    N = M.shape[1]
    v = np.ones(N) / N  
    for i in range(num_iterations):
        v = (1 - d) / N + d * M.dot(v)
    return v

pagerank_values = pagerank(stochastic_matrix)

top_10_indices = np.argsort(pagerank_values)[-10:][::-1]
top_10_tokens = [(voc[i], pagerank_values[i]) for i in top_10_indices]

# Output the top 10 most important tokens
print("Top 10 most important tokens by PageRank:")
for token, score in top_10_tokens:
    print(f"Token: {token}, PageRank score: {score}")

Top 10 most important tokens by PageRank:
Token: security, PageRank score: 0.020240331159672404
Token: system, PageRank score: 0.016933937561286475
Token: software, PageRank score: 0.014369306488398249
Token: computer, PageRank score: 0.010477543094239448
Token: attack, PageRank score: 0.009802975267342983
Token: user, PageRank score: 0.009101839919493532
Token: information, PageRank score: 0.009030091902604247
Token: application, PageRank score: 0.006579550848280298
Token: code, PageRank score: 0.006312133826602271
Token: datum, PageRank score: 0.006183375710124651


In [14]:
dice_matrix = np.copy(coocc_matrix)

def dice_coefficient(coocc_matrix, i, j):
    """Computes the Dice coefficient for tokens i and j."""
    intersection = coocc_matrix[i, j]
    sum_of_rows = coocc_matrix[i, i] + coocc_matrix[j, j]
    if sum_of_rows == 0:
        return 0 
    return (2 * intersection) / sum_of_rows

num_tokens = coocc_matrix.shape[0]

for i in range(num_tokens):
    for j in range(num_tokens):
        dice_matrix[i, j] = dice_coefficient(coocc_matrix, i, j)

# Output a few example values from the updated dice_matrix
print("Sample Dice coefficient values from dice_matrix:")
print(dice_matrix[:5, :5])  # Print a 5x5 sample from the matrix

Sample Dice coefficient values from dice_matrix:
[[1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]
 [0 0 0 1 0]
 [0 0 0 0 1]]


In [15]:
idx_software = voc.index("software")
idx_security = voc.index("security")

dice_software_security = dice_coefficient(coocc_matrix, idx_software, idx_security)
dice_security_software = dice_coefficient(coocc_matrix, idx_security, idx_software)

print(f"Dice coefficient ('software', 'security'): {dice_software_security}")
print(f"Dice coefficient ('security', 'software'): {dice_security_software}")

if dice_software_security == dice_security_software:
    print("The Dice coefficients for both ('software', 'security') and ('security', 'software') match.")
else:
    print("The Dice coefficients for ('software', 'security') and ('security', 'software') do not match.")

Dice coefficient ('software', 'security'): 0.16049601417183348
Dice coefficient ('security', 'software'): 0.16049601417183348
The Dice coefficients for both ('software', 'security') and ('security', 'software') match.


In [17]:
from sklearn.metrics.pairwise import cosine_similarity

software_vector = dice_matrix[idx_software].reshape(1, -1)  # Reshape to 2D array for sklearn
security_vector = dice_matrix[idx_security].reshape(1, -1)

# Step 3: Calculate cosine similarity using sklearn's function
cosine_sim_sklearn = cosine_similarity(software_vector, security_vector)[0][0]

# Output the result
print(f"Cosine similarity (sklearn) between 'software' and 'security': {cosine_sim_sklearn}")

Cosine similarity (sklearn) between 'software' and 'security': 0.0


In [19]:
software_vector = dice_matrix[idx_software]
security_vector = dice_matrix[idx_security]
def cosine_similarity_np(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

cosine_sim_np = cosine_similarity_np(software_vector, security_vector)

# Output the result
print(f"Cosine similarity (NumPy) between 'software' and 'security': {cosine_sim_np}")

Cosine similarity (NumPy) between 'software' and 'security': 0.0


The key observation is that **cosine similarity** between the context vectors of "software" and "security" is `0`, while the **Dice coefficient** is `0.16`. 

### Interpretation of the Results:

1. **Cosine Similarity**:
   - Cosine similarity measures the **angular distance** between two vectors. A value of `0` indicates that the context vectors of "software" and "security" are **orthogonal**, meaning there is no similarity or correlation in the directions of their context vectors. In other words, "software" and "security" do not tend to occur in the same context in the dataset based on their context vectors.

2. **Dice Coefficient**:
   - The Dice coefficient measures **pairwise overlap** between the occurrences of two terms. A value of `0.16` suggests that there is some degree of co-occurrence between "software" and "security", meaning they do appear together in certain contexts. The Dice coefficient focuses on the **raw frequency of co-occurrences** rather than the direction of the context vectors.

### Conclusion:
- The **cosine similarity** being `0` and the **Dice coefficient** being `0.16` shows that "software" and "security" may share some overlap in terms of co-occurrences, but their **context vectors** are different enough to be considered dissimilar in terms of their usage or context in the dataset. The Dice coefficient is influenced by the **frequency of co-occurrence**, whereas cosine similarity emphasizes the **relative direction** of the entire context.


In [20]:
software_vector = dice_matrix[idx_software].reshape(1, -1)

cosine_similarities = cosine_similarity(dice_matrix, software_vector).flatten()

top_10_indices = np.argsort(cosine_similarities)[-10:][::-1]

print("Top 10 most similar tokens to 'software' by cosine similarity:")
for idx in top_10_indices:
    print(f"Token: {voc[idx]}, Cosine Similarity: {cosine_similarities[idx]}")

Top 10 most similar tokens to 'software' by cosine similarity:
Token: software, Cosine Similarity: 1.0
Token: misnomer, Cosine Similarity: 0.0
Token: net, Cosine Similarity: 0.0
Token: tail, Cosine Similarity: 0.0
Token: graduate, Cosine Similarity: 0.0
Token: vbscript, Cosine Similarity: 0.0
Token: lan, Cosine Similarity: 0.0
Token: countermeasures, Cosine Similarity: 0.0
Token: xerox, Cosine Similarity: 0.0
Token: ethernet, Cosine Similarity: 0.0


### Step 1: My Prompt to ChatGPT
**Prompt:**
*"Could you explain how Text-representing Centroids (TRC) can be implemented using static word embeddings? What conditions are necessary for this implementation? What are the advantages and disadvantages of using static embeddings for TRC, and in which application areas would this approach be most effective?"*

### Step 2: Version of ChatGPT Used
I utilized **ChatGPT-4** to generate this response. GPT-4 is recognized for its detailed, coherent responses and strong explanations.

### Step 3: ChatGPT’s Complete Answer
**ChatGPT's Answer:**
The **Text-representing Centroid (TRC)** approach can be realized using **static word embeddings**. Below is an explanation of the process, required conditions, benefits, limitations, and appropriate applications for this method.

#### Implementing TRC Using Static Word Embeddings:
A **centroid** refers to an average vector that summarizes a set of vectors. When applied to text, the goal is to derive a centroid vector representing an entire document by averaging the word embeddings of each word in the text.

For static word embeddings (such as those provided by Word2Vec, GloVe, or FastText), the centroid is computed by taking the mean of the embedding vectors corresponding to each word in the document. This centroid effectively summarizes the document’s content.

#### Required Conditions:
- **Availability of Pre-trained Word Embeddings**: You must have access to pre-trained static word embeddings (like GloVe or Word2Vec), allowing each word in the document to map to a vector.
- **Comprehensive Word Coverage**: The embeddings must cover the majority of the words in the text to ensure that the centroid accurately represents the document. Missing embeddings for many words can distort the final representation.
- **Preprocessing**: Text preprocessing, such as removing unnecessary punctuation or stopwords, ensures that only meaningful words contribute to the centroid.
- **Dimensional Consistency**: All word vectors should be of the same dimensionality to allow for proper averaging.

#### Advantages of Static Word Embeddings for TRC:
1. **Ease of Use**: Static embeddings are relatively easy to compute, and calculating a centroid by averaging vectors is computationally efficient.
2. **Speed**: Pre-trained embeddings allow for rapid implementation without needing to train embeddings from scratch.
3. **Simplicity in Interpretation**: Since static embeddings are fixed, the centroid can be interpreted as the average meaning of the document's words.

#### Disadvantages of Static Word Embeddings for TRC:
1. **Context Insensitivity**: Static embeddings do not account for word context, so the same word is always represented by the same vector, which can lead to inaccuracies, especially for words with multiple meanings.
2. **Loss of Word Order**: Averaging embeddings disregards word order, meaning important syntactic information is lost.
3. **Lack of Up-to-Date Representations**: Static embeddings may not capture newer word usages or domain-specific meanings, limiting their effectiveness.
4. **Out-of-Vocabulary Words**: Words that lack pre-trained embeddings are ignored, potentially distorting the centroid.

#### Suitable Application Areas:
1. **Document Classification**: Using the centroid to represent a document can facilitate its classification based on semantic content.
2. **Information Retrieval**: Similarity between document centroids can be useful for retrieving related documents.
3. **Summarization**: Centroids can help summarize text content for applications like document or topic summarization.
4. **Recommendation Systems**: Centroids can represent documents, articles, or other entities for similarity-based recommendations.

### Step 4: My Opinion of ChatGPT's Answer
ChatGPT’s response is **accurate**, **concise**, and **well-organized**. It provides a solid explanation of how TRC can be implemented using static word embeddings and discusses the necessary conditions, advantages, disadvantages, and potential application areas.

#### Evaluation of ChatGPT's Answer:
- **Correctness**: The explanation is precise, clearly describing how TRC can be computed by averaging word embeddings and aligning with current knowledge on static embeddings.
- **Coherence**: The response follows a logical structure and covers all aspects in a sequential, coherent manner.
- **Comprehensive**: The response addresses all key points—implementation, necessary conditions, pros and cons, and use cases—without leaving out critical information.
- **Clarity**: The explanation is easy to follow, using simple language to convey even technical concepts, making it accessible for a broad audience.

In conclusion, ChatGPT’s answer is well-structured and offers a comprehensive overview of how TRCs can be implemented using static word embeddings, including their advantages, limitations, and application areas. This explanation can be utilized as a reference or further discussion in a Jupyter notebook or related work.
