In [2]:
import time
import random 
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser
import os
from nltk.tokenize import word_tokenize
import pandas as pd

In [2]:
## Bootstrap the article data (resample articles with replacement)

def write_booted_txt(input_file, output_file, seed_no, delimiter=' SENTENCESPLITHERE '):
    """Write a bootstrap resample of ``input_file`` to ``output_file``.

    Every line of ``input_file`` is treated as one article. As many articles
    as the file has lines are drawn with replacement; each drawn article is
    split into sentences on ``delimiter`` and every non-blank sentence is
    written, stripped of surrounding whitespace, on its own line.

    Parameters
    ----------
    input_file : str
        Path of the UTF-8 text file to resample (one article per line).
    output_file : str
        Path of the UTF-8 text file to (over)write with one sentence per line.
    seed_no : int
        Seed for the resampling, so a given seed always yields the same draw.
    delimiter : str, optional
        Marker separating sentences inside an article line.

    Returns
    -------
    None
        If ``input_file`` has no lines, a warning is printed and
        ``output_file`` is neither created nor modified.
    """
    # Read the original text data, one article per line.
    with open(input_file, "r", encoding="utf-8") as src:
        articles = src.readlines()

    if not articles:
        print(f"⚠ No content found in {input_file}, skipping.")
        return

    # A private generator produces the same draws as random.seed(seed_no)
    # followed by random.choices(...), but leaves the process-wide RNG state
    # untouched for any other code that relies on it.
    rng = random.Random(seed_no)
    resampled = rng.choices(articles, k=len(articles))  # sampling with replacement

    with open(output_file, "w", encoding="utf-8") as dst:
        for article in resampled:
            for sentence in article.split(delimiter):
                cleaned = sentence.strip()
                if cleaned:  # skip empty sentences (e.g. two adjacent delimiters)
                    dst.write(cleaned + "\n")
    
# NOTE: these two values are directory prefixes, not files. Both names are
# reassigned to concrete per-subcorpus file paths inside the training loop
# further down before write_booted_txt() is called with them.
input_file = '/Users/yvette/Desktop/data/Final/preprocessed grouped txt'
output_file = '/Users/yvette/Desktop/data/Final/bootstrap txt'

In [3]:
class SentenceIterator:
    """Re-iterable stream of tokenized sentences read from a text file.

    Each line of the file is treated as one sentence. The file is reopened on
    every call to ``__iter__``, so the object can be iterated repeatedly.
    """

    def __init__(self, filepath):
        """Remember the path of the UTF-8 file to read; nothing is opened yet."""
        self.filepath = filepath

    def __iter__(self):
        """Yield ``word_tokenize(line)`` for every line, newline removed."""
        # The context manager guarantees the handle is closed when the pass
        # finishes or the generator is discarded early; a bare open() in the
        # for-statement would leak one file descriptor per pass.
        with open(self.filepath, "r", encoding="utf-8") as handle:
            for line in handle:
                yield word_tokenize(line.rstrip('\n'))

           
class PhrasingIterable(object):
    """Re-iterable view of ``texts`` passed through ``phrasifier``.

    Every call to ``__iter__`` indexes ``phrasifier`` with ``texts`` again and
    returns a fresh iterator over the result, so the transformed stream can be
    consumed more than once.
    """

    def __init__(self, phrasifier, texts):
        """Store the transformer and the underlying sentence source."""
        self.phrasifier = phrasifier
        self.texts = texts

    def __iter__(self):
        """Return a new iterator over ``phrasifier[texts]``."""
        phrased_stream = self.phrasifier[self.texts]
        return iter(phrased_stream)

In [None]:
# Time windows; each (start, end) pair must match the year range embedded in
# the corpus and phraser filenames.
year_ranges = [
    ("1971", "1980"),
    ("1981", "1990"),
    ("1991", "2000"),
    ("2001", "2010"),
    ("2011", "2020"),
    ("2021", "2024")
]

# Journals; each name must match the journal part of the filenames.
journals = [
    "American Journal of Political Science",
    "British Journal of Political Science"
]

# Number of bootstrap resamples (one Word2Vec model each) per subcorpus.
n_bootstraps = 25
# Seconds to wait after each model has been saved.
pause_seconds = 120

# Output locations; created up front so the first write cannot fail on a
# missing directory.
bootstrap_dir = '/Users/yvette/Desktop/data/Final/bootstrap txt'
w2v_model_dir = "/Users/yvette/Desktop/data/Final/bootstrapped_model_bigram"
os.makedirs(bootstrap_dir, exist_ok=True)
os.makedirs(w2v_model_dir, exist_ok=True)

# Loop over each year range and journal to process one subcorpus at a time.
for start_year, end_year in year_ranges:
    for journal in journals:
        # e.g. "1971-1980_American Journal of Political Science"
        subcorpus_name = f"{start_year}-{end_year}_{journal}"

        print(f"Processing subcorpus: {subcorpus_name}...")

        # Load the pre-trained bigram phraser for this subcorpus.
        bigram_model_path = f"/Users/yvette/Desktop/data/Final/bigram phraser full text/bigram_text_{subcorpus_name}.model"
        bigram_transformer = Phraser.load(bigram_model_path)

        # The source text is the same for every bootstrap of this subcorpus.
        input_file = f'/Users/yvette/Desktop/data/Final/preprocessed grouped txt/text_{subcorpus_name}.txt'

        # write_booted_txt() writes nothing for an empty input, which would
        # leave training to either crash on a missing bootstrap file or
        # silently reuse a stale file from an earlier run. Skip explicitly.
        if os.path.getsize(input_file) == 0:
            print(f"Input file for {subcorpus_name} is empty, skipping subcorpus.")
            continue

        for boot in range(n_bootstraps):
            print(f"Processing bootstrap {boot} for {subcorpus_name}...")

            # Write the bootstrapped text; the bootstrap index doubles as the
            # resampling seed so every resample is reproducible.
            output_file = f'{bootstrap_dir}/bootstrapped_text_{subcorpus_name}_boot{boot}.txt'
            write_booted_txt(input_file, output_file, seed_no=boot)

            # Stream the freshly written file back, one tokenized sentence
            # per line, and apply the bigram transformation on the fly.
            sentences = SentenceIterator(output_file)
            corpus = PhrasingIterable(bigram_transformer, sentences)

            # Train the Word2Vec model (skip-gram with negative sampling).
            # NOTE: seed_no only fixes the resampling; no seed is passed to
            # Word2Vec here, so training itself is not seeded.
            model = Word2Vec(corpus, sg=1, vector_size=300, window=10, min_count=10, workers=10, hs=0, negative=15, epochs=10)

            # Destructively L2-normalize all vectors before saving. This is the
            # gensim 4 replacement for the deprecated init_sims(replace=True).
            model.wv.unit_normalize_all()

            # Save the trained Word2Vec model.
            model_save_path = f"{w2v_model_dir}/sg300win10c10iter10_{subcorpus_name}_boot{boot}_.model"
            model.save(model_save_path)

            # Pause between model saves.
            time.sleep(pause_seconds)


In [None]:
### Model evaluation
# For every saved bootstrap model, record the nearest neighbours of one target
# word and write them to a CSV with one row per model.

# Path to models directory and destination of the summary table.
models_dir = "/Users/yvette/Desktop/data/Final/bootstrapped_model_bigram/"
output_csv = "/Users/yvette/Desktop/data/Final/word_similarities_democracy.csv"

target_word = "democracy"
top_n = 5
missing = "N/A"

# List all model files (this ignores the companion .npy files). Sorted so the
# row order of the CSV does not depend on the directory listing order.
model_files = sorted(f for f in os.listdir(models_dir) if f.endswith(".model"))

# One row per model: [file name, subcorpus, Top1..TopN].
results = []

for model_file in model_files:
    model_path = os.path.join(models_dir, model_file)
    print(f"Loading model: {model_file}...")

    # Derive the subcorpus from the filename BEFORE any lookup that may fail,
    # so the "word not found" row is labelled with this model's subcorpus and
    # not with whatever value was left over from a previous iteration.
    # Example: "1971-1980_American Journal of Political Science"
    subcorpus_name = "_".join(model_file.split("_")[1:3])

    model = Word2Vec.load(model_path)

    try:
        similar_words = model.wv.most_similar(target_word, topn=top_n)
    except KeyError:
        # The target word is not in this model's vocabulary.
        print(f"Word '{target_word}' not found in {model_file}, skipping...")
        top_words = [missing] * top_n
    else:
        top_words = [word for word, _score in similar_words]
        # Pad in case fewer than top_n neighbours were returned, so every row
        # matches the fixed column layout below.
        top_words += [missing] * (top_n - len(top_words))

    results.append([model_file, subcorpus_name] + top_words)

# Convert results to DataFrame
columns = ["Model", "Subcorpus"] + [f"Top{rank}" for rank in range(1, top_n + 1)]
df = pd.DataFrame(results, columns=columns)

# Save to CSV
df.to_csv(output_csv, index=False)

print(f"Results saved to {output_csv}")