In [1]:
import pandas as pd
import os
import random
import pickle
from sklearn.metrics import precision_score, recall_score, f1_score
from gensim.models.phrases import Phraser
from gensim.models import Word2Vec, phrases 
from collections import Counter

In [2]:
### bigram tranformer
## the sample text for bigram model
import os
import random
import pandas as pd

def process_subcorpora(input_dir, sample_ratio=0.75, delimiter=' SENTENCESPLITHERE '):
    """
    Process text files representing subcorpora by:
    - Loading the text data
    - Sampling a fraction of the articles
    - Splitting them into sentences
    - Storing processed sentences separately for each file

    Parameters:
        input_dir (str): Directory containing the subcorpora text files.
        sample_ratio (float): Fraction of articles to sample (default: 0.75).
        delimiter (str): Sentence boundary delimiter used in the text.

    Returns:
        dict: A dictionary where keys are filenames (without .txt) and values are lists of tokenized sentences.
    """

    processed_data = {}

    # Iterate over all text files in the directory
    for filename in sorted(os.listdir(input_dir)):
        if filename.endswith(".txt"):  
            file_path = os.path.join(input_dir, filename)
            file_key = filename.replace(".txt", "")  # Remove .txt for dictionary key
            
            # Read the file
            with open(file_path, "r", encoding="utf-8") as file:
                articles = file.readlines()  # Each line represents an article
            
            # Sample a fraction of the articles
            sample_size = round(sample_ratio * len(articles))
            sampled_articles = random.sample(articles, sample_size) if articles else []
            
            # Process each sampled article
            tokenized_sentences = []
            for article in sampled_articles:
                sentences_list = article.split(delimiter)
                for sentence in sentences_list:
                    sentence_tokens = sentence.split()  # Tokenize by whitespace
                    if sentence_tokens:
                        tokenized_sentences.append(sentence_tokens)  # Store tokenized sentence

            processed_data[file_key] = tokenized_sentences  # Store in dictionary

            print(f"Processed {filename}: {len(sampled_articles)} articles sampled.")

    return processed_data

# Example usage:
input_directory = '/Users/yvette/Desktop/data/Final/preprocessed grouped txt'
processed_sentences_dict = process_subcorpora(input_directory)

# Convert each file's processed data into a DataFrame with a single "sentence" column
dfs = {file: pd.DataFrame({"sentence": sentences}) for file, sentences in processed_sentences_dict.items()}

# Example: Access DataFrame for a specific file
sample_filename = list(dfs.keys())[0]  # Get first filename
print(dfs[sample_filename].head())  # Show first few rows of its DataFrame

Processed text_1971-1980_American Journal of Political Science.txt: 354 articles sampled.
Processed text_1971-1980_British Journal of Political Science.txt: 244 articles sampled.
Processed text_1981-1990_American Journal of Political Science.txt: 326 articles sampled.
Processed text_1981-1990_British Journal of Political Science.txt: 198 articles sampled.
Processed text_1991-2000_American Journal of Political Science.txt: 404 articles sampled.
Processed text_1991-2000_British Journal of Political Science.txt: 209 articles sampled.
Processed text_2001-2010_American Journal of Political Science.txt: 445 articles sampled.
Processed text_2001-2010_British Journal of Political Science.txt: 397 articles sampled.
Processed text_2011-2020_American Journal of Political Science.txt: 932 articles sampled.
Processed text_2011-2020_British Journal of Political Science.txt: 617 articles sampled.
Processed text_2021-2024_American Journal of Political Science.txt: 270 articles sampled.
Processed text_

In [3]:
output_dir = "/Users/yvette/Desktop/data/Final/sample text for phraser"

# Ensure the directory exists
os.makedirs(output_dir, exist_ok=True)

for filename, df in dfs.items():
    file_path = os.path.join(output_dir, f"{filename}.csv")
    df.to_csv(file_path, index=False)
    print(f"Saved: {file_path}")


Saved: /Users/yvette/Desktop/data/Final/sample text for phraser/text_1971-1980_American Journal of Political Science.csv
Saved: /Users/yvette/Desktop/data/Final/sample text for phraser/text_1971-1980_British Journal of Political Science.csv
Saved: /Users/yvette/Desktop/data/Final/sample text for phraser/text_1981-1990_American Journal of Political Science.csv
Saved: /Users/yvette/Desktop/data/Final/sample text for phraser/text_1981-1990_British Journal of Political Science.csv
Saved: /Users/yvette/Desktop/data/Final/sample text for phraser/text_1991-2000_American Journal of Political Science.csv
Saved: /Users/yvette/Desktop/data/Final/sample text for phraser/text_1991-2000_British Journal of Political Science.csv
Saved: /Users/yvette/Desktop/data/Final/sample text for phraser/text_2001-2010_American Journal of Political Science.csv
Saved: /Users/yvette/Desktop/data/Final/sample text for phraser/text_2001-2010_British Journal of Political Science.csv
Saved: /Users/yvette/Desktop/data/Fi

In [None]:
# Define the function to train and evaluate bigram models using processed sentences
def evaluate_bigram_model(model, processed_sentences_dict, specific_filename, bigram_dir):
    detected_bigrams = set()

    # Apply the bigram model to the specific file in processed_sentences_dict
    sentences = processed_sentences_dict.get(specific_filename, [])
    
    if not sentences:
        print(f"No sentences found for {specific_filename}.")
        return None
    
    # Debugging - check phraser type
    print(f"Phaser model for {specific_filename}: {type(model)}")

    # Apply the model to each sentence and collect bigrams
    for sentence in sentences:
        transformed_sentence = model[sentence]  # Apply the bigram model (Phraser)
        for bigram in transformed_sentence:
            if isinstance(bigram, tuple):  # Check if it's a tuple (e.g., ("word1", "word2"))
                detected_bigrams.add("_".join(bigram))  # Join the words with "_"
            elif isinstance(bigram, str):  # If it's a string, assume it's already a bigram
                detected_bigrams.add(bigram)

    # Extract the common part of the filename (before the first underscore)
    common_part = "_".join(specific_filename.split("_")[1:])  # Skip 'text_' part, keep everything after it
    print(f"Common part of the filename: {common_part}")
    
    # Dynamically generate the reference filename using the common part
    reference_filename = f"bigram_{common_part}_without_stopword.csv"
    reference_file_path = os.path.join(bigram_dir, reference_filename)
    
    if not os.path.exists(reference_file_path):
        print(f"Reference bigram file for {specific_filename} not found.")
        return None

    # Load the reference bigram DataFrame
    bigram_df = pd.read_csv(reference_file_path)

    # Extract bigrams from the 'ngram' column and format them consistently
    reference_bigrams = set(bigram_df["ngram"].astype(str).apply(lambda x: x.replace(" ", "_")))

    # Compute Precision, Recall, and F1-score
    intersection = detected_bigrams.intersection(reference_bigrams)
    precision = len(intersection) / len(detected_bigrams) if detected_bigrams else 0
    recall = len(intersection) / len(reference_bigrams) if reference_bigrams else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

# Define a DataFrame to store the results
results_df = pd.DataFrame(columns=["min_count", "threshold", "filename", "Precision", "Recall", "F1"])
# Loop through different parameter combinations for min_count and threshold
def evaluate_for_all_files(processed_sentences_dict, bigram_dir):
    for min_count in [10, 20, 30, 40]:
        for threshold in [5, 8, 10, 12, 15]:
            
            # Loop through all the filenames in processed_sentences_dict
            for specific_filename in processed_sentences_dict.keys():
                print(f"Evaluating bigram model for {specific_filename}...")
                
                # Train the bigram model with the current parameters
                bigram_model = phrases.Phrases(processed_sentences_dict[specific_filename], min_count=min_count, threshold=threshold)
                
                # Convert to Phraser for optimized speed
                phraser = Phraser(bigram_model)
                
                # Evaluate the model using precision, recall, and F1 score for the specific file
                precision, recall, f1 = evaluate_bigram_model(phraser, processed_sentences_dict, specific_filename, bigram_dir)
        

                # Print the results
                if precision is not None:
                    print(f"min_count={min_count}, threshold={threshold}, filename={specific_filename} => Precision: {precision}, Recall: {recall}, F1: {f1}")
                    # Store the results in the DataFrame
                    results_df.loc[len(results_df)] = [min_count, threshold, specific_filename, precision, recall, f1]

    return results_df  



bigram_dir = '/Users/yvette/Desktop/data/Final/ngram word frequency without stopwords'
evaluate_for_all_files(processed_sentences_dict, bigram_dir)
results_df.to_csv("/Users/yvette/Desktop/bigram_results.csv", index=False)

In [None]:
import pandas as pd

# Load the CSV data
df = pd.read_csv("/Users/yvette/Desktop/data/Final/bigram_results.csv")

# Group by 'min_count' and 'threshold', calculate mean of metrics
grouped = df.groupby(['min_count', 'threshold']).agg({
    'Precision': 'mean',
    'Recall': 'mean',
    'F1': 'mean'
}).reset_index()

# Sort by F1 in descending order
sorted_result = grouped.sort_values('F1', ascending=False)

# Print or save the result
print(sorted_result)

    min_count  threshold  Precision    Recall        F1
0          10          5   0.108347  0.012104  0.021753
1          10          8   0.096650  0.010589  0.019067
2          10         10   0.091025  0.009884  0.017812
3          10         12   0.086431  0.009317  0.016803
4          10         15   0.080946  0.008653  0.015617
5          20          5   0.051402  0.005268  0.009547
6          20          8   0.045707  0.004641  0.008417
7          20         10   0.042849  0.004332  0.007860
8          20         12   0.040606  0.004091  0.007426
9          20         15   0.037839  0.003797  0.006894
10         30          5   0.031353  0.003118  0.005667
11         30          8   0.027858  0.002755  0.005009
12         30         10   0.026103  0.002575  0.004682
13         30         12   0.024672  0.002428  0.004416
14         30         15   0.022914  0.002249  0.004091
15         40          5   0.021495  0.002106  0.003833
16         40          8   0.019019  0.001856  0

In [None]:
### bigram model choosed （min_count 10, threshold 8）and saved 
bigram_save_dir = "/Users/yvette/Desktop/data/Final/bigram phraser"

for specific_filename in processed_sentences_dict.keys():
    print(f"bigram model for {specific_filename}...")
                
    # Train the bigram model with the current parameters
    bigram_model = phrases.Phrases(processed_sentences_dict[specific_filename], min_count=10, threshold=8)

    # save bigram phraser
    bigram_model_path = os.path.join(bigram_save_dir, f"bigram_{specific_filename}.model")
    bigram_phraser = Phraser(bigram_model)
    bigram_model.save(bigram_model_path)

    print(f"Bigram model saved at: {bigram_model_path}")

bigram model for text_1971-1980_American Journal of Political Science...
Bigram model saved at: /Users/yvette/Desktop/data/Final/bigram phraser/bigram_text_1971-1980_American Journal of Political Science.model
bigram model for text_1971-1980_British Journal of Political Science...
Bigram model saved at: /Users/yvette/Desktop/data/Final/bigram phraser/bigram_text_1971-1980_British Journal of Political Science.model
bigram model for text_1981-1990_American Journal of Political Science...
Bigram model saved at: /Users/yvette/Desktop/data/Final/bigram phraser/bigram_text_1981-1990_American Journal of Political Science.model
bigram model for text_1981-1990_British Journal of Political Science...
Bigram model saved at: /Users/yvette/Desktop/data/Final/bigram phraser/bigram_text_1981-1990_British Journal of Political Science.model
bigram model for text_1991-2000_American Journal of Political Science...
Bigram model saved at: /Users/yvette/Desktop/data/Final/bigram phraser/bigram_text_1991-200

In [None]:
### bigram for full text 
import os
import pandas as pd

def process_subcorpora(input_dir, delimiter=' SENTENCESPLITHERE '):
    """
    Process text files representing subcorpora by:
    - Loading the text data
    - Splitting them into sentences
    - Storing processed sentences separately for each file

    Parameters:
        input_dir (str): Directory containing the subcorpora text files.
        delimiter (str): Sentence boundary delimiter used in the text.

    Returns:
        dict: A dictionary where keys are filenames (without .txt) and values are lists of tokenized sentences.
    """

    processed_data = {}

    # Iterate over all text files in the directory
    for filename in sorted(os.listdir(input_dir)):
        if filename.endswith(".txt"):  
            file_path = os.path.join(input_dir, filename)
            file_key = filename.replace(".txt", "")  # Remove .txt for dictionary key
            
            # Read the file
            with open(file_path, "r", encoding="utf-8") as file:
                articles = file.readlines()  # Each line represents an article
            
            # Process each article (without sampling)
            tokenized_sentences = []
            for article in articles:
                sentences_list = article.split(delimiter)
                for sentence in sentences_list:
                    sentence_tokens = sentence.split()  # Tokenize by whitespace
                    if sentence_tokens:
                        tokenized_sentences.append(sentence_tokens)  # Store tokenized sentence

            processed_data[file_key] = tokenized_sentences  # Store in dictionary

            print(f"Processed {filename}: {len(articles)} articles processed.")

    return processed_data

# Example usage:
input_directory = '/Users/yvette/Desktop/data/Final/preprocessed grouped txt'
processed_sentences_dict = process_subcorpora(input_directory)

# Convert each file's processed data into a DataFrame with a single "sentence" column
dfs = {file: pd.DataFrame({"sentence": sentences}) for file, sentences in processed_sentences_dict.items()}

# Example: Access DataFrame for a specific file
sample_filename = list(dfs.keys())[0]  # Get first filename
print(dfs[sample_filename].head())  # Show first few rows of its DataFrame

bigram_save_dir = "/Users/yvette/Desktop/data/Final/bigram phraser full text"

for specific_filename in processed_sentences_dict.keys():
    print(f"bigram model for {specific_filename}...")
                
    # Train the bigram model with the current parameters
    bigram_model = phrases.Phrases(processed_sentences_dict[specific_filename], min_count=10, threshold=8)

    # generate path for save
    bigram_model_path = os.path.join(bigram_save_dir, f"bigram_{specific_filename}.model")
    bigram_phraser = Phraser(bigram_model)
    
    # save bigram phraser
    bigram_model.save(bigram_model_path)

    print(f"Bigram model saved at: {bigram_model_path}")


Processed text_1971-1980_American Journal of Political Science.txt: 472 articles processed.
Processed text_1971-1980_British Journal of Political Science.txt: 326 articles processed.
Processed text_1981-1990_American Journal of Political Science.txt: 435 articles processed.
Processed text_1981-1990_British Journal of Political Science.txt: 264 articles processed.
Processed text_1991-2000_American Journal of Political Science.txt: 538 articles processed.
Processed text_1991-2000_British Journal of Political Science.txt: 279 articles processed.
Processed text_2001-2010_American Journal of Political Science.txt: 593 articles processed.
Processed text_2001-2010_British Journal of Political Science.txt: 530 articles processed.
Processed text_2011-2020_American Journal of Political Science.txt: 1242 articles processed.
Processed text_2011-2020_British Journal of Political Science.txt: 823 articles processed.
Processed text_2021-2024_American Journal of Political Science.txt: 360 articles pro