In [1]:
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import re

In [2]:
# Load JSONL file into a DataFrame
def jsonl_to_dataframe(file_path):
    return pd.read_json(file_path, lines=True)

file_path = "/Users/yvette/Desktop/data/Final/df_decade.jsonl"
df = jsonl_to_dataframe(file_path)

# Display the first few rows
print(df.head)

<bound method NDFrame.head of                                         id  \
0                   ark://27927/pjcswvb5kf   
1                  ark://27927/pjb7q8vz8dg   
2                  ark://27927/phz45krzm5c   
3                  ark://27927/phzptbf6jw3   
4                  ark://27927/pgh2cghxgbg   
...                                    ...   
6169  http://www.jstor.org/stable/25193870   
6170   http://www.jstor.org/stable/2110624   
6171    http://www.jstor.org/stable/194108   
6172  http://www.jstor.org/stable/25193859   
6173   http://www.jstor.org/stable/3647727   

                                                  title  \
0     Buttery Guns and Welfare Hawks: The Politics o...   
1     Do Campaign Contribution Limits Curb the Influ...   
2     When Toleration Becomes a Vice: Naming Aristot...   
3     How Does Minority Political Representation Aff...   
4     Assortative Mating on Ideology Could Operate T...   
...                                                 ...   
6169

In [None]:
# Create a set of custom stopwords
custom_stop_words = set(stopwords.words('english'))

# Add the words you want to keep
custom_metadata_stopwords = {
    "downloaded", "doi", "wiley", "author", "email", "vol", "et", "al", "pp", "copyright", "ph.d.", "issue", "volume", "edition", 
    "university", "oxford","cambridge", "b.j.pol.s.", "midwest", "linkoping", "creative commons",  "open access"
    # Add all months
    "january", "february", "march", "april", "may", "june", "july",
    "august", "september", "october", "november", "december"
}

custom_stop_words.update(custom_metadata_stopwords)

# Preprocessing ngram
def preprocess_ngram(text):
    if not isinstance(text, str):  # Handle NaN or non-string values
        return ""
    
    # Tokenization
    tokens = word_tokenize(text)

    # Determine n-gram type
    num_words = len(tokens)
    is_bigram = num_words == 2
    is_trigram = num_words == 3

    
    # Remove stopwords
    tokens = [word for word in tokens if word not in custom_stop_words and "_" not in word]
    
    # Handle bigrams/trigrams:
    if is_bigram:
        # If any word in the bigram is a stopword, return an empty string
        if any(word in custom_stop_words for word in tokens):
            return ""  # Remove entire bigram if it contains a stopword

    elif is_trigram:
        # If there are two or more stopwords in the trigram, return an empty string
        if sum(word in custom_stop_words for word in tokens) >= 2:
            return ""  # Remove entire trigram if it contains two or more stopwords

    return " ".join(tokens)  # Return cleaned n-gram


# Preprocess function on a sentence level
def preprocess(text):
    # Split the text into sentences
    sentences = sent_tokenize(text)

    # Preprocess each sentence
    processed_sentences = []
    for sentence in sentences:
        # Convert to lowercase
        sentence = sentence.lower()
        
        # Remove reference section if present
        sentence = re.sub(r"references[\s\S]*", "", sentence, flags=re.IGNORECASE)

        # Remove mentions of email addresses
        sentence = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", "", sentence)

        # Remove standard metadata phrases
        metadata_patterns = [
            r"downloaded from.*?wiley online library(\n|$)",  
            r"see the terms and conditions.*?(\n|$)",
            r"published online by cambridge university press(\n|$)",
            r"this is an open access article.*?creative commons license(\n|$)",
            r"american journal of political science, vol\..*?, pp\..*?(\n|$)",
            r"copyright ©.*?(\n|$)",
            r"first published online.*?(\n|$)",
            r"earlier versions of this article were presented at.*?(\n|$)"
        ]
        for pattern in metadata_patterns:
            sentence = re.sub(pattern, "", sentence, flags=re.IGNORECASE | re.MULTILINE)

        # Remove URLs and numbers
        sentence = re.sub(r"http\S+|www\S+|https\S+", "", sentence)  # Remove URLs
        sentence = re.sub(r"\d+", "", sentence)  # Remove numbers

        # Replace specific characters with spaces
        sentence = re.sub(r"['‘’“”]", " ", sentence)  # Remove quotes and apostrophes
        sentence = re.sub(r"[-_]", " ", sentence)  # Replace hyphens and underscores with spaces

        # Remove punctuation except for apostrophes and hyphens
        sentence = re.sub(r"[^\w\s'-]", " ", sentence)  # Keep letters, numbers, spaces, apostrophes, and hyphens

        # Tokenize the sentence into words
        tokens = word_tokenize(sentence)

        # Remove stopwords
        tokens = [word for word in tokens if word not in custom_stop_words]

        # Rejoin the tokens into the processed sentence
        processed_sentence = " ".join(tokens)
        processed_sentences.append(processed_sentence)
    
    # Rejoin the processed sentences into the final preprocessed text
    processed_text = ' SENTENCESPLITHERE '.join(processed_sentences)
    
    return processed_text


In [None]:
# Assuming you have already defined preprocess function and your DataFrame `df` is ready
output_path = '/Users/yvette/Desktop/data/Final/preprocessed grouped txt'

# Apply preprocessing to the text column
df['preprocessed_text'] = df['text'].apply(preprocess)

# Group the DataFrame by 'year_range' and 'isPartOf'
grouped_df = df.groupby(['year_range', 'isPartOf'])

token_counts = []

# Loop through each group and save the output to a .txt file
for (year_range, is_part_of), group in grouped_df:
    # Construct the filename
    filename = f'text_{year_range}_{is_part_of}.txt'
    file_path = os.path.join(output_path, filename)
    
    # Ensure the output directory exists
    os.makedirs(output_path, exist_ok=True)
    
    # Combine all the preprocessed text of the group into one string
    group_text = "\n".join(group['preprocessed_text'].dropna())
    token_count = len(group_text.split())  
    token_counts.append({"group_name": filename, "token_count": token_count})
    
    
    # Write the text to the file
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(group_text)
    
    # Print confirmation message
    print(f"Saved: {file_path}")

token_counts_df = pd.DataFrame(token_counts)
print(token_counts_df)

token_counts_df.to_csv("/Users/yvette/Desktop/group_token_counts.csv", index=False)

In [None]:
# Path to the folder containing the ngram CSV files
folder_path = '/Users/yvette/Desktop/data/Final/ngram word frequency'
output_folder = "/Users/yvette/Desktop/data/Final/ngram word frequency without stopwords"

# Ensure the output directory exists
os.makedirs(output_folder, exist_ok=True)

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):  # Process only CSV files
        file_path = os.path.join(folder_path, filename)

        # Read the CSV file into a DataFrame
        df_ngram = pd.read_csv(file_path)

        # Check if 'ngram' column exists
        if 'ngram' in df_ngram.columns:
            # Apply preprocessing (you should define your preprocess_ngram function beforehand)
            df_ngram['ngram'] = df_ngram['ngram'].apply(preprocess_ngram)

            # Remove rows where 'ngram' is empty
            df_ngram = df_ngram[df_ngram['ngram'].str.strip() != ""]

            # Remove bigrams with only one word and trigrams with two words based on filename
            condition = (
                (df_ngram['ngram'].apply(lambda x: len(x.split()) == 1) & filename.__contains__("unigram")) |
                (df_ngram['ngram'].apply(lambda x: len(x.split()) == 2) & filename.__contains__("bigram")) |
                (df_ngram['ngram'].apply(lambda x: len(x.split()) == 3) & filename.__contains__("trigram"))
            )
            df_ngram = df_ngram[condition]

            # Save the preprocessed DataFrame
            new_filename = filename.replace(".csv", "_without_stopword.csv")
            new_file_path = os.path.join(output_folder, new_filename)
            df_ngram.to_csv(new_file_path, index=False)

            print(f"Processed and saved: {new_filename}")