In [None]:
import os
import re
import json
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

In [None]:
def preprocess_keyword(keyword):
    """Normalise a keyword string and split it into tokens.

    The keyword is lowercased, every parenthesised group (together with any
    whitespace directly in front of it) is removed, and the remainder is
    handed to ``nltk.word_tokenize``.

    Args:
        keyword: The raw keyword text.

    Returns:
        The tokens produced by ``nltk.word_tokenize`` for the cleaned text.
    """
    lowered = keyword.lower()
    # The pattern stops at the first ')' it meets, so a nested group such as
    # "(a (b) c)" is only removed up to that first closing parenthesis.
    without_parens = re.sub(r'\s*\([^)]*\)', '', lowered)
    return nltk.word_tokenize(without_parens)

def _subject_keywords(data):
    """Return the keyword strings stored under ``dcterms:subject`` in *data*.

    Walks ``full-text-retrieval-response -> coredata -> dcterms:subject`` and
    returns the ``'$'`` value of every entry that is a dict holding a string
    there. Anything else yields an empty list instead of an exception.
    """
    node = data
    for key in ('full-text-retrieval-response', 'coredata', 'dcterms:subject'):
        # dict.get(key, default) only covers an absent key; a JSON null or a
        # non-object at any level must also be treated as "no keywords".
        node = node.get(key) if isinstance(node, dict) else None

    if isinstance(node, dict):
        # A lone subject entry that is not wrapped in a list.
        node = [node]
    elif not isinstance(node, list):
        return []

    return [
        entry['$']
        for entry in node
        if isinstance(entry, dict) and isinstance(entry.get('$'), str)
    ]


def extract_compound_keywords_from_json(folder_path):
    """Collect the multi-word keywords found in a folder's ``.json`` files.

    Every file in *folder_path* whose name ends in ``.json`` is parsed, its
    keywords are read from
    ``full-text-retrieval-response -> coredata -> dcterms:subject``, and each
    keyword is passed through ``preprocess_keyword``. Keywords that yield more
    than one token are recorded; single-token keywords are ignored. Files with
    a missing, null or malformed keyword section contribute nothing.

    Args:
        folder_path: Directory to scan (sub-directories are not descended).

    Returns:
        A dict mapping the space-joined tokens of each compound keyword to
        the same tokens joined with underscores.

    Raises:
        OSError: If the folder or one of its files cannot be read.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    compound_keywords_dict = {}

    # os.listdir() returns names in arbitrary order; sorting makes the
    # insertion order of the result reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(folder_path, filename)
        # Only the parse needs the handle, so close it before processing.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        for keyword in _subject_keywords(data):
            processed_words = preprocess_keyword(keyword)
            if len(processed_words) > 1:
                normal_keyword = ' '.join(processed_words)
                compound_keywords_dict[normal_keyword] = '_'.join(processed_words)

    return compound_keywords_dict

def save_dict_as_py(dict_obj, output_file):
    """Write *dict_obj* to *output_file* as a single Python assignment.

    The file is opened for writing as UTF-8 (replacing any existing content)
    and receives one line of the form ``compound_keywords = <dict_obj>``
    followed by a newline.

    Args:
        dict_obj: The object whose string form becomes the right-hand side.
        output_file: Path of the file to write.
    """
    assignment_line = "compound_keywords = {}\n".format(dict_obj)
    with open(output_file, mode='w', encoding='utf-8') as target:
        target.write(assignment_line)

# Driver: build the compound-keyword dictionary from the JSON files in
# `folder_path` and write it out as a Python module.
# NOTE(review): `folder_path` is a hard-coded absolute path; it has to be
# edited before this runs on any other machine.
folder_path = 'C:/Users/wenha/OneDrive - University College London/Desktop/first_paper_code/downloaded_articles'
compound_keywords = extract_compound_keywords_from_json(folder_path)

# Save dictionary to a .py file
# The output path is relative, so the file is created in the current working
# directory.
output_file = 'compound_keywords.py'
save_dict_as_py(compound_keywords, output_file)
print(f"Dictionary has been saved to {output_file}")