In [16]:
import pandas as pd
import glob
import os
from pycountry import countries

In [17]:
def is_country(concept):
    try:
        # Try to get the country by name
        if countries.lookup(concept):
            return "Country"
    except LookupError:
        return "Other"

In [18]:
# Use glob to find all files matching the pattern
all_filenames = glob.glob('../data/*_keyword_frequency.csv')

# Exclude files that have 'cleaned' in their names
dataset_filenames = [filename for filename in all_filenames if 'cleaned' not in filename]

dataset_filenames

['../data/authors_keyword_frequency.csv',
 '../data/index_keyword_frequency.csv',
 '../data/all_keyword_frequency.csv']

In [19]:
# Load the stop words dataset
stop_words_df = pd.read_csv('../data/StopWord.csv')
stop_words = stop_words_df['StopWord'].tolist()
print(f"Number of stop words: {len(stop_words)}")

Number of stop words: 1478


In [20]:
# Load the thesaurus terms
thesaurus_df = pd.read_csv('../data/thesaurus_terms.txt', delimiter=';', names=['Label', 'Replace by'])

# Create a dictionary for replacements
thesaurus_dict = pd.Series(thesaurus_df['Replace by'].values, index=thesaurus_df['Label']).to_dict()
print(f"Number of concepts to clean: {len(thesaurus_dict)}")

Number of concepts to clean: 62


In [21]:
# Define the function to clean and replace concepts
def clean_replace_concept(concept):
    concept = concept.strip()  # Remove leading/trailing whitespace
    return thesaurus_dict.get(concept, concept)  # Replace if in dictionary, else return original


In [22]:
# Initialize an empty list to store cleaned DataFrames
cleaned_dataframes = []

# Process each dataset
for filename in dataset_filenames:
    # Load the dataset
    df = pd.read_csv(filename)
    
    # Filter out rows where the keyword is in the stop words list
    cleaned_df = df[~df['Concepts'].isin(stop_words)]
    
    # Clean and replace concepts using thesaurus
    cleaned_df['Concepts'] = cleaned_df['Concepts'].apply(clean_replace_concept)
    
    # Strip leading/trailing whitespace from concepts
    cleaned_df['Concepts'] = cleaned_df['Concepts'].str.strip()
    
    # Assign category based on the country using .loc to avoid SettingWithCopyWarning
    cleaned_df.loc[:, 'Category'] = cleaned_df['Concepts'].apply(is_country)
    
    # Filter out rows where Category is 'Country'
    cleaned_df = cleaned_df[cleaned_df['Category'] != 'Country']
    
    # Select Concepts and Total column
    cleaned_df = cleaned_df[['Concepts', 'Total']]
    
    # Group by the cleaned concepts and recalculate the total
    cleaned_df = cleaned_df.groupby(['Concepts']).agg({'Total': 'sum'}).reset_index()
    
    # Sort values and select top 200
    cleaned_df = cleaned_df.sort_values('Total', ascending=False).head(100)
    
    # Append the cleaned DataFrame to the list
    cleaned_dataframes.append(cleaned_df)
    
    # Save the cleaned DataFrame to a new CSV file in the 'dataset' folder
    cleaned_filename = os.path.join('../data', os.path.basename(filename).replace('keyword_frequency', 'cleaned_keyword_frequency'))
    cleaned_df.to_csv(cleaned_filename, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['Concepts'] = cleaned_df['Concepts'].apply(clean_replace_concept)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['Concepts'] = cleaned_df['Concepts'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df.loc[:, 'Category'] = cleaned_df['Concepts'].apply(is_coun