In [8]:
# Synthetic benchmark input: six representative raw keyword cells, repeated
# one million times (6,000,000 entries in total).
vals = 1_000_000 * [
    "['Data', 'AI']",  # list literal holding two strings
    "['ML', 123]",     # list literal mixing a string and an integer
    "[None]",          # list literal holding no strings
    "garbage",         # plain text that is not a list literal
    None,              # missing value
    "['data']",        # differs from 'Data' in the first entry only by case
]

In [None]:
import pandas as pd
import ast

def extract_keywords(vals):
    """Collect the distinct, lowercased keywords found in stringified lists.

    Each element of ``vals`` is expected to be the text of a Python list
    literal such as ``"['Data', 'AI']"``. Anything else (``None``, NaN,
    non-string objects, text that does not start with ``[`` or that cannot be
    parsed as a literal) is skipped.

    Args:
        vals: Iterable of raw cell values.

    Returns:
        set[str]: Every string element of every successfully parsed list,
        lowercased. Non-string elements inside a list are ignored.
    """
    kw_set = set()

    for val in vals:
        # The type check alone rejects None/NaN/pd.NA and, unlike pd.isna(),
        # never produces an array (ambiguous truth value) for list-like cells.
        if not isinstance(val, str):
            continue

        text = val.strip()
        if not text.startswith("["):
            continue

        try:
            # Parse the text exactly as written. Lowercasing *before* parsing
            # turns None/True/False into unknown names (which rejects the
            # whole entry) and rewrites case-sensitive escape sequences.
            kws = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Every exception type literal_eval documents for malformed input.
            continue  # skip malformed entries safely

        if isinstance(kws, list):
            # Case normalisation happens here, per keyword.
            kw_set.update(kw.lower() for kw in kws if isinstance(kw, str))

    return kw_set


In [None]:
import pandas as pd
import ast

# Load the extracted records from the CSV file located at args.extractedfile.
# NOTE(review): the statements below in this cell read `data`, not
# `extracted_df`, and neither `args` nor `data` is defined in the visible
# cells — confirm they exist on a fresh kernel and that `data` is this frame.
extracted_df = pd.read_csv(filepath_or_buffer=args.extractedfile)

# Step 1: Preprocess the keywords column
def parse_keywords(val):
    """Turn one raw keywords cell into a list of lowercased keyword strings.

    Args:
        val: A single cell value. Expected to be the text of a Python list
            literal such as ``"['Data', 'AI']"``; any other value is
            tolerated and yields an empty list.

    Returns:
        list[str]: The string elements of the parsed list, lowercased and in
        their original order (duplicates kept). An empty list when ``val`` is
        not a string, does not start with ``[``, cannot be parsed as a
        literal, or does not evaluate to a list.
    """
    # The type check alone rejects None/NaN/pd.NA and, unlike pd.isna(),
    # never produces an array (ambiguous truth value) for list-like cells.
    if not isinstance(val, str):
        return []

    text = val.strip()
    if not text.startswith("["):
        return []

    try:
        # Parse the text exactly as written. Lowercasing *before* parsing
        # turns None/True/False into unknown names (which rejects the whole
        # entry) and rewrites case-sensitive escape sequences.
        kws = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Every exception type literal_eval documents for malformed input.
        return []

    if not isinstance(kws, list):
        return []
    # Case normalisation happens here, per keyword; non-strings are dropped.
    return [kw.lower() for kw in kws if isinstance(kw, str)]

# Apply preprocessing: add a column holding each row's cleaned keyword list.
# NOTE(review): `data` is not created by any statement visible in this
# notebook — confirm which DataFrame it is. The code below reads its
# 'keywords', 'cluster' and 'ecmax' columns.
data['parsed_keywords'] = data['keywords'].apply(parse_keywords)

# Explode the lists to get one row per keyword. Rows whose list was empty
# explode to NaN; the length filter removes those as well as empty strings.
exploded_df = data.explode('parsed_keywords')
exploded_df = exploded_df[exploded_df['parsed_keywords'].str.len() > 0]
# NOTE(review): a keyword repeated inside one row's list produces one exploded
# row per repetition, so that row is counted (and its ecmax summed) more than
# once for the keyword — confirm this is intended.

# Step 2: Group by and aggregate
# For frequencies (counts)
freq_by_cluster = exploded_df.groupby(['cluster', 'parsed_keywords']).size().reset_index(name='frequency')

# For financial values (sum of ecmax)
financial_by_cluster = exploded_df.groupby(['cluster', 'parsed_keywords'])['ecmax'].sum().reset_index()

# Also create the "all" cluster aggregates (every row, regardless of cluster).
# NOTE(review): presumably no real cluster is labelled 'all'; if one is, its
# rows would be merged with these totals in the dictionaries below — verify.
freq_all = exploded_df.groupby(['parsed_keywords']).size().reset_index(name='frequency')
freq_all['cluster'] = 'all'

financial_all = exploded_df.groupby(['parsed_keywords'])['ecmax'].sum().reset_index()
financial_all['cluster'] = 'all'

# Combine cluster-specific and "all" results. ignore_index=True gives the
# combined frames a fresh 0..n-1 index; without it both inputs contribute
# their own 0-based labels and the result has duplicate row labels.
freq_results = pd.concat([freq_by_cluster, freq_all], ignore_index=True)
financial_results = pd.concat([financial_by_cluster, financial_all], ignore_index=True)

# If you need dictionaries like in the original code:
# Convert to nested dictionaries {cluster: {keyword: count}}
freq_dict = {cluster: dict(zip(group['parsed_keywords'], group['frequency']))
             for cluster, group in freq_results.groupby('cluster')}

# Same shape for the financial totals: {cluster: {keyword: summed ecmax}}
financial_dict = {cluster: dict(zip(group['parsed_keywords'], group['ecmax']))
                  for cluster, group in financial_results.groupby('cluster')}

# Display results
print("Frequency Results:")
print(freq_results)
print("\nFinancial Results:")
print(financial_results)
print("\nFrequency Dictionary:")
print(freq_dict)
print("\nFinancial Dictionary:")
print(financial_dict)