dictionary method for radio

In [8]:
import pandas as pd
import re
import itertools
from joblib import Parallel, delayed
from tqdm import tqdm

# === Topic Dictionary ===
topic_dict = {
    "abortion": [
        "birth control", "contraception", "abortion provider", "ACOG",
        "medication abortion", "medical abortion", "planned parenthood", "pro-choice", "pro-life"
    ],
    "international affairs": [
        "russia", "ukraine", "israel", "gaza",
        "who involvement", "climate agreements", "foreign policy", "international relations"
    ],
    "immigration": [
        "border control", "citizenship", "immigration policy", "border security",
        "deportation", "visa", "migrants", "asylum"
    ],
    "economy": [
        "job opportunities", "jobs", "tariffs", "trade", "inflation", "tax",
        "budget deficit", "unemployment", "economic growth"
    ],
    "violent crime": [
        "gun control", "school shootings", "2nd amendment", "second amendment",
        "firearms", "mass shooting", "gun violence", "crime rate"
    ],
    "climate change": [
        "science", "environment", "disaster", "climate crisis", "global warming",
        "carbon emissions", "green energy", "natural disasters"
    ]
}

# === Keyword expansion ===
def get_variants(word):
    word = word.lower()
    return {
        word,
        word + 's',
        word + 'ed',
        word + 'ing'
    }

def expand_phrase(phrase):
    words = phrase.lower().split()
    variants = [get_variants(w) for w in words]
    return set(" ".join(p) for p in itertools.product(*variants))

# === Scoring Function ===
def compute_topic_scores(text):
    text = str(text).lower()
    tokens = text.split()
    word_count = len(tokens)

    scores = {}
    for topic, phrases in topic_dict.items():
        count = 0
        for phrase in phrases:
            for variant in expand_phrase(phrase):
                pattern = r'\b' + re.escape(variant) + r'\b'
                count += len(re.findall(pattern, text))
        scores[topic] = count / word_count if word_count else 0
    return scores

# === Parallel scoring ===
def parallel_score_texts(texts, n_jobs=-1):
    results = Parallel(n_jobs=n_jobs)(
        delayed(compute_topic_scores)(text) for text in tqdm(texts, desc="Scoring entries")
    )
    return pd.DataFrame(results)

# === MAIN ===
input_path = "stratified_radio_sample.csv"       # change this
output_path = "radio_sample_topics.csv"  # change this if needed

print("📥 Loading data...")
df = pd.read_csv(input_path)

print("🚀 Scoring in parallel using joblib...")
topic_scores_df = parallel_score_texts(df["text"].tolist(), n_jobs=-1)  # Use all cores

print("📎 Merging and saving...")
df = pd.concat([df, topic_scores_df], axis=1)
df.to_csv(output_path, index=False)

print("✅ Done! Output saved to:", output_path)


📥 Loading data...
🚀 Scoring in parallel using joblib...


Scoring entries: 100%|██████████| 624/624 [01:10<00:00,  8.90it/s]


📎 Merging and saving...
✅ Done! Output saved to: radio_sample_topics.csv


In [None]:
import pandas as pd
import re
import itertools
from joblib import Parallel, delayed
from tqdm import tqdm

# === Base Topic Dictionary ===
base_topic_dict = {
    "abortion": [
        "birth control", "contraception", "abortion provider", "ACOG",
        "medication abortion", "medical abortion", "planned parenthood", "pro-choice", "pro-life"
    ],
    "international affairs": [
        "russia", "ukraine", "israel", "gaza",
        "who involvement", "climate agreements", "foreign policy", "international relations"
    ],
    "immigration": [
        "border control", "citizenship", "immigration policy", "border security",
    "economy": [
        "job opportunities", "jobs", "tariffs", "trade", "inflation", "tax",
        "budget deficit", "unemployment", "economic growth"
    ],
    "violent crime": [
        "gun control", "school shootings", "2nd amendment", "second amendment",
        "firearms", "mass shooting", "gun violence", "crime rate"
    ],
    "climate change": [
        "science", "environment", "disaster", "climate crisis", "global warming",
        "carbon emissions", "green energy", "natural disasters"
    ]
}

# === Keyword Expansion Utilities ===
def get_variants(word):
    word = word.lower()
    return {
        word,
        word + 's',
        word + 'ed',
        word + 'ing'
    }

def expand_phrase(phrase):
    words = phrase.lower().split()
    variants = [get_variants(w) for w in words]
    return set(" ".join(p) for p in itertools.product(*variants))

def build_flexible_topic_dict(base_dict):
    expanded_dict = {}
    for topic, phrases in base_dict.items():
        phrase_variants = set()
        phrase_variants.update(get_variants(topic))  # include topic name
        for phrase in phrases:
            phrase_variants.add(phrase.lower())  # whole phrase
            phrase_variants.update(expand_phrase(phrase))  # all combinations
            for word in phrase.lower().split():  # individual words
                phrase_variants.update(get_variants(word))
        expanded_dict[topic] = list(phrase_variants)
    return expanded_dict

# === Scoring Function ===
flexible_topic_dict = build_flexible_topic_dict(base_topic_dict)

def compute_topic_scores(text):
    text = str(text).lower()
    tokens = text.split()
    word_count = len(tokens)

    scores = {}
    for topic, variants in flexible_topic_dict.items():
        count = 0
        for variant in variants:
            pattern = r'\b' + re.escape(variant) + r'\b'
            count += len(re.findall(pattern, text))
        scores[topic] = count / word_count if word_count else 0
    return scores

# === Parallel Processing ===
def parallel_score_texts(texts, n_jobs=-1):
    results = Parallel(n_jobs=n_jobs)(
        delayed(compute_topic_scores)(text) for text in tqdm(texts, desc="Scoring entries")
    )
    return pd.DataFrame(results)

# === MAIN ===
input_path = "stratified_radio_sample.csv"
output_path = "radio_sample_topics.csv"

print("📥 Loading data...")
df = pd.read_csv(input_path)

print("🚀 Scoring in parallel using joblib...")
topic_scores_df = parallel_score_texts(df["text"].tolist(), n_jobs=-1)

print("📎 Merging and saving...")
df = pd.concat([df, topic_scores_df], axis=1)
df.to_csv(output_path, index=False)

print("✅ Done! Output saved to:", output_path)


📥 Loading data...
🚀 Scoring in parallel using joblib...


Scoring entries: 100%|██████████| 624/624 [01:40<00:00,  6.24it/s]


📎 Merging and saving...
✅ Done! Output saved to: radio_sample_topics.csv


revised dictionary method for social media (add expanded dictionary)

In [2]:
import pandas as pd
import re
import itertools
from joblib import Parallel, delayed
from tqdm import tqdm
import time

# === BASE TOPIC DICTIONARY ===
base_topic_dict = {
    "abortion": [
        "birth control", "contraception", "abortion provider", "ACOG",
        "medication abortion", "medical abortion", "planned parenthood", "pro-choice", "pro-life"
    ],
    "international affairs": [
        "russia", "ukraine", "israel", "gaza",
        "who involvement", "climate agreements", "foreign policy", "international relations"
    ],
    "immigration": [
        "border control", "citizenship", "immigration policy", "border security",
        "deportation", "visa", "migrants", "asylum"
    ],
    "economy": [
        "job opportunities", "jobs", "tariffs", "trade", "inflation", "tax",
        "budget deficit", "unemployment", "economic growth"
    ],
    "violent crime": [
        "gun control", "school shootings", "2nd amendment", "second amendment",
        "firearms", "mass shooting", "gun violence", "crime rate"
    ],
    "climate change": [
        "science", "environment", "disaster", "climate crisis", "global warming",
        "carbon emissions", "green energy", "natural disasters"
    ]
}

# === TEXT CLEANING ===
def clean_social_media_text(text):
    text = str(text)
    text = re.sub(r"<.*?>", " ", text)  # remove HTML tags
    text = re.sub(r"http\S+|www\S+|t\.me\S+", " ", text)  # remove URLs
    text = re.sub(r"\s+", " ", text)  # normalize whitespace
    text = text.replace("\n", " ").replace("\r", " ")
    return text.strip().lower()

# === VARIANT EXPANSION ===
def get_variants(word):
    word = word.lower()
    return {
        word,
        word + 's',
        word + 'ed',
        word + 'ing'
    }

def expand_phrase(phrase):
    words = phrase.lower().split()
    variants = [get_variants(w) for w in words]
    return set(" ".join(p) for p in itertools.product(*variants))

# === FLEXIBLE DICTIONARY BUILDER ===
def build_flexible_topic_dict(base_dict):
    updated_dict = {}
    for topic, phrases in base_dict.items():
        expanded = set()
        expanded.update(get_variants(topic))  # include topic name
        for phrase in phrases:
            expanded.add(phrase.lower())  # whole phrase
            expanded.update(expand_phrase(phrase))  # phrase variants
            for word in phrase.lower().split():  # individual words
                expanded.update(get_variants(word))
        updated_dict[topic] = sorted(list(expanded))
    return updated_dict

# === USE EXPANDED TOPIC DICTIONARY ===
topic_dict = build_flexible_topic_dict(base_topic_dict)

# === TOPIC SCORING FUNCTION ===
def compute_topic_scores(text):
    text = clean_social_media_text(text)
    tokens = text.split()
    word_count = len(tokens)

    scores = {}
    for topic, variants in topic_dict.items():
        count = 0
        for variant in variants:
            pattern = r'\b' + re.escape(variant) + r'\b'
            count += len(re.findall(pattern, text))
        scores[topic] = count / word_count if word_count else 0
    return scores

# === PARALLEL TOPIC SCORING ===
def parallel_score_texts(texts, n_jobs=-1):
    return pd.DataFrame(Parallel(n_jobs=n_jobs)(
        delayed(compute_topic_scores)(text) for text in tqdm(texts, desc="Scoring posts")
    ))

# === MAIN EXECUTION ===
input_path = "social_posts.csv"         # Change to your file
output_path = "social_scored_output.csv"

print("📥 Loading data...")
df = pd.read_csv(input_path)

print("🧼 Cleaning text...")
df["clean_text"] = df["attributes.search_data_fields.all_text"].apply(clean_social_media_text)

print("🚀 Scoring in parallel...")
start = time.time()
topic_scores = parallel_score_texts(df["clean_text"].tolist(), n_jobs=-1)

print("📎 Merging and saving...")
df = pd.concat([df, topic_scores], axis=1)
df.to_csv(output_path, index=False)

print(f"✅ Done! Scored output saved to {output_path}")
print(f"⏱️ Total time: {time.time() - start:.2f} seconds")


📥 Loading data...


  df = pd.read_csv(input_path)


🧼 Cleaning text...
🚀 Scoring in parallel...


Scoring posts: 100%|██████████| 35000/35000 [01:21<00:00, 431.80it/s]


📎 Merging and saving...
✅ Done! Scored output saved to social_scored_output.csv
⏱️ Total time: 81.81 seconds


what does expanded dictionary looks like?

In [10]:
import re
import itertools
import pprint  # for pretty printing

# === Base Topic Dictionary ===
base_topic_dict = {
    "abortion": [
        "birth control", "contraception", "abortion provider", "ACOG",
        "medication abortion", "medical abortion", "planned parenthood", "pro-choice", "pro-life"
    ],
    "international affairs": [
        "russia", "ukraine", "israel", "gaza",
        "who involvement", "climate agreements", "foreign policy", "international relations"
    ],
    "immigration": [
        "border control", "citizenship", "immigration policy", "border security",
        "deportation", "visa", "migrants", "asylum"
    ],
    "economy": [
        "job opportunities", "jobs", "tariffs", "trade", "inflation", "tax",
        "budget deficit", "unemployment", "economic growth"
    ],
    "violent crime": [
        "gun control", "school shootings", "2nd amendment", "second amendment",
        "firearms", "mass shooting", "gun violence", "crime rate"
    ],
    "climate change": [
        "science", "environment", "disaster", "climate crisis", "global warming",
        "carbon emissions", "green energy", "natural disasters"
    ]
}

# === Generate Variants ===
def get_variants(word):
    word = word.lower()
    return {
        word,
        word + 's',
        word + 'ed',
        word + 'ing'
    }

# === Build Flexible Dictionary ===
def build_flexible_topic_dict(base_dict):
    updated_dict = {}
    for topic, phrases in base_dict.items():
        expanded = set()
        expanded.update(get_variants(topic))  # include topic name
        for phrase in phrases:
            expanded.add(phrase.lower())  # whole phrase
            for word in phrase.lower().split():  # individual words
                expanded.update(get_variants(word))
        updated_dict[topic] = sorted(list(expanded))
    return updated_dict

# === Build and Print ===
topic_dict = build_flexible_topic_dict(base_topic_dict)
pprint.pprint(topic_dict, width=120)


{'abortion': ['abortion',
              'abortion provider',
              'abortioned',
              'abortioning',
              'abortions',
              'acog',
              'acoged',
              'acoging',
              'acogs',
              'birth',
              'birth control',
              'birthed',
              'birthing',
              'births',
              'contraception',
              'contraceptioned',
              'contraceptioning',
              'contraceptions',
              'control',
              'controled',
              'controling',
              'controls',
              'medical',
              'medical abortion',
              'medicaled',
              'medicaling',
              'medicals',
              'medication',
              'medication abortion',
              'medicationed',
              'medicationing',
              'medications',
              'parenthood',
              'parenthooded',
              'parenthooding',
            

initial dictionary method for social media

In [9]:
import pandas as pd
import re
import itertools
from joblib import Parallel, delayed
from tqdm import tqdm
import time

# === Topic Dictionary ===
topic_dict = {
    "abortion": [
        "birth control", "contraception", "abortion provider", "ACOG",
        "medication abortion", "medical abortion", "planned parenthood", "pro-choice", "pro-life"
    ],
    "international affairs": [
        "russia", "ukraine", "israel", "gaza",
        "who involvement", "climate agreements", "foreign policy", "international relations"
    ],
    "immigration": [
        "border control", "citizenship", "immigration policy", "border security",
        "deportation", "visa", "migrants", "asylum"
    ],
    "economy": [
        "job opportunities", "jobs", "tariffs", "trade", "inflation", "tax",
        "budget deficit", "unemployment", "economic growth"
    ],
    "violent crime": [
        "gun control", "school shootings", "2nd amendment", "second amendment",
        "firearms", "mass shooting", "gun violence", "crime rate"
    ],
    "climate change": [
        "science", "environment", "disaster", "climate crisis", "global warming",
        "carbon emissions", "green energy", "natural disasters"
    ]
}

# === CLEANING FUNCTION ===
def clean_social_media_text(text):
    text = str(text)
    text = re.sub(r"<.*?>", " ", text)  # remove HTML tags
    text = re.sub(r"http\S+|www\S+|t\.me\S+", " ", text)  # remove URLs
    text = re.sub(r"\s+", " ", text)  # normalize whitespace
    text = text.replace("\n", " ").replace("\r", " ")
    return text.strip().lower()

# === Variant Expansion ===
def get_variants(word):
    word = word.lower()
    return {
        word,
        word + 's',
        word + 'ed',
        word + 'ing'
    }

def expand_phrase(phrase):
    words = phrase.lower().split()
    variants = [get_variants(w) for w in words]
    return set(" ".join(p) for p in itertools.product(*variants))

# === Scoring ===
def compute_topic_scores(text):
    text = clean_social_media_text(text)
    tokens = text.split()
    word_count = len(tokens)

    scores = {}
    for topic, phrases in topic_dict.items():
        count = 0
        for phrase in phrases:
            for variant in expand_phrase(phrase):
                pattern = r'\b' + re.escape(variant) + r'\b'
                count += len(re.findall(pattern, text))
        scores[topic] = count / word_count if word_count else 0
    return scores

# === Parallel Apply ===
def parallel_score_texts(texts, n_jobs=-1):
    return pd.DataFrame(Parallel(n_jobs=n_jobs)(
        delayed(compute_topic_scores)(text) for text in tqdm(texts, desc="Scoring posts")
    ))

# === MAIN ===
input_path = "social_posts.csv"     # Change to your file
output_path = "social_scored_output.csv"

df = pd.read_csv(input_path)
# Clean the original messy text column
df["clean_text"] = df["attributes.search_data_fields.all_text"].apply(clean_social_media_text)

start = time.time()
topic_scores = parallel_score_texts(df["clean_text"].tolist(), n_jobs=-1)
df = pd.concat([df, topic_scores], axis=1)
df.to_csv(output_path, index=False)

print(f"✅ Done! Scored output saved to {output_path}")
print(f"⏱️ Total time: {time.time() - start:.2f} seconds")


  df = pd.read_csv(input_path)
Scoring posts: 100%|██████████| 35000/35000 [01:00<00:00, 576.28it/s]


✅ Done! Scored output saved to social_scored_output.csv
⏱️ Total time: 61.59 seconds
