In [88]:
# Load the data from a CSV file and print the first 5 rows
import pandas as pd
data = pd.read_csv('/Users/ame/02805_climate_conv/data/cleaned_twitter_embedded_data.csv')

In [89]:
# Print column names
print(data.columns)

print(len(data))

# Print number of rows where hashtags is null
print(data['hashtags'].isnull().sum())

Index(['tweetid', 'message', 'embeddings', 'metadata', 'date', 'hashtags',
       'location', 'sentiment', 'clean_text'],
      dtype='object')
51376
2


In [90]:
# Make subset where 'clean_text' is longer than the 25% quantile and hastags is not null
subset = data[(data['clean_text'].str.len() > data['clean_text'].str.len().quantile(0.25)) & (data['hashtags'].notnull())]

# Print number of rows in subset
print(len(subset))

38402


In [91]:
import re

def extract_tags(x):
    if isinstance(x, list):
        tags = x
    elif isinstance(x, str):
        tags = re.findall(r"#?\w+", x)
    else:
        tags = []

    # normalize
    return [t.lower().lstrip("#") for t in tags if t.strip() != ""]
    
subset["hashtags"] = subset["hashtags"].apply(extract_tags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset["hashtags"] = subset["hashtags"].apply(extract_tags)


In [92]:
before = set(subset["hashtags"].explode())
print("Unique BEFORE:", len(before))

Unique BEFORE: 29159


In [93]:
from rapidfuzz import fuzz
from collections import defaultdict

tag_counts = subset["hashtags"].explode().value_counts()

# Only consider tags that appear >= 5 times
tags = tag_counts[tag_counts >= 5].index.tolist()

length_buckets = defaultdict(list)
for t in tags:
    length_buckets[len(t)].append(t)

similar_pairs = []

for L, bucket in length_buckets.items():
    candidates = (
        bucket
        + length_buckets.get(L-1, [])
        + length_buckets.get(L+1, [])
        + length_buckets.get(L-2, [])
        + length_buckets.get(L+2, [])
    )

    for i, t1 in enumerate(bucket):
        for t2 in candidates[i+1:]:
            if fuzz.ratio(t1, t2) >= 92:
                similar_pairs.append((t1, t2))

print("Pairs found:", len(similar_pairs))

Pairs found: 384


In [94]:
graph = defaultdict(set)
for a,b in similar_pairs:
    graph[a].add(b)
    graph[b].add(a)

def find_clusters(graph):
    visited = set()
    clusters = []
    for node in graph:
        if node not in visited:
            stack=[node]
            group=set()
            while stack:
                u=stack.pop()
                if u not in visited:
                    visited.add(u)
                    group.add(u)
                    stack.extend(graph[u])
            clusters.append(group)
    return clusters

clusters = find_clusters(graph)
print("Clusters:", len(clusters))

Clusters: 163


In [95]:
canonical_map = {}

for group in clusters:
    canonical = tag_counts.loc[list(group)].idxmax()
    for tag in group:
        canonical_map[tag] = canonical

In [96]:
def fix_tag(tag):
    return canonical_map.get(tag, tag)

subset["hashtags"] = subset["hashtags"].apply(lambda tags: [fix_tag(t) for t in tags])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset["hashtags"] = subset["hashtags"].apply(lambda tags: [fix_tag(t) for t in tags])


In [97]:
after = set(subset["hashtags"].explode())
print("Unique AFTER:", len(after))

Unique AFTER: 28966


In [98]:
# Print the first 5 tweets and the hashtags
for i, row in subset.head(5).iterrows():
    print(row['clean_text'])
    print("Hashtags:", row['hashtags'])
    print()
    

2020 is the year we votethemout the year we climatestrike our hearts out the year we rebelforlife because without a liveable future nothing else matters 2020 is the year we get shit done 3 3 date 2019 12 31 location california usa sentiment negative
Hashtags: ['votethemout', 'climatestrike', 'rebelforlife']

winter has not stopped this group of dedicated climate activists they are an example to follow climatefriday climatestrike climateaction date 2019 12 27 location california usa sentiment positive
Hashtags: ['climatefriday', 'climatestrike', 'climateaction']

week 55 of climatestrike at the next week heads into its 3rd year of striking as our time on the streets gets longer we need you to act and do something for the climate in 2020 people must stop looking away and stop pretending this crisis doesn t exist at united nations date 2019 12 27 location california usa sentiment positive
Hashtags: ['climatestrike']

a year of resistance as youth protests shaped climate change discussions

In [99]:
# Print number of hashtags appearing more than 100 times
from collections import Counter
all_tags = subset["hashtags"].explode()
tag_counter = Counter(all_tags)
popular_tags = {tag: count for tag, count in tag_counter.items() if count > 20}
print("Hashtags appearing more than 20 times:", len(popular_tags))

Hashtags appearing more than 20 times: 804


In [100]:
# Print number of hashtags used more than 100 times
print("Hashtags appearing more than 100 times:", sum(1 for count in tag_counter.values() if count > 100))

Hashtags appearing more than 100 times: 139


In [101]:
import pandas as pd

# 1) Count hashtag frequencies across all tweets
all_tags = subset["hashtags"].explode()
tag_counts = all_tags.value_counts()

# 2) Build a set of hashtags we want to KEEP
#    - must appear at least 20 times
#    - must appear at most 100 times
#    - must not be exactly "climatechange"
keep_tags = set(
    tag_counts[(tag_counts >= 20) & (tag_counts <= 100)].index
) - {"climatechange"}

print(f"Unique hashtags before filtering: {all_tags.nunique()}")
print(f"Unique hashtags kept: {len(keep_tags)}")

# 3) Filter hashtags in each row
def filter_hashtags(tags):
    return [t for t in tags if t in keep_tags]

subset["hashtags"] = subset["hashtags"].apply(filter_hashtags)

# Optional: check how many tags remain
remaining_tags = subset["hashtags"].explode()
print(f"Unique hashtags after filtering: {remaining_tags.nunique()}")

Unique hashtags before filtering: 28966
Unique hashtags kept: 712
Unique hashtags after filtering: 712


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset["hashtags"] = subset["hashtags"].apply(filter_hashtags)


In [102]:
# Save the subset to a new CSV file
subset.to_csv('/Users/ame/02805_climate_conv/data/cleaned_twitter_embedded_data_hashtags_fixed.csv', index=False)

In [None]:
# Search the dataframe "message" column for URLs and extract them into a new column "urls"
import re
def extract_urls(text):
    url_pattern = r'(https?://\S+)'
    return re.findall(url_pattern, text)

data['urls'] = data['message'].apply(extract_urls)

In [None]:
import pandas as pd
import requests
from requests.exceptions import RequestException
from tqdm import tqdm
from urllib.parse import urlparse

# --------------------------------------------------
# Helper: validate URLs
# --------------------------------------------------
def clean_url(url):
    """Return a cleaned, valid URL or None if impossible."""
    if not isinstance(url, str):
        return None
    
    url = url.strip()

    # Remove accidental trailing dots
    while url.endswith("."):
        url = url[:-1]

    # Add scheme if missing
    if not url.startswith(("http://", "https://")):
        url = "http://" + url

    parsed = urlparse(url)

    # Must contain at least a domain
    if not parsed.netloc:
        return None

    # Detect double dots or malformed hostnames
    if ".." in parsed.netloc:
        return None

    return url


# --------------------------------------------------
# Helper: fetch page content
# --------------------------------------------------
def fetch_url(url, timeout=5):
    """Return HTML or None if failed."""
    try:
        resp = requests.get(url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
        if resp.status_code == 200:
            return resp.text
        return None
    except RequestException:
        return None


# --------------------------------------------------
# 1) Build mapping URL -> tweets
# --------------------------------------------------
url_to_tweets = {}

for tweet_id, url_list in zip(data["tweetid"], data["urls"]):
    urls = url_list if isinstance(url_list, list) else [url_list]

    for u in urls:
        cleaned = clean_url(u)
        if cleaned:
            url_to_tweets.setdefault(cleaned, []).append(tweet_id)

# Stats
total_urls_raw = sum(len(v if isinstance(v, list) else [v]) for v in data["url"].dropna())
unique_valid_urls = sorted(url_to_tweets.keys())
invalid_urls = total_urls_raw - len(unique_valid_urls)

print(f"Total URLs found: {total_urls_raw}")
print(f"Valid URLs: {len(unique_valid_urls)}")
print(f"Ignored invalid URLs: {invalid_urls}")


# --------------------------------------------------
# 2) Crawl valid URLs
# --------------------------------------------------
rows = []
success_count = 0
fail_count = 0

for url in tqdm(unique_valid_urls, desc="Crawling URLs"):
    html = fetch_url(url)

    if html is not None:
        success_count += 1
    else:
        fail_count += 1

    rows.append({
        "url": url,
        "page_content": html,
        "tweet_ids": url_to_tweets[url]
    })

# --------------------------------------------------
# 3) Build dataframe
# --------------------------------------------------
web = pd.DataFrame(rows)

print("\n===== Crawl Summary =====")
print(f"Valid unique URLs:        {len(unique_valid_urls)}")
print(f"Successfully crawled:     {success_count}")
print(f"Failed / unreachable:     {fail_count}")
print(f"Ignored invalid URLs:     {invalid_urls}")
print("==========================")

web.head()

KeyError: 'url'