In [1]:
import ast

import pandas as pd

# Read the crawled web content CSV into the `web` DataFrame
web = pd.read_csv(filepath_or_buffer='/Users/ame/02805_climate_conv/data/crawled_web_content.csv')

In [4]:
# Show the column labels, then the shape of the DataFrame, one per line
print(web.columns, web.shape, sep="\n")

Index(['url', 'page_content', 'tweet_ids'], dtype='object')
(6480, 3)


In [9]:
from newspaper import Article
import trafilatura

def extract_with_newspaper(html, url=None):
    """Try to extract article text from raw HTML using newspaper3k.

    Parameters
    ----------
    html : str
        Raw HTML of the page.
    url : str, optional
        Source URL handed to ``Article``. Anything that is not a non-empty
        string (``None``, or a NaN coming from a missing CSV cell) is replaced
        by an empty string.

    Returns
    -------
    str or None
        The stripped article text when it is longer than 50 characters,
        otherwise ``None`` (also when extraction raises an exception).
    """
    source_url = url if isinstance(url, str) and url else ""
    try:
        article = Article(source_url)
        article.set_html(html)
        article.parse()
        text = article.text.strip()
    except Exception:
        # Best-effort extraction: a page that cannot be parsed is skipped.
        # Catching Exception (not a bare except) keeps KeyboardInterrupt and
        # SystemExit working, so a long-running batch can still be stopped.
        return None
    if len(text) > 50:
        return text
    return None

def extract_with_trafilatura(html):
    """Fallback: extract the main text from raw HTML using trafilatura.

    Comments and tables are excluded from the extraction.

    Parameters
    ----------
    html : str
        Raw HTML of the page.

    Returns
    -------
    str or None
        The stripped text when it is longer than 50 characters, otherwise
        ``None`` (also when extraction raises an exception).
    """
    try:
        text = trafilatura.extract(html, include_comments=False, include_tables=False)
    except Exception:
        # Best-effort extraction: a page that cannot be parsed is skipped.
        # Catching Exception (not a bare except) keeps KeyboardInterrupt and
        # SystemExit working, so a long-running batch can still be stopped.
        return None
    if not text:
        return None
    stripped = text.strip()
    if len(stripped) > 50:
        return stripped
    return None

def clean_page(html, url=None):
    """Run the cleaning pipeline on one page: newspaper3k first, then trafilatura.

    Input that is not a string, or is shorter than 50 characters, is rejected
    up front. Otherwise the first extractor that yields text wins; the
    trafilatura fallback is only called when newspaper3k returns nothing.
    Returns the extracted text, or None when neither extractor produced any.
    """
    unusable = not isinstance(html, str) or len(html) < 50
    if unusable:
        return None

    # `or` short-circuits, so the fallback runs only if the first try is empty
    return extract_with_newspaper(html, url) or extract_with_trafilatura(html) or None

In [10]:
from tqdm import tqdm

# Pair every page's HTML with its URL and wrap the pairs in a progress bar
page_url_pairs = zip(web["page_content"], web["url"])
progress = tqdm(page_url_pairs, total=len(web), desc="Extracting clean text")

# Run the cleaning pipeline on each page, keeping results in row order
clean_texts = [clean_page(html, url) for html, url in progress]

web["clean_text"] = clean_texts

Extracting clean text: 100%|██████████| 6480/6480 [09:50<00:00, 10.98it/s]  


In [12]:
# Boolean mask: True where text extraction produced a value
has_text = web["clean_text"].notna()

usable = has_text.sum()
empty = (~has_text).sum()

print("Usable cleaned pages:", usable)
print("Empty or unreadable pages:", empty)

Usable cleaned pages: 2379
Empty or unreadable pages: 4101


In [15]:
# Show the URL of the first row (positional lookup, independent of index labels)
web["url"].iloc[0]

'http://2066.uk'

In [16]:
# Write the DataFrame to CSV without the index column
web.to_csv(path_or_buf='/Users/ame/02805_climate_conv/data/cleaned_web_content.csv', index=False)

In [18]:
# Inspect the column labels of `web`
web.columns

Index(['url', 'page_content', 'tweet_ids', 'clean_text'], dtype='object')

In [19]:
# Preview the first ten entries of the tweet_ids column
web["tweet_ids"].head(10)

0                         [33863, 33865, 36194]
1                                        [9141]
2                                       [35230]
3                                       [22941]
4                                [34129, 36815]
5                                       [34125]
6                                       [24819]
7                                       [23512]
8    [16074, 16195, 23434, 23435, 23450, 23490]
9                                       [35146]
Name: tweet_ids, dtype: object

In [23]:
# Reload the cleaned pages from the CSV written earlier
cleaned_web = pd.read_csv(filepath_or_buffer='/Users/ame/02805_climate_conv/data/cleaned_web_content.csv')

In [36]:
def to_list(x):
    """Convert one ``tweet_ids`` cell into a real Python list.

    Parameters
    ----------
    x : list, tuple, str, or scalar
        A list (returned unchanged), a tuple, the string repr of a list or
        tuple as stored in a CSV cell, or a missing/other value.

    Returns
    -------
    list
        The parsed list. An empty list is returned for missing values, for
        strings that are not valid Python literals, and for literals that are
        not a list or tuple, so that ``len`` can always be applied to the
        result.

    Notes
    -----
    Requires the standard-library ``ast`` module to be imported. Only
    parse-related errors are caught, so a missing import raises ``NameError``
    instead of silently turning every row into ``[]``.
    """
    if isinstance(x, list):
        return x
    if isinstance(x, tuple):
        return list(x)
    if not isinstance(x, str):
        # NaN/None from empty CSV cells, or any other non-string scalar
        return []
    try:
        # Turns the string repr of a list into a real list (literals only)
        parsed = ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # The error types literal_eval documents for malformed input
        return []
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # A valid literal that is not a sequence (e.g. "5") has no ids to offer
    return []

# Replace each stored tweet_ids value with the list produced by to_list
cleaned_web["tweet_ids"] = cleaned_web["tweet_ids"].map(to_list)

In [37]:
# Number of tweet ids attached to every page
cleaned_web["tweet_ids_count"] = cleaned_web["tweet_ids"].map(len)

# Keep the ten rows with the highest counts
top_10 = cleaned_web.nlargest(n=10, columns='tweet_ids_count')

# Report each URL together with its count
print("Top 10 pages with the most tweet_ids:\n")
for page_url, n_ids in zip(top_10["url"], top_10["tweet_ids_count"]):
    print(f"URL: {page_url}")
    print(f"  Tweet IDs count: {n_ids}\n")

Top 10 pages with the most tweet_ids:

URL: http://strikewithus.org
  Tweet IDs count: 35

URL: https://spoti.fi/31be3fm
  Tweet IDs count: 19

URL: http://sealevel.climatecentral.org/maps
  Tweet IDs count: 18

URL: https://youtu.be/ehg2h9ryplq
  Tweet IDs count: 12

URL: https://eos.org/articles/dinosaurs-roar-again-now-including-a-focus-on-climate-change
  Tweet IDs count: 11

URL: http://webtv.un.org
  Tweet IDs count: 10

URL: https://neuage.org/e-books/
  Tweet IDs count: 10

URL: http://mgalleries.org
  Tweet IDs count: 9

URL: https://youtu.be/xlzfg-ir1k4
  Tweet IDs count: 9

URL: https://globalclimatestrike.net/
  Tweet IDs count: 8

