In [1]:
from datetime import datetime, timedelta
import pickle
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import re

from tldextract import extract
import langid

# Show full cell contents when frames are displayed. `None` is the supported
# "no truncation" value; the legacy -1 sentinel has been deprecated since
# pandas 1.0 and emits a FutureWarning there.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 50)

from newspaper import Article

  pd.set_option('display.max_colwidth', -1)


NOTE: The Mediacloud state-specific collection contains news ABOUT the particular state, so it may occasionally include articles from national outlets when the article itself focuses on the state. Most (if not all) of the collection is sourced from local, state-specific outlets, but not always. Currently, we keep every URL in the state's collection except those from nytimes and foxnews, since those are our chosen national outlets. For everything else, we trust Mediacloud's collections to be truly state-focused, even when a URL comes from a national outlet.

In [2]:
# Inclusive bounds of the collection window, parsed into `datetime.date` objects.
start_date, end_date = (
    datetime.strptime(day, '%Y-%m-%d').date()
    for day in ('2023-04-01', '2023-06-30')
)

In [3]:
# Load the raw Mediacloud export for Florida and summarize its columns.
source_csv = 'data/mediacloud_florida.csv'
florida_news = pd.read_csv(source_csv)
print(florida_news.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4253 entries, 0 to 4252
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ap_syndicated         4253 non-null   bool   
 1   collect_date          4253 non-null   object 
 2   feeds                 0 non-null      float64
 3   guid                  4253 non-null   object 
 4   language              4253 non-null   object 
 5   media_id              4253 non-null   int64  
 6   media_name            4253 non-null   object 
 7   media_url             4253 non-null   object 
 8   metadata              4253 non-null   object 
 9   processed_stories_id  4253 non-null   int64  
 10  publish_date          4253 non-null   object 
 11  stories_id            4253 non-null   int64  
 12  story_tags            4253 non-null   object 
 13  title                 4253 non-null   object 
 14  url                   4253 non-null   object 
 15  word_count           

In [4]:
# Peek at the last 20 stories, restricted to the human-readable columns.
preview_columns = ['language', 'media_name', 'media_url', 'publish_date', 'title', 'url']
florida_news.tail(20)[preview_columns]

Unnamed: 0,language,media_name,media_url,publish_date,title,url
4233,en,Boca Raton Forum,http://www.sun-sentinel.com/news/palm-beach/boca-raton/,2023-06-30 18:43:07,Florida medical boards approve rules for trans care,https://www.sun-sentinel.com/2023/06/30/florida-medical-boards-approve-rules-for-trans-care/
4234,en,Wellington Forum,http://www.sun-sentinel.com/news/palm-beach/wellington/,2023-06-30 18:43:07,Florida medical boards approve rules for trans care,https://www.sun-sentinel.com/2023/06/30/florida-medical-boards-approve-rules-for-trans-care/
4235,en,Deerfield Forum,http://www.sun-sentinel.com/news/broward/deerfield/,2023-06-30 18:43:07,Florida medical boards approve rules for trans care,https://www.sun-sentinel.com/2023/06/30/florida-medical-boards-approve-rules-for-trans-care/
4236,en,Fort Lauderdale News & Events - Sun Sentinel,http://www.sun-sentinel.com/news/broward/fort-lauderdale/,2023-06-30 18:43:07,Florida medical boards approve rules for trans care,https://www.sun-sentinel.com/2023/06/30/florida-medical-boards-approve-rules-for-trans-care/
4237,en,watermarkonline.com,http://watermarkonline.com/,2023-06-30 09:17:44,Marriage ruling looms over Pride celebrations in India,https://watermarkonline.com/2023/06/30/marriage-ruling-looms-over-pride-celebrations-in-india/
4238,en,wesh.com,http://www.wesh.com,2023-06-30 20:01:00,What the Supreme Court's LGBTQ rights decision means,https://www.wesh.com/article/supreme-court-s-lgbtq-rights-decision/44403263
4239,en,wpbf.com,http://wpbf.com/,2023-06-30 20:01:00,What the Supreme Court's LGBTQ rights decision means,https://www.wpbf.com/article/supreme-court-s-lgbtq-rights-decision/44403263
4240,en,baynews9.com,http://www.baynews9.com,2023-06-30 03:37:00,Local drag queen ponders new law restricting performances,http://www.baynews9.com/fl/tampa/news/2023/06/30/local-drag-queen-ponders-new-law-restricting-performances
4241,en,clickorlando.com,http://www.clickorlando.com,2023-06-30 20:09:19,Florida medical boards approve rules for transgender treatments,https://www.clickorlando.com/news/2023/07/01/florida-medical-boards-approve-rules-for-transgender-treatments/
4242,en,biscayne times online,http://biscaynetimes.com/,2023-06-30 15:10:46,Web Designer's Supreme Court Victory Limits LGBTQ Protections,https://www.biscaynetimes.com/noteworthy/web-designer-s-supreme-court-victory-limits-lgbtq-protection/


In [7]:
# Drop the two national outlets, keep English-language stories, and narrow
# the frame to the columns used downstream.
relevant_news = (
    florida_news
    .loc[lambda news: ~news['media_name'].isin(['Fox News', 'New York Times'])]
    .loc[lambda news: news['language'] == 'en']
    [['media_name', 'publish_date', 'title', 'url']]
)
print(relevant_news.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4189 entries, 0 to 4252
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   media_name    4189 non-null   object
 1   publish_date  4189 non-null   object
 2   title         4189 non-null   object
 3   url           4189 non-null   object
dtypes: object(4)
memory usage: 163.6+ KB
None


In [8]:
# Parse publish timestamps, then keep only stories whose calendar date falls
# inside the [start_date, end_date] window (both ends inclusive).
relevant_news['publish_date'] = pd.to_datetime(relevant_news['publish_date'])
published_on = relevant_news['publish_date'].dt.date
relevant_news = relevant_news[(published_on >= start_date) & (published_on <= end_date)]
print(relevant_news.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4189 entries, 0 to 4252
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   media_name    4189 non-null   object        
 1   publish_date  4189 non-null   datetime64[ns]
 2   title         4189 non-null   object        
 3   url           4189 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 163.6+ KB
None


In [10]:
def _none_if_blank(value):
    """Return ``value`` unchanged, or None when it is an empty/whitespace-only string."""
    if isinstance(value, str) and not value.strip():
        return None
    return value


def get_article_from_url(url):
    """Download and parse the article at ``url``.

    Returns a ``(subtitle, text)`` tuple, where ``subtitle`` is the parsed
    article's ``meta_description`` and ``text`` is its ``text`` attribute.
    Either element is None when it is empty or whitespace-only, and both are
    None when downloading or parsing raises an error.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        subtitle = _none_if_blank(article.meta_description)
        text = _none_if_blank(article.text)
    except Exception:
        # Best-effort scrape: one failed fetch must not abort the whole run.
        # Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate, so a long crawl can
        # still be stopped by the user.
        subtitle = None
        text = None
    return subtitle, text

In [11]:
# Fetch every article once; each call yields a (subtitle, text) pair.
all_urls = relevant_news['url'].tolist()
fetched = [get_article_from_url(link) for link in tqdm(all_urls)]
subtitles = [pair[0] for pair in fetched]
texts = [pair[1] for pair in fetched]
relevant_news['subtitle'] = subtitles
relevant_news['text'] = texts
print(relevant_news.info())

100%|██████████| 4189/4189 [44:36<00:00,  1.57it/s]  

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4189 entries, 0 to 4252
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   media_name    4189 non-null   object        
 1   publish_date  4189 non-null   datetime64[ns]
 2   title         4189 non-null   object        
 3   url           4189 non-null   object        
 4   subtitle      3731 non-null   object        
 5   text          3899 non-null   object        
dtypes: datetime64[ns](1), object(5)
memory usage: 229.1+ KB
None





In [12]:
# Persist the scraped articles (without the pandas index) for later stages.
output_csv = 'data/florida_article_texts_and_info.csv'
relevant_news.to_csv(output_csv, index=False)