In [1]:
from datetime import datetime, timedelta
import pickle
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import re

from tldextract import extract
import langid

# Show full cell contents (titles/URLs are long) but cap the number of rows.
# `None` is the supported way to disable column-width truncation; the old
# `-1` sentinel is deprecated in pandas 1.x and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 50)

from newspaper import Article

  pd.set_option('display.max_colwidth', -1)


NOTE: Media Cloud's state-specific collections contain news ABOUT a particular state, so they sometimes include articles from national outlets when the article focuses on that state. Most (if not all) of the collection is sourced from local, state-specific outlets, but not always. Currently, we keep every URL in the collection obtained for the state, except those from nytimes and foxnews, since those are our chosen national outlets. We trust Media Cloud's collections to be truly state-focused even when URLs come from national outlets.

In [2]:
# Inclusive publication-date window for the study (2023 Q2), as `date` objects.
start_date, end_date = (
    datetime.strptime(day, '%Y-%m-%d').date()
    for day in ('2023-04-01', '2023-06-30')
)

In [3]:
# Load the Media Cloud story export for the New York collection.
ny_news = pd.read_csv('data/mediacloud_newyork.csv')
# `DataFrame.info()` prints its own report and returns None, so wrapping it
# in `print()` only appends a stray "None" line to the output.
ny_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5088 entries, 0 to 5087
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ap_syndicated         5088 non-null   bool   
 1   collect_date          5088 non-null   object 
 2   feeds                 0 non-null      float64
 3   guid                  5088 non-null   object 
 4   language              5088 non-null   object 
 5   media_id              5088 non-null   int64  
 6   media_name            5088 non-null   object 
 7   media_url             5088 non-null   object 
 8   metadata              5088 non-null   object 
 9   processed_stories_id  5088 non-null   int64  
 10  publish_date          5088 non-null   object 
 11  stories_id            5088 non-null   int64  
 12  story_tags            5088 non-null   object 
 13  title                 5088 non-null   object 
 14  url                   5088 non-null   object 
 15  word_count           

In [4]:
# Eyeball the most recent stories: outlet, timestamp, headline and link.
ny_news.loc[
    :,
    ['language', 'media_name', 'media_url', 'publish_date', 'title', 'url'],
].tail(20)

Unnamed: 0,language,media_name,media_url,publish_date,title,url
5068,en,Newsday,http://www.newsday.com/,2023-06-30 18:00:00,Fair medical debt act would help many in state,https://www.newsday.com/opinion/letters/medical-debt-pride-lgbtqia-rights-ji10m9i5
5069,en,indypendent.org,http://www.indypendent.org/#spider,2023-06-30 15:32:57,Queer Liberation March is Both the Past and the Future of Pride,https://indypendent.org/2023/06/queer-liberation-march-is-both-the-past-and-the-future-of-pride/
5070,en,NNY Now,http://www.nnynow.com/,2023-06-30 17:03:16,Georgia families sue state over ban on certain gender-affirming treatments for trans youth,https://www.cnn.com/2023/06/30/politics/georgia-lawsuit-gender-affirming-care-ban/index.html
5071,en,NNY Now,http://www.nnynow.com/,2023-06-30 17:50:14,What the Supreme Court's LGBTQ rights decision means,https://www.cnn.com/2023/06/30/politics/lgbtq-rights-public-accommodations-laws-supreme-court/index.html
5072,en,whec.com,http://www.whec.com,2023-06-30 18:21:16,Wider than websites? LGBTQ+ advocates fear broader discrimination after Supreme Court ruling,https://www.whec.com/national-world/wider-than-websites-lgbtq-advocates-fear-broader-discrimination-after-supreme-court-ruling/
5073,en,Newsday,http://www.newsday.com/,2023-06-30 18:23:18,Wider than websites? LGBTQ+ advocates fear broader discrimination after Supreme Court ruling,https://www.newsday.com/business/wider-than-websites-lgbtq-advocates-fear-broader-discrimination-after-supreme-court-ruling-npgy0hba
5074,en,newzjunky.com,http://newzjunky.com/,2023-06-30 18:19:08,‘Get out of my city’: DeSantis calls them ‘mama bears.’ Protesters call Moms for Liberty extremists,https://www.usatoday.com/story/news/politics/2023/06/30/moms-for-liberty-convention-philadelphia-protests/70363926007/
5075,en,Epoch Times,http://www.theepochtimes.com,2023-06-30 19:05:08,1619 Project’s Reparation Math Curriculum Will Be Harmful: Author,https://www.theepochtimes.com/1619-projects-reparation-math-curriculum-will-be-harmful-author_5368332.html
5076,en,New York Amsterdam News,http://www.amsterdamnews.com,2023-06-30 12:40:12,Rikers Island detainees celebrate Pride 2023,https://amsterdamnews.com/news/2023/06/30/rikers-island-detainees-celebrate-pride-2023/
5077,en,Irish Voice,http://www.irishcentral.com/IrishVoice/,2023-06-30 06:59:52,Irish LGBTQ group marches as part of New York Pride,https://www.irishcentral.com/news/community/irish-lgbtq-new-york-pride


In [5]:
# Keep English-language stories from every outlet except our two national
# outlets, and only the columns needed downstream.
# NOTE(review): this filters on Media Cloud's `media_name`, not on the URL
# domain, so a story attributed to a local outlet but linking to another
# site's URL is kept -- confirm that is the intended behaviour.
is_national_outlet = ny_news['media_name'].isin(['Fox News', 'New York Times'])
is_english = ny_news['language'] == 'en'
relevant_news = ny_news.loc[
    ~is_national_outlet & is_english,
    ['media_name', 'publish_date', 'title', 'url'],
].copy()  # explicit copy: later cells modify this frame, not a view of ny_news
# `info()` prints its report itself and returns None; do not wrap it in print().
relevant_news.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4645 entries, 0 to 5087
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   media_name    4645 non-null   object
 1   publish_date  4645 non-null   object
 2   title         4645 non-null   object
 3   url           4645 non-null   object
dtypes: object(4)
memory usage: 181.4+ KB
None


In [6]:
# Parse the publish timestamps and keep only stories whose calendar date
# falls inside the inclusive [start_date, end_date] window.
publish_dates = pd.to_datetime(relevant_news['publish_date'])
publish_days = publish_dates.dt.date
in_window = (publish_days >= start_date) & (publish_days <= end_date)
# Build a new frame instead of assigning through attribute access on a
# filtered frame; `.copy()` keeps later column additions warning-free.
relevant_news = relevant_news.assign(publish_date=publish_dates)[in_window].copy()
# `info()` prints its report itself and returns None; do not wrap it in print().
relevant_news.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4645 entries, 0 to 5087
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   media_name    4645 non-null   object        
 1   publish_date  4645 non-null   datetime64[ns]
 2   title         4645 non-null   object        
 3   url           4645 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 181.4+ KB
None


In [7]:
def _blank_to_none(value):
    """Return ``value`` unchanged, or ``None`` if it is missing or only whitespace."""
    if value is None or not value.strip():
        return None
    return value


def get_article_from_url(url):
    """Download and parse one article, returning its subtitle and body text.

    Parameters
    ----------
    url : str
        Address of the article to fetch.

    Returns
    -------
    tuple
        ``(subtitle, text)`` where ``subtitle`` is the page's meta description
        and ``text`` is the extracted article body. Either element is ``None``
        when the value is empty or whitespace-only; both are ``None`` when the
        download or parse fails.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        subtitle = _blank_to_none(article.meta_description)
        text = _blank_to_none(article.text)
    except Exception:
        # Best-effort scrape: a failed fetch/parse yields missing values rather
        # than aborting the whole run. Catch `Exception` (not a bare `except`)
        # so Ctrl-C / kernel interrupt still stops a long-running loop.
        subtitle = None
        text = None
    return subtitle, text

In [8]:
# Fetch every article once, in row order, so results line up with the frame.
# This is the slow step (over an hour for ~4.6k URLs in the recorded run).
all_urls = list(relevant_news['url'])
fetched = [get_article_from_url(url) for url in tqdm(all_urls)]
subtitles = [subtitle for subtitle, _ in fetched]
texts = [text for _, text in fetched]
# `assign` returns a new frame, avoiding SettingWithCopyWarning from adding
# columns to a frame that was produced by filtering another one.
relevant_news = relevant_news.assign(subtitle=subtitles, text=texts)
# `info()` prints its report itself and returns None; do not wrap it in print().
relevant_news.info()

100%|██████████| 4645/4645 [1:15:40<00:00,  1.02it/s]  

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4645 entries, 0 to 5087
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   media_name    4645 non-null   object        
 1   publish_date  4645 non-null   datetime64[ns]
 2   title         4645 non-null   object        
 3   url           4645 non-null   object        
 4   subtitle      2802 non-null   object        
 5   text          3819 non-null   object        
dtypes: datetime64[ns](1), object(5)
memory usage: 254.0+ KB
None





In [9]:
# Persist the scraped articles with their metadata; the row index is not needed.
relevant_news.to_csv('data/newyork_article_texts_and_info.csv', index=False)