In [1]:
from dotenv import load_dotenv
import os
import pandas as pd
load_dotenv()
from gdtm.helpers.common import load_flat_dataset

# Project root directory, taken from the BASEDIR environment variable
# (None when the variable is not set).
base_dir = os.environ.get('BASEDIR')

In [45]:
from tqdm import tqdm
# Register tqdm's pandas integration; this is what provides the
# `progress_apply` method used on the news-source column further down.
tqdm.pandas()

In [9]:
# Read the raw CSV stored at <base_dir>/data/01_raw/allsides_data.csv.
data = pd.read_csv(
    filepath_or_buffer=os.path.join(base_dir, 'data', '01_raw', 'allsides_data.csv'),
)

In [10]:
# Keep only the rows that have a value in `rating_num`. The explicit copy
# makes `data` an independent frame, so the column assignments performed in
# later cells do not trigger pandas' SettingWithCopyWarning.
data = data[data['rating_num'].notna()].copy()

In [40]:
import requests
from bs4 import BeautifulSoup
import re
def get_domain(news_source):
    """Return the URL of the first Google search hit for ``news_source``.

    Sends ``news_source`` as the ``q`` parameter of a Google search, collects
    the result anchors whose ``href`` contains ``/url?q=<target>`` and returns
    the decoded ``<target>`` of the first one.

    Args:
        news_source: Search text to look up.

    Returns:
        str: Target URL of the first matching result link.

    Raises:
        IndexError: If the response contains no matching result link.
        requests.exceptions.RequestException: If the HTTP request fails or
            exceeds the timeout.
    """
    # Imported locally so the function does not depend on a later cell.
    from urllib.parse import parse_qs, urlsplit

    # Passing the query through ``params`` percent-encodes it. Formatting it
    # into the URL by hand turns a "+" in the name into a space and lets
    # characters such as "&" or "#" cut the query short.
    page = requests.get(
        "https://www.google.com/search",
        params={"q": news_source},
        timeout=30,  # do not let one stalled request hang the whole run
    )
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(page.content, "html.parser")
    links = soup.find_all("a", href=re.compile(r"(?<=/url\?q=)(htt.*://.*)"))
    if not links:
        raise IndexError(
            "no search result link found for {!r}".format(news_source)
        )

    href = links[0]["href"]
    # The destination is the percent-encoded ``q`` parameter of the redirect
    # link; parse_qs decodes it and drops the trailing tracking parameters.
    targets = parse_qs(urlsplit(href).query).get("q")
    if targets:
        return targets[0]
    # Fallback for hrefs whose query string carries no ``q`` key: strip the
    # redirect prefix by hand.
    return re.split(r":(?=http)", href.replace("/url?q=", ""))[0]

In [50]:
# Show the distinct values found in the `type` column.
data.loc[:, 'type'].unique()

array(['Think Tank / Policy Group', 'News Media', 'Author'], dtype=object)

In [41]:
# Display the `news_source` column.
data.loc[:, 'news_source']

0                   AARP
1               ABC News
3      Accuracy in Media
4                   ACLU
5                    AJ+
             ...        
541       William McGurn
542    Wisconsin Gazette
543              WND.com
544          Yahoo! News
546        Yes! Magazine
Name: news_source, Length: 533, dtype: object

In [42]:
# Try the lookup on a single source name before applying it to the whole column.
temp = get_domain('AARP')

In [46]:
# Look up a URL for every news source, calling get_domain once per row with a
# progress bar (the recorded run took about 17 minutes for 533 rows).
temp = data['news_source'].progress_apply(get_domain)

100%|██████████| 533/533 [17:02<00:00,  1.92s/it]


In [51]:
# Attach the looked-up URLs (aligned on the index) as the `news_url` column.
data = data.assign(news_url=temp)

In [61]:
from urllib.parse import urljoin, urlparse
def remove_url_params(url):
    """Return the ``netloc`` component of ``url`` as parsed by ``urlparse``."""
    parsed = urlparse(url)
    return parsed.netloc

In [66]:
# Store the netloc part of every looked-up URL in the `netloc` column.
data['netloc'] = data['news_url'].map(remove_url_params)

In [67]:
import tldextract
def extract_domain(url):
    """Return the ``domain`` and ``suffix`` that tldextract finds in ``url``.

    The two parts are joined with a ".". A part that tldextract reports as
    empty is skipped, so an input without a recognised suffix gives just the
    domain and an input with neither part gives an empty string, instead of a
    result with a dangling ".".

    Args:
        url: URL or host name to analyse.

    Returns:
        str: ``"<domain>.<suffix>"``, or whichever of the parts is non-empty.
    """
    ext = tldextract.extract(url)
    # Drop empty components before joining to avoid "name." or "." results.
    return '.'.join(part for part in (ext.domain, ext.suffix) if part)

In [68]:
# Derive the `domain` column from every looked-up URL via extract_domain.
data['domain'] = data['news_url'].map(extract_domain)

In [70]:
# Shift the rating down by 3, so a `rating_num` of 3 becomes a stance of 0.
data['stance'] = data['rating_num'].sub(3)

In [85]:
# Keep only the four columns needed from here on.
filtered_data = data.loc[:, ['news_source', 'domain', 'stance', 'type']]

In [88]:
# Discard the rows whose `type` is 'Author'.
filtered_data = filtered_data.loc[filtered_data['type'].ne('Author')]

In [89]:
# Index holding the 20 most frequent `domain` values, most frequent first.
bad_domains = (
    filtered_data['domain']
    .value_counts()
    .sort_values(ascending=False)
    .head(20)
    .index
)

In [90]:
# Print each (index, row) pair whose domain is listed in `bad_domains`.
for row in filtered_data.loc[filtered_data['domain'].isin(bad_domains)].iterrows():
    print(row)

(5, news_source              AJ+
domain         wikipedia.org
stance                  -2.0
type              News Media
Name: 5, dtype: object)
(13, news_source    American Enterprise Institute
domain                               aei.org
stance                                   1.0
type               Think Tank / Policy Group
Name: 13, dtype: object)
(18, news_source     Americans for Tax Reform
domain                     wikipedia.org
stance                               2.0
type           Think Tank / Policy Group
Name: 18, dtype: object)
(29, news_source    Association for Psychological Science
domain                                  google.co.in
stance                                           0.0
type                       Think Tank / Policy Group
Name: 29, dtype: object)
(49, news_source       Boston Herald
domain         bostonherald.com
stance                      1.0
type                 News Media
Name: 49, dtype: object)
(50, news_source    Boston Herald Editorial
domain  

In [95]:
# Display the rows whose domain is listed in `bad_domains`.
filtered_data.loc[filtered_data['domain'].isin(bad_domains)]

Unnamed: 0,news_source,domain,stance,type
5,AJ+,wikipedia.org,-2.0,News Media
13,American Enterprise Institute,aei.org,1.0,Think Tank / Policy Group
18,Americans for Tax Reform,wikipedia.org,2.0,Think Tank / Policy Group
29,Association for Psychological Science,google.co.in,0.0,Think Tank / Policy Group
49,Boston Herald,bostonherald.com,1.0,News Media
50,Boston Herald Editorial,bostonherald.com,1.0,News Media
69,Center - Major Media Sources,forbes.com,0.0,News Media
95,CNN (Web News),cnn.com,-1.0,News Media
96,CNN - Editorial,cnn.com,-2.0,News Media
97,CNS News,cnsnews.com,2.0,News Media


In [94]:
# Write the source / domain / stance columns to
# <base_dir>/data/02_processed/allsides_w_domains.csv.
filtered_data.loc[:, ['news_source', 'domain', 'stance']].to_csv(
    os.path.join(base_dir, 'data', '02_processed', 'allsides_w_domains.csv')
)