#### Data Collection
In this notebook I collect news articles from the MediaStack API to use as training data for fine-tuning the GPT-2 model.

In [1]:
import pandas as pd
from mediastack import get_news

In [2]:
## Collect English-language news from the U.S.: first the most recent articles,
## then the most popular ones. Each pass requests 100 pages of 100 articles.
##
## NOTE: the MediaStack query parameter is spelled "languages". The previous
## spelling ("langauges") is not a documented parameter, so the language filter
## was most likely not being applied. This assumes get_news forwards `params`
## to the API unchanged -- TODO confirm against the mediastack helper.
data = []
for sort_order in ("published_desc", "popularity"):
    for page in range(100):
        params = {
            "languages": "en",
            "countries": "us",
            "sort": sort_order,
            "limit": 100,
            # Offset is counted in articles, so advance one full page at a time
            "offset": page * 100,
        }
        data.extend(get_news(params))

In [3]:
## Tabulate everything fetched so far, then discard rows that are exact repeats
## (the "recent" and "popular" passes can return the same article).
raw_frame = pd.DataFrame(data)
df = raw_frame.drop_duplicates()

In [4]:
## Create the data folder if it doesn't exist.
## makedirs(..., exist_ok=True) does the check and the creation in one call, so
## there is no gap between "does it exist?" and "create it" and the cell can be
## re-run safely.
import os
os.makedirs("data", exist_ok=True)

In [5]:
## Preview the first five de-duplicated rows
df.head(n=5)

Unnamed: 0,author,title,description,url,source,image,category,language,country,published_at
0,CNN Newssource,Bernie Sanders launches Senate probe into Amaz...,"By Catherine Thorbecke, CNN (CNN) &#8212; Sen....",https://krdo.com/money/cnn-business-consumer/2...,krdo,,general,en,us,2023-06-21T02:03:53+00:00
1,Tech. Sgt. Emili Koonce,New commander accepts 8th OSS guidon [Image 6 ...,"Lt. Col. Jonathan Kay, 8th Operations Support ...",https://www.dvidshub.net/image/7872081/new-com...,dvidshub,https://cdn.dvidshub.net/media/thumbs/photos/2...,general,en,us,2023-06-21T02:02:57+00:00
2,Patrick Ryan | USA Today,"‘And Just Like That...’ stars talk Season 2, K...",Sarah Jessica Parker and John Corbett are show...,https://chicago.suntimes.com/movies-and-tv/202...,suntimes,,general,en,us,2023-06-21T02:02:55+00:00
3,"Jim Ryan, Contributor",Dead And Company Put Their Spin On Motown As F...,"On stage in Chicago, with Blues Fest taking pl...",https://www.forbes.com/sites/jimryan1/2023/06/...,Forbes,https://imageio.forbes.com/specials-images/ima...,general,en,us,2023-06-21T02:02:53+00:00
4,Tech. Sgt. Emili Koonce,New commander accepts 8th OSS guidon [Image 5 ...,"Lt. Col. John Ryan, left, 8th Operations Group...",https://www.dvidshub.net/image/7872080/new-com...,dvidshub,https://cdn.dvidshub.net/media/thumbs/photos/2...,general,en,us,2023-06-21T02:02:16+00:00


In [6]:
## How many articles per category? Missing categories are counted too (dropna=False).
category_column = df['category']
category_column.value_counts(dropna=False)

category
general          9327
sports           4475
business         4415
science           480
technology        274
entertainment     228
politics           35
health              7
Name: count, dtype: int64

In [7]:
## Health, politics, entertainment, technology, and science are underrepresented
## in the counts above, so fetch more articles restricted to those categories.
## This keeps the training data from being too imbalanced.
##
## NOTE: the MediaStack query parameter is spelled "languages". The previous
## spelling ("langauges") is not a documented parameter, so the language filter
## was most likely not being applied. This assumes get_news forwards `params`
## to the API unchanged -- TODO confirm against the mediastack helper.
for sort_order in ("popularity", "published_desc"):
    for page in range(25):
        params = {
            "languages": "en",
            "countries": "us",
            "categories": "health,politics,entertainment,technology,science",
            "sort": sort_order,
            "limit": 100,
            # Offset is counted in articles, so advance one full page at a time
            "offset": page * 100,
        }
        data.extend(get_news(params))

In [8]:
## Category balance over everything fetched so far.
## Note: duplicates are NOT removed here, so repeated articles are counted more than once.
all_fetched = pd.DataFrame(data)
all_fetched['category'].value_counts()

category
general          9387
sports           4773
business         4746
science          2642
technology       1671
entertainment    1490
politics          241
health             50
Name: count, dtype: int64

In [15]:
## Politics and health are still underrepresented, so fetch more articles
## restricted to those two categories.
##
## NOTE: the MediaStack query parameter is spelled "languages". The previous
## spelling ("langauges") is not a documented parameter, so the language filter
## was most likely not being applied. This assumes get_news forwards `params`
## to the API unchanged -- TODO confirm against the mediastack helper.
for sort_order in ("popularity", "published_desc"):
    for page in range(50):
        params = {
            "languages": "en",
            "countries": "us",
            "categories": "politics,health",
            "sort": sort_order,
            "limit": 100,
            # Offset is counted in articles, so advance one full page at a time
            "offset": page * 100,
        }
        data.extend(get_news(params))

In [16]:
## Category balance after removing duplicate rows
deduped_news = pd.DataFrame(data).drop_duplicates()
deduped_news['category'].value_counts()

category
general          9327
sports           4475
business         4415
science          2019
technology       1322
entertainment    1221
politics          693
health            153
Name: count, dtype: int64

In [17]:
## Rebuild the frame from every article fetched, drop duplicate rows, and save it
## to the data folder without the index column.
final_news = pd.DataFrame(data).drop_duplicates()
final_news.to_csv("data/english_news.csv", index=False)