# Downloading the Data using the Guardian Open Platform API

In [10]:
import os

import pandas as pd
import requests

In [11]:
# Let's define API key and endpoint.
# The key is taken from the GUARDIAN_API_KEY environment variable when it is set
# (and non-empty), so a personal key can be used without editing the notebook.
# NOTE(review): the literal fallback is a credential stored in plain text; it is
# kept only so existing runs keep working — consider revoking it and dropping
# the fallback.
API_KEY = os.environ.get('GUARDIAN_API_KEY') or '8692185a-3471-470c-99b7-9c798186ce67'  # change the key
BASE_URL = 'https://content.guardianapis.com/search'

In [12]:
# Query parameters sent with every search request for editorials
params = dict([
    ('section', 'commentisfree'),      # section the editorials are published in
    ('from-date', '2024-01-01'),       # first publication date to include
    ('to-date', '2024-11-30'),         # last publication date to include
    ('page-size', 50),                 # articles per page (max 50 for free plan)
    ('show-fields', 'headline,body'),  # ask for the headline and the article body
    ('api-key', API_KEY),              # key defined in the configuration cell
])

In [13]:
# Let's write the function to fetch articles
def fetch_articles(api_url, params, max_pages=100, timeout=30):
    """Download search results page by page and flatten them into records.

    Args:
        api_url: Endpoint queried with ``requests.get``.
        params: Base query parameters. The mapping is copied, so the caller's
            object is never modified; a ``page`` entry is set on the copy for
            each request.
        max_pages: Upper bound on the number of pages requested.
        timeout: Seconds passed to ``requests.get`` as ``timeout`` so that a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        A list of dicts with the keys ``date_of_publication``, ``headline``
        and ``content``. A value that is absent from a result is stored as an
        empty string. If a request returns a non-200 status, the error is
        printed and the articles collected so far are returned.
    """
    all_articles = []
    query = dict(params)  # private copy: adding 'page' must not leak into the caller's dict
    for page in range(1, max_pages + 1):
        query['page'] = page
        response = requests.get(api_url, params=query, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch data: {response.status_code}, {response.text}")
            break

        payload = response.json().get('response', {})
        results = payload.get('results', [])
        if not results:  # Stop if no articles are found
            break

        for article in results:
            # 'fields' can be absent from a result; fall back to empty values
            # instead of raising KeyError and losing everything fetched so far.
            fields = article.get('fields') or {}
            all_articles.append({
                'date_of_publication': article.get('webPublicationDate', ''),
                'headline': fields.get('headline', ''),
                'content': fields.get('body', '')
            })
        print(f"Fetched {len(results)} articles from page {page}.")

        # When the response reports its total page count, stop after the last
        # page rather than requesting one that does not exist.
        total_pages = payload.get('pages')
        if isinstance(total_pages, int) and page >= total_pages:
            break
    return all_articles

In [14]:
# Download the editorials using the endpoint and query parameters defined above
articles = fetch_articles(api_url=BASE_URL, params=params)

# Turn the list of article records into a table
df = pd.DataFrame(data=articles)

Fetched 50 articles from page 1.
Fetched 50 articles from page 2.
Fetched 50 articles from page 3.
Fetched 50 articles from page 4.
Fetched 50 articles from page 5.
Fetched 50 articles from page 6.
Fetched 50 articles from page 7.
Fetched 50 articles from page 8.
Fetched 50 articles from page 9.
Fetched 50 articles from page 10.
Fetched 50 articles from page 11.
Fetched 50 articles from page 12.
Fetched 50 articles from page 13.
Fetched 50 articles from page 14.
Fetched 50 articles from page 15.
Fetched 50 articles from page 16.
Fetched 50 articles from page 17.
Fetched 50 articles from page 18.
Fetched 50 articles from page 19.
Fetched 50 articles from page 20.
Fetched 50 articles from page 21.
Fetched 50 articles from page 22.
Fetched 50 articles from page 23.
Fetched 50 articles from page 24.
Fetched 50 articles from page 25.
Fetched 50 articles from page 26.
Fetched 50 articles from page 27.
Fetched 50 articles from page 28.
Fetched 50 articles from page 29.
Fetched 50 articles fro

In [15]:
# And save the data to a CSV file format.
# The file is written into the data/ folder because the "Initial Checks"
# section reads it back from 'data/guardian_editorials_data.csv'.
raw_file_path = 'data/guardian_editorials_data.csv'
os.makedirs(os.path.dirname(raw_file_path), exist_ok=True)  # to_csv does not create missing folders
df.to_csv(raw_file_path, index=False)
print(f"Data saved to {raw_file_path}")

Data saved to guardian_editorials_data.csv


# Initial Checks on the Dataset

In [16]:
import pandas as pd

# Read the saved editorials dataset from disk
file_path = 'data/guardian_editorials_data.csv'
data = pd.read_csv(file_path)

# Preview the top of the table
print("First few rows of the dataset:", data.head(), sep="\n")

# Column overview; info() writes its report to stdout by itself
print("\nDataset Info:")
data.info()

# Descriptive statistics for every column, not only the numeric ones
print("\nSummary Statistics:", data.describe(include='all'), sep="\n")

First few rows of the dataset:
    date_of_publication                                           headline  \
0  2024-11-30T19:30:34Z  The Observer view: Shaky ceasefire is no victo...   
1  2024-11-30T19:00:33Z  The Observer view: Ignore the stigma and tackl...   
2  2024-11-30T18:00:33Z  Wicked would be fun and forgettable but for th...   
3  2024-11-30T17:00:31Z  Feeding off anger, fuelled by Russia… Enter Că...   
4  2024-11-30T16:00:32Z  What connects Huddersfield’s 1990s football st...   

                                             content  
0  <p>For the people of Lebanon, last week’s agre...  
1  <p>‘I wanted them all to notice.” This is the ...  
2  <p>The “war on woke” has a new target and her ...  
3  <p>Politics in Romania can be a bloody busines...  
4  <p>1994 was a vintage year for architecture. T...  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4503 entries, 0 to 4502
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 


In [17]:
# Count the empty cells in each column
missing_values = data.isna().sum()
print("\nMissing Values:", missing_values, sep="\n")

# Count the rows that are exact repeats of another row
duplicates = data.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")



Missing Values:
date_of_publication    0
headline               0
content                0
dtype: int64

Number of duplicate rows: 0


In [18]:
# Let's check current data types
print("\nColumn Data Types:")
print(data.dtypes)

# And, convert 'date_of_publication' to datetime if applicable
if 'date_of_publication' in data.columns:
    nulls_before = data['date_of_publication'].isna().sum()
    data['date_of_publication'] = pd.to_datetime(data['date_of_publication'], errors='coerce')

    # errors='coerce' replaces unparseable values with NaT without raising.
    # Report how many values were lost so the conversion cannot silently
    # introduce missing dates after the missing-value check has already run.
    unparseable = data['date_of_publication'].isna().sum() - nulls_before
    if unparseable:
        print(f"\nWarning: {unparseable} 'date_of_publication' value(s) could not be parsed and were set to NaT.")

    print("\nAfter converting 'date_of_publication' to datetime:")
    print(data.dtypes)



Column Data Types:
date_of_publication    object
headline               object
content                object
dtype: object

After converting 'date_of_publication' to datetime:
date_of_publication    datetime64[ns, UTC]
headline                            object
content                             object
dtype: object


In [19]:
# Compare the number of rows with the number of distinct headlines
print(f"\nTotal articles: {len(data)}")
print(f"Unique headlines: {data['headline'].nunique()}")

# Character count of each headline, stored as a new column, then summarised
data['headline_length'] = data['headline'].str.len()
print("\nHeadline Length Statistics:", data['headline_length'].describe(), sep="\n")

# Same measurement for the article body
data['content_length'] = data['content'].str.len()
print("\nContent Length Statistics:", data['content_length'].describe(), sep="\n")



Total articles: 4503
Unique headlines: 4503

Headline Length Statistics:
count    4503.000000
mean       85.762825
std        14.392470
min        17.000000
25%        77.000000
50%        86.000000
75%        96.000000
max       134.000000
Name: headline_length, dtype: float64

Content Length Statistics:
count     4503.000000
mean      7165.015989
std       2606.108669
min       1880.000000
25%       5454.000000
50%       6983.000000
75%       8473.000000
max      48873.000000
Name: content_length, dtype: float64


In [20]:
# Write the checked DataFrame to its own CSV file, without the index column
cleaned_file_path = 'data/clean_guardian_editorials_data.csv'
data.to_csv(path_or_buf=cleaned_file_path, index=False)
print(f"\nCleaned data saved to: {cleaned_file_path}")


Cleaned data saved to: data/clean_guardian_editorials_data.csv
