In [24]:
import pandas as pd
from os import listdir
import json
import re



Error processing file json_responses/news_160161.json: Expecting value: line 1 column 1 (char 0)


In [125]:
import os
import logging
import requests

def search_and_save(
    url_template: str, search_term: str, output_dir: str
):
    # Replace the placeholder with the search term
    search_url = url_template.format(search_term=search_term)
    logging.info(f"Searching for '{search_term}' using {search_url}")
    
    # Create directory to save the search results pages
    os.makedirs(output_dir, exist_ok=True)
    
    # Extract the domain name to use as part of the filename
    domain = search_url.split('/')[2]
    
    # Fetch the search results page
    try:
        response = requests.get(search_url)
        
        # Raise error for bad responses (4xx or 5xx)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logging.info(f"Error fetching search results: {e}")
        return
    
    # Save the search results page to the output directory
    output_filepath = os.path.join(output_dir, f"{domain}_search_{search_term}.html")
    
    with open(output_filepath, 'wb') as outfile:
        outfile.write(response.content)

    info_msg=f"Fetched search results for '{search_term}' from {search_url} and saved as {output_filepath}"
    logging.info(info_msg)

    return output_filepath

def download_json_responses(
    input_file: str, pattern: str, output_dir: str
):
    # Check if the input file exists
    if not os.path.isfile(input_file):
        print(f"Input file '{input_file}' not found.")
        return
    
    output_file_prefix = 'news'

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Read input file and find all URLs matching the pattern
    with open(input_file, 'r') as file:
        links = re.findall(pattern, file.read())

    # Download each link using requests library
    for link in links:
        # Extract the news number from the link
        news_number = re.search(r'[0-9]+$', link).group()

        # Construct the output file path
        filename = f"{output_file_prefix}_{news_number}.json"
        output_file = os.path.join(output_dir, filename)

        # Perform GET request to download JSON data
        response = requests.get(link)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Save the JSON response to file
            with open(output_file, 'wb') as outfile:
                outfile.write(response.content)
            
            logging.info(f"Downloaded {link} to {output_file}")
        else:
            logging.error(f"Failed to download {link}. Status code: {response.status_code}")

    logging.info(f"Downloaded JSON responses from links in {input_file} to {output_dir}/{output_file_prefix}_*.json")


def extract_data(source_folder, search_phrases):
    # Assuming 'data' contains the JSON data you provided earlier
    filenames = listdir(source_folder)
    filenames = [f for f in filenames if f.endswith('.json')]

    # Initialize lists to store extracted data
    ids = []
    titles = []
    dates = []
    descriptions = []
    picture_filenames = []
    search_phrase_counts = []
    contains_money = []

    for filename in filenames:
        file_path=f'{source_folder}/{filename}'
        try:
            with open(file_path, 'r') as file:
                data = json.load(file)

                # Extract basic fields
                id_ = filename.split('.')[0].split('_')[-1]
                title = data['title']
                date = data['publication_date']
                description = data.get('description', '')
                picture_filename = ''
                for image_data in data.get('lead_asset', []):
                    if image_data['type'] == 'lead_image':    
                        picture_filename = image_data['value']['image']['file']
                        break 
                
                # Count search phrases in title and description
                title_search_count = sum(1 for phrase in search_phrases if phrase in title)
                description_search_count = sum(1 for phrase in search_phrases if phrase in description)

                # Check if title or description contains any money mention
                # Regular expression pattern to detect money formats
                money_pattern = r'\$[\d,]+(\.\d+)?|\d+\s(dollars|USD)'
                
                title_has_money = bool(re.search(money_pattern, title))
                description_has_money = bool(re.search(money_pattern, description))

                # Store extracted data in lists
                ids.append(id_)
                titles.append(title)
                dates.append(date)
                descriptions.append(description)
                picture_filenames.append(picture_filename)
                search_phrase_counts.append(title_search_count + description_search_count)
                contains_money.append(title_has_money or description_has_money)
        
        except Exception as e:
            print(f'Error processing file {file_path}: {e}')

    # Create a Pandas DataFrame
    df_data={
        'id': ids,
        'title': titles,
        'date': dates,
        'description': descriptions,
        'picture_filename': picture_filenames,
        'search_phrase_count': search_phrase_counts,
        'contains_money': contains_money
    }
    df = pd.DataFrame(df_data)

    return df

def setup_logging():
    log_format = '%(asctime)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)

# Function to generate mask for given number of months
def generate_month_mask(series_: pd.Series, num_months: int):
    import pytz
    from datetime import datetime

    num_months = max(0, num_months-1)
    
    # Ensure series_ is converted to datetime with UTC timezone
    if not pd.api.types.is_datetime64_any_dtype(series_):
        series_ = pd.to_datetime(series_, utc=True)
    elif series_.dtype != 'datetime64[ns, UTC]':
        series_ = series_.dt.tz_localize('UTC')
    
    # Get the current date and convert to UTC timezone-naive if needed
    current_date = datetime.now(pytz.utc)
    
    # Calculate the start date based on the number of months
    start_date = current_date - pd.DateOffset(months=num_months)
    
    # Create a mask to check if the date is within the specified range
    mask = (series_ >= start_date) & (series_ <= current_date)
    
    return mask



In [130]:
df[generate_month_mask(df['date'], 1)]


Unnamed: 0,id,title,date,description,picture_filename,search_phrase_count,contains_money
7,160144,"At a Queens school, migrant kids gain melodies...",2024-07-03 05:01:00-04:00,The Academy for New Americans creates a welcom...,https://images-prod.gothamist.com/original_ima...,0,False
9,160065,NYC’s graduating ‘COVID babies’ reflect as hig...,2024-06-26 05:01:00-04:00,Gothamist interviewed students who are speakin...,https://images-prod.gothamist.com/original_ima...,0,False
10,160130,Pamphlets and bullhorns: G train riders naviga...,2024-07-01 17:43:10.301503-04:00,The MTA says the closure is necessary to make ...,https://images-prod.gothamist.com/original_ima...,0,False
21,159967,Automated ticketing of drivers blocking MTA bu...,2024-06-17 14:05:00-04:00,Drivers parked or double parked at bus stops w...,https://images-prod.gothamist.com/original_ima...,0,True
28,160039,NYC officials plan rally to stop budget cuts t...,2024-06-22 21:56:00-04:00,As libraries stare down $58.3 million in cuts ...,https://images-prod.gothamist.com/original_ima...,0,True
31,160239,NYC nightlife legend DJ Rekha brings 'Basement...,2024-07-11 11:01:08.444117-04:00,"“Every time I try to get out, they pull me bac...",https://images-prod.gothamist.com/original_ima...,0,False
35,160138,Was that bang a gunshot or a firework? How New...,2024-07-03 06:00:00-04:00,While both acts are generally illegal in New Y...,https://images-prod.gothamist.com/original_ima...,0,False
56,160228,NYC's massive Link5G towers aren't actually pr...,2024-07-11 06:31:00-04:00,"Almost all of the towers stand empty, CityBrid...",https://images-prod.gothamist.com/original_ima...,0,False
69,160100,Brooklyn and Queens prepare yourselves: The G ...,2024-06-28 11:01:00-04:00,Not since the MTA announced the closure of L t...,https://images-prod.gothamist.com/original_ima...,0,False
75,160147,NJ lawmaker looks to ban algorithms blamed for...,2024-07-06 09:01:04.483498-04:00,Critics say using software like RealPage's ser...,https://images-prod.gothamist.com/original_ima...,0,False


In [88]:
df['date']=df['date'].apply(pd.to_datetime)



2
2024


AttributeError: 'RangeIndex' object has no attribute 'month'

In [131]:
import re
import shutil

# URL template with placeholders for the search term
url_template = "https://gothamist.com/search?q={search_term}"
search_term='technology'
months_horizon=3

setup_logging()

logging.info(f'Searching for {search_term} on {url_template} and saving the search results')
filepath = search_and_save(url_template, search_term, 'search_results')

if filepath:
    # Define the pattern to match URLs
    pattern = r'https://api-prod.gothamist.com/api/v2/pages/[0-9]{4,}'

    logging.info('Downloading JSON responses')
    download_json_responses(filepath, pattern, 'json_responses')

shutil.rmtree('search_results')

filenames = listdir('json_responses')

if not filenames:
    logging.info('No JSON responses to process. Exiting...')
    exit()
else: 
    # Extract data from JSON responses
    logging.info('Extracting data')
    df = extract_data('json_responses', [search_term])

    shutil.rmtree('json_responses')

    # Save the extracted data to a CSV file
    logging.info(f"Saving extracted data to 'output.csv'")
    output_folder='output'
    output_filename='output.csv'
    os.makedirs(output_folder, exist_ok=True)

    # Filter the data based on the number of months
    filter_mask = generate_month_mask(df['date'], months_horizon)
    df_filtered=df[filter_mask]

    df_filtered.to_csv(f'{output_folder}/{output_filename}', index=False)
    logging.info('Done!')


Error processing file json_responses/news_160239.json: Expecting value: line 5 column 1 (char 4)


ValueError: time data "2024-03-25T15:42:00-04:00" doesn't match format "%Y-%m-%dT%H:%M:%S.%f%z", at position 3. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.