In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json

In [None]:
# Police logs page; its <option> tags are scraped for the log PDF paths.
policelogs_url = (
    "https://www.police.ucsd.edu/docs/reports/callsandarrests/"
    "Calls_and_Arrests.asp"
)
# URL prefixes under the same reports directory.
# NOTE(review): neither prefix is used in the visible cells — presumably they
# are joined with PDF names by a later download step; confirm.
pdf_url_prepend = "https://www.police.ucsd.edu/docs/reports/callsandarrests/"
extended_url_prepend = (
    "https://www.police.ucsd.edu/docs/reports/callsandarrests/"
    "CallsForService/"
)

In [None]:
# Reads the first line of the file at `filepath`.
def get_last_date_tweeted(filepath):
    """Return the first line of ``filepath`` without its line terminator.

    Args:
        filepath: Path of the text file whose first line is wanted.

    Returns:
        The first line with any trailing newline / carriage return removed
        (an empty string for an empty file), or None when the file does not
        exist or cannot be read. In the failure cases a message is printed
        instead of raising.
    """
    try:
        with open(filepath, 'r') as file:
            # readline() keeps the line terminator; drop it so the caller
            # gets just the stored text even if the file ends in a newline.
            return file.readline().rstrip('\r\n')
    except FileNotFoundError:
        print(f"The file '{filepath}' was not found.")
    except IOError as e:
        print(f"An error occurred: {e}")
    # Explicit so callers can see that failure yields None.
    return None

# Overwrites the file at `filepath` with a single piece of text.
def write_last_date_tweeted(filepath, line_to_write):
    """Replace the contents of ``filepath`` with ``line_to_write``.

    The file is opened in 'w' mode, so any previous content is discarded.
    A confirmation is printed after the write; an IOError while opening or
    writing is reported with print() rather than raised.

    Args:
        filepath: Path of the file to (over)write.
        line_to_write: Text to store; written as-is, no newline is appended.
    """
    try:
        with open(filepath, 'w') as handle:
            handle.write(line_to_write)
            # Confirm while the file is still open, right after the write.
            print(f"Line written to '{filepath}': {line_to_write}")
    except IOError as err:
        print(f"An error occurred: {err}")

def get_all_pdf_paths():
    """Scrape the police logs page and return the PDF names it lists.

    Fetches ``policelogs_url``, collects every ``<option>`` tag that has a
    non-empty ``value`` attribute, and keeps the second '/'-separated
    component of each value.

    Returns:
        A list of strings, one per usable ``<option>`` value. Options with
        no ``value`` attribute, an empty value, or a value without a '/'
        are skipped.

    Raises:
        requests.RequestException: if the request times out, fails, or the
            server answers with an HTTP error status.
    """
    # Bound the request so a stalled server cannot hang the notebook, and
    # fail loudly on an HTTP error instead of scraping an error page.
    response = requests.get(policelogs_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    pdf_names = []
    for option in soup.find_all('option'):
        # Tag.get() returns None for a missing attribute, whereas
        # option['value'] would raise KeyError on a value-less <option>.
        value = option.get('value')
        if not value:
            continue
        parts = value.split('/')
        # Only values that contain a '/' have a second path component.
        if len(parts) > 1:
            pdf_names.append(parts[1])
    return pdf_names

def get_date_object(unformatted_date):
    """Parse the date at the start of a PDF name into a datetime.

    Only the first three whitespace-separated tokens are used (month name,
    day with trailing comma, year); anything after the year, such as
    ' UPDATED' or a '.pdf' extension, is ignored.

    Examples of accepted input:
        'September 27, 2023 UPDATED.pdf'
        'October 4, 2023.pdf'

    Args:
        unformatted_date: String beginning with '<Month> <day>, <year>'.

    Returns:
        A datetime for that calendar day (time components are zero).

    Raises:
        IndexError: if the string has fewer than three tokens.
        ValueError: if the tokens do not match '%B %d, %Y'.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so doubled spaces or a trailing newline
    # (e.g. a line read straight from a file) do not break parsing.
    tokens = unformatted_date.split()
    month_part, day_part = tokens[0], tokens[1]
    # Drop a file extension glued to the year, e.g. '2023.pdf' -> '2023'.
    year_part = tokens[2].split('.')[0]
    return datetime.strptime(f"{month_part} {day_part} {year_part}", '%B %d, %Y')

def get_download_batch_names(all_dates, last_date):
    """Return the PDF names dated strictly after ``last_date``.

    Args:
        all_dates: Iterable of PDF names parseable by get_date_object.
        last_date: Name/date string of the last item already tweeted, also
            parseable by get_date_object.

    Returns:
        A list of the entries of ``all_dates`` whose date is later than
        ``last_date``, in their original order.

    Note:
        ``last_date`` is parsed once up front, so an unparseable
        ``last_date`` raises even when ``all_dates`` is empty.
    """
    # Loop-invariant: parse the cutoff once instead of once per candidate.
    cutoff = get_date_object(last_date)
    return [date_str for date_str in all_dates if get_date_object(date_str) > cutoff]

def get_download_batch_names_222(all_dates, last_date, end_date="September 1, 2023.pdf"):
    """Return the PDF names dated strictly between two dates.

    Debug/backfill variant of get_download_batch_names that also applies an
    upper bound. It filters only; the result keeps the order of
    ``all_dates`` and is not sorted.

    Args:
        all_dates: Iterable of PDF names parseable by get_date_object.
        last_date: Exclusive lower bound, parseable by get_date_object.
        end_date: Exclusive upper bound, parseable by get_date_object.
            Defaults to the previously hard-coded "September 1, 2023.pdf".

    Returns:
        A list of the entries of ``all_dates`` whose date is later than
        ``last_date`` and earlier than ``end_date``.

    Note:
        Both bounds are parsed once up front, so an unparseable bound
        raises even when ``all_dates`` is empty.
    """
    # Parse the bounds once; they do not change across the loop.
    lower_bound = get_date_object(last_date)
    upper_bound = get_date_object(end_date)

    ret_ar = []
    for date_str in all_dates:
        # Parse each candidate a single time and test both bounds together.
        if lower_bound < get_date_object(date_str) < upper_bound:
            ret_ar.append(date_str)

    return ret_ar




# GET AND PRINT THE LAST DATE TWEETED

In [None]:
# A file holds a single date: the last date for which the bot tweeted the
# crime logs. We need to find all the dates following this date.

# Get the LAST DATE TWEETED
last_date_filepath = "last_date_tweeted.txt"
last_date = get_last_date_tweeted(filepath=last_date_filepath)

# Echo the value read so it can be checked by eye (None means the read failed).
print(last_date)


# GOT ALL THE PDF LINK PATHS/DATES

In [None]:
# Get the list of ALL pdfs/dates currently on the website.
# This performs an HTTP request to the police logs page; each entry is the
# second '/'-separated component of an <option> value found there.
link_dates = get_all_pdf_paths()

In [None]:
# Show every scraped PDF name so the scrape result can be checked by eye.
print(link_dates)

# GOT THE PDF BATCH WE NEED TO TWEET BASED ON THE LAST TIME WE TWEETED

In [None]:
# Select every PDF dated after the last date tweeted. This uses the
# unrestricted selector; the debug variant get_download_batch_names_222
# additionally drops everything on or after its end date.
batch_of_pdfs_needed_to_be_tweeted = get_download_batch_names(link_dates, last_date)
print(batch_of_pdfs_needed_to_be_tweeted)

# ORDER THE BATCH BEFORE WE EXPORT IT

In [None]:
# Order the pending PDFs chronologically (oldest first) using the date
# parsed from each name. Work on a copy so the unsorted batch is untouched.
sorted_batch = list(batch_of_pdfs_needed_to_be_tweeted)
sorted_batch.sort(key=get_date_object)

# Echo the ordered names, one per line.
for item in sorted_batch:
    print(item)

In [None]:

# Optional debugging cap on the batch size. Leave as None for a real run so
# the whole pending batch is exported; set to a small int (e.g. 5) only
# while testing.
debug_batch_limit = None

if debug_batch_limit is not None:
    sorted_batch = sorted_batch[:debug_batch_limit]

In [None]:
# Bare expression: the notebook renders the batch about to be exported.
sorted_batch

# WRITE-TO-JSON-FILE FUNCTION

In [None]:
def export_batch_to_json_file(filepath, batch):
    """Serialize ``batch`` as JSON into ``filepath``, replacing any old file.

    A confirmation is printed once the file has been written and closed.
    An IOError while opening or writing is reported with print() rather
    than raised.

    Args:
        filepath: Destination path; opened in 'w' mode.
        batch: JSON-serializable object (the notebook passes a list of
            PDF names).
    """
    try:
        with open(filepath, 'w') as json_out:
            json.dump(batch, json_out)

        # Reached only after the file was closed without an error.
        print(f"Data written to '{filepath}' successfully.")

    except IOError as err:
        print(f"An error occurred: {err}")


# MOMENT WHERE WE EXPORT THIS BATCH. 

In [None]:
# Destination file for the ordered batch of PDF names.
json_file_path = "batch_to_download.json"

# Write the batch out as a JSON list.
export_batch_to_json_file(filepath=json_file_path, batch=sorted_batch)