# Read the batch_to_download JSON file

In [None]:
import json    

In [None]:
def read_batch_json(filepath):
    """Load the JSON document stored at *filepath* and return the parsed value.

    On success the parsed content is echoed to stdout and returned. If the
    file does not exist, does not contain valid JSON, or cannot be read, a
    message describing the problem is printed and ``None`` is returned.
    """
    try:
        with open(filepath, 'r') as handle:
            parsed = json.load(handle)
            # Echo the batch so the notebook shows what was loaded.
            print(parsed)
            return parsed
    except FileNotFoundError:
        problem = f"The file '{filepath}' was not found."
    except json.JSONDecodeError as err:
        problem = f"Error decoding JSON: {err}"
    except IOError as err:
        problem = f"An error occurred: {err}"

    # Only reached when one of the handlers above ran.
    print(problem)
    return None


In [None]:
json_file_path = 'batch_to_download.json'
# read_batch_json returns None when the file is missing or malformed (it has
# already printed the reason). Fall back to an empty batch so the later cells
# that iterate over pdf_batch_list do nothing instead of raising TypeError.
pdf_batch_list = read_batch_json(json_file_path) or []

# Notice: if we print out the pdf batch list, we either have an empty array or a sorted array with 1 element or more. 

In [None]:
import requests

# DOWNLOAD THE PDF FILES.

In [None]:
def download_pdf_files(pdf_names=None, timeout=30):
    """Download every PDF of the batch and save it in the working directory.

    Args:
        pdf_names: Iterable of PDF file names as listed in the batch. When
            omitted, the module-level ``pdf_batch_list`` is used.
        timeout: Seconds to wait on a single HTTP request before giving up.

    Each file is saved under its batch name with spaces replaced by
    underscores and commas removed. A non-200 response or a request error is
    reported on stdout and the loop moves on to the next file.
    """
    extended_url_prepend = 'https://www.police.ucsd.edu/docs/reports/callsandarrests/CallsForService/'

    if pdf_names is None:
        pdf_names = pdf_batch_list

    # The batch can be None when the JSON file failed to load; treat that as
    # "nothing to download" rather than crashing on iteration.
    for pdf in pdf_names or []:
        pdf_url = (extended_url_prepend + pdf).replace(' ', "%20")
        save_path = pdf.replace(' ', '_').replace(',', '')

        try:
            # Without a timeout a stalled server would block the run forever.
            response = requests.get(pdf_url, timeout=timeout)
            # Only a 200 response carries the PDF body.
            if response.status_code == 200:
                with open(save_path, 'wb') as pdf_file:
                    pdf_file.write(response.content)

                print(f"PDF downloaded and saved as '{save_path}'")
            else:
                print(f"Failed to download PDF. Status code: {response.status_code}")

        except requests.exceptions.RequestException as e:
            # Includes timeouts and connection failures.
            print(f"An error occurred: {e}")


In [None]:
# Fetch every PDF named in pdf_batch_list (performs network requests and
# writes the files into the current working directory).
download_pdf_files()

# ALL PDFS THAT ARE DOWNLOADED WILL GO TO THE BATCH_DOWNLOADS FOLDER

In [None]:
import os
import shutil

SOURCE_DIR = '/Users/axelsagundo/x_crimelog'
DEST_DIR = '/Users/axelsagundo/x_crimelog/batch_downloads'

# Make sure the destination is a real directory first: when it is missing,
# shutil.move would rename the first PDF to a *file* called batch_downloads.
os.makedirs(DEST_DIR, exist_ok=True)

# Move every PDF that sits directly in SOURCE_DIR into DEST_DIR.
for fname in os.listdir(SOURCE_DIR):
    if fname.lower().endswith('.pdf'):
        # Passing the full target path lets a re-downloaded PDF replace the
        # previous copy; moving into the bare directory raises shutil.Error
        # when a file with the same name is already there.
        shutil.move(os.path.join(SOURCE_DIR, fname), os.path.join(DEST_DIR, fname))



# NOW WE CAN START CONVERTING EVERY PDF TO TEXT AND TWEET IT OUT. 

In [None]:
import pdfplumber

# Function to generate the Output .txt files that contain the content of each pdf download

In [None]:
def generate_output_txt_files(sorted_saved_pdf_names):
    """Extract the text of each PDF into a ``.txt`` file with the same base name.

    For every entry of *sorted_saved_pdf_names* the PDF is opened with
    pdfplumber and its text is written line by line to ``<name>.txt``:

    * lines containing the department name or the bulletin title are dropped,
      together with the single line that follows them;
    * blank / whitespace-only lines are dropped;
    * after every line that contains ``Disposition`` a line holding only
      ``>`` is written, marking the end of a case.

    Args:
        sorted_saved_pdf_names: Iterable of paths to the saved PDF files.

    Errors while writing the text file are printed; a PDF that cannot be
    opened still raises, as before.
    """
    header_markers = ('UCSD POLICE DEPARTMENT', 'CRIME AND FIRE LOG/MEDIA BULLETIN')

    for pdf_filename in sorted_saved_pdf_names:
        outputtxt = "{n}.txt".format(n=pdf_filename.split('.pdf')[0])
        # Reset per file so a header on the last line of one PDF cannot
        # swallow the first line of the next one.
        skip_next = False

        # The context manager closes the PDF handle even if extraction fails;
        # the original never closed it.
        with pdfplumber.open(pdf_filename) as pdf:
            try:
                with open(outputtxt, 'w') as file:
                    for page in pdf.pages:
                        # Guard against a page with no extractable text
                        # (None or an empty string).
                        page_text = page.extract_text() or ""

                        for line in page_text.split('\n'):
                            if any(marker in line for marker in header_markers):
                                # Header line: drop it and the line after it.
                                skip_next = True
                                continue
                            if skip_next:
                                skip_next = False
                                continue
                            # split('\n') never yields "\n", so test for
                            # empty/whitespace-only lines instead.
                            if not line.strip():
                                continue

                            file.write(line)
                            file.write("\n")

                            if 'Disposition' in line:
                                # Case separator consumed by the parsers below.
                                file.write(">")
                                file.write("\n")

            except IOError as e:
                print(f"An error occurred: {e}")
    

In [None]:
# Local file names for the batch: spaces become underscores and commas are
# removed, the same transformation download_pdf_files applies when saving.
sorted_saved_pdf_names = [pdf.replace(' ', '_').replace(',', '') for pdf in pdf_batch_list]

# Bare expression: displays the list when run as a notebook cell.
sorted_saved_pdf_names

In [None]:
# Write one .txt file per saved PDF name.
# NOTE(review): an earlier cell moves *.pdf files from SOURCE_DIR into
# DEST_DIR; confirm these relative names still resolve from the working
# directory when this cell runs.
generate_output_txt_files(sorted_saved_pdf_names)

# We have successfully made text files for all of Aug 6 - 10


# Time to start tweeting the content inside these text files. 

In [None]:
import tweepy
import os
import traceback
import time

# Twitter credentials come from the environment; a variable that is not set
# yields None here rather than raising.
bearer_token = os.environ.get('BEARER_TOKEN')
consumer_key = os.environ.get('CONSUMER_KEY')
consumer_secret = os.environ.get('CONSUMER_SECRET')
api_key = os.environ.get('API_KEY')
api_key_secret = os.environ.get('API_KEY_SECRET')
access_token = os.environ.get('ACCESS_TOKEN')
access_token_secret = os.environ.get('ACCESS_TOKEN_SECRET')

# The API key pair fills the client's consumer key/secret slots; the
# consumer_key / consumer_secret variables above are read but not passed on.
client = tweepy.Client(
    bearer_token=bearer_token,
    consumer_key=api_key,
    consumer_secret=api_key_secret,
    access_token=access_token,
    access_token_secret=access_token_secret,
)


In [None]:
def get_array_cases(pdfpath):
    """Split the text file at *pdfpath* into one string per case.

    Non-blank lines are accumulated until a line containing ``>`` is met; the
    accumulated text (newlines included) then becomes one list entry and
    accumulation starts over. Blank lines are ignored, and any text after the
    final ``>`` line is not returned.
    """
    cases = []
    pending = []

    with open(pdfpath) as handle:
        for raw in handle:
            if not raw.strip():
                # Whitespace-only line: contributes nothing.
                continue
            if '>' in raw:
                # Separator line closes the current case.
                cases.append("".join(pending))
                pending = []
            else:
                pending.append(raw)

    return cases



In [None]:
def _trim_case_to_limit(case_lines, max_length):
    """Join *case_lines* into one string of at most *max_length* characters."""
    lines = list(case_lines)
    length = sum(len(text) for text in lines)

    # Keep the first two lines and drop lines from the third onwards until the
    # case fits. The length check on the list prevents the IndexError the
    # unguarded `del case[2]` raised for over-long cases of one or two lines.
    while length > max_length and len(lines) > 2:
        length -= len(lines[2])
        del lines[2]

    # Last resort when the first two lines alone are still too long.
    return "".join(lines)[:max_length]


def get_array_cases_2(pdfpath, max_length=280):
    """Read the text file at *pdfpath* and return one tweet-sized string per case.

    Lines are collected until a line containing ``>`` is met; the collected
    lines then form one case. A case longer than *max_length* characters is
    shortened by removing its third line repeatedly until it fits, and is cut
    off at *max_length* if that is still not enough.

    Args:
        pdfpath: Path of the text file to parse.
        max_length: Maximum number of characters per returned string.

    Returns:
        List of case strings in file order. If the file cannot be read the
        error is printed and whatever was collected so far (possibly an empty
        list) is returned. Lines after the last ``>`` are not returned.
    """
    tweets = []
    case_lines = []

    try:
        with open(pdfpath, 'r') as file:
            for line in file:
                if '>' not in line:
                    case_lines.append(line)
                    continue

                # Separator reached: emit the finished case and start afresh.
                tweets.append(_trim_case_to_limit(case_lines, max_length))
                case_lines = []

    except IOError as e:
        print(f"An error occurred: {e}")

    return tweets



# def get_array_cases_2(pdfpath):
#     case_array = []
#     case = []
#     case_array_line = ""


#     try:
#         with open(pdfpath, 'r') as file:
#             for line in file:
#                 if line.split(): #line is not empty
#                     if '>' in line:
#                         if len(case_array_line) > 280:
#                             length = len(case_array_line)

#                             while length > 280:
#                                 length = length - len(case[1])
#                                 case[1] = ''

#                             print(case_array_line)
#                             print(len(case_array_line))
#                             print("------------------")

#                         case_array.append(case)
#                         case = []
#                         case_array_line = ""
#                     else:
#                         case_array_line += line
#                         case.append(line)
#     except IOError as e:
#         print(f"An error occurred: {e}")



#     return case_array

In [None]:
# Parse one generated text file into a list of per-case strings and print it
# for inspection.
pdfpaf='./August_10_2023.txt'
cases = get_array_cases_2(pdfpaf)
print(cases)

In [None]:
def filtr(msgs):
    """Return the entries of *msgs* that contain neither 'Incomplete' nor 'False Alarm'.

    The input is left untouched; a new list in the original order is returned.
    """
    excluded_terms = ('Incomplete', 'False Alarm')
    return [
        entry
        for entry in msgs
        if not any(term in entry for term in excluded_terms)
    ]

In [None]:
# Remove the cases that mention 'Incomplete' or 'False Alarm', then print
# what is left for inspection.
filtered_cases = filtr(cases)
print(filtered_cases)

In [None]:
def tweetallcases(array, delay=50):
    """Post every message in *array* as a tweet, pausing after each success.

    Args:
        array: Iterable of message strings to post through the module-level
            ``client``.
        delay: Seconds to sleep after each successfully posted tweet.

    A message that fails to post is reported (traceback plus the message
    text) and skipped without sleeping; the remaining messages are still
    attempted.
    """
    for msg in array:
        try:
            client.create_tweet(text=msg)
        except Exception:
            # Deliberately broad: one rejected tweet must not abort the batch.
            traceback.print_exc()
            print("This message gave an error: ", msg)
            continue

        # Space the posts out before sending the next one.
        time.sleep(delay)

In [None]:
# Post every case that survived the filter (calls the Twitter API and sleeps
# between posts).
tweetallcases(filtered_cases)

In [None]:
# Bare expression: shows how many cases passed the filter when run as a
# notebook cell.
len(filtered_cases)