## Import Standard Packages

In [100]:
import os, base64, pickle, re
import pandas as pd
import numpy as np

## Import Google Packages

In [2]:
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

## ENV Variables

In [21]:
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
CLIENT_SECRET_PATH = os.path.expanduser("XXX.json")  # Update path if needed
TOKEN_PATH = os.path.expanduser("~/Desktop/Auth/token.json")

## Parsing/AUTH Functions

In [23]:
def auth_gmail():
    creds = None
    if os.path.exists(TOKEN_PATH):
        creds = Credentials.from_authorized_user_file(TOKEN_PATH,SCOPES)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            print("## REFRESH EXPIRED TOKEN ##")
            creds.refresh(Request())
        else:
            print("## OAUTH FLOW AUTH ##")
            flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_PATH,SCOPES)
            creds = flow.run_local_server(port=0)

        with open(TOKEN_PATH,'w') as token:
            token.write(creds.to_json())

    print(f"## TOKEN SAVED TO {TOKEN_PATH} ##")
    return creds

In [52]:
def search_emails(service,phrase,sender,max_results=10000):
    # Including archived emails, there are ~9000 emails in my account
    query = f'from:{sender}'
    results = service.users().messages().list(userId='me',q=query,maxResults=max_results).execute()
    messages = results.get('messages',[])
    matching_list = []
    for msg in messages:
        msg_id = msg['id']
        message = service.users().messages().get(userId='me',id=msg_id).execute()
        headers = message['payload']['headers']
        header_dict = {h['name']:h['value'] for h in headers}

        body = "NONE"
        if "parts" in message['payload']:
            for part in message['payload']['parts']:
                if part['mimeType'] == "text/plain":
                    body = base64.urlsafe_b64decode(part['body']['data']).decode()
        
        if any(phrase.lower() in value.lower() for value in header_dict.values()):
            matching_list.append({
                "From": header_dict.get("From","Unknown"),
                "Subject": header_dict.get("Subject","No Subject"),
                "Message ID": msg_id,
                "Body":body
            })
    print(f'{len(matching_list)} SUBJECTS FOUND MATCHING \"{phrase}\"')
    return matching_list

In [108]:
def parse_emails(email_body):
    parsed_data = {
        "Crime Type":"N/A",
        "Date":"N/A",
        "Time":"N/A",
        "Location":"N/A",
        "Incident Description":"N/A",
        "Vehicle Description":"N/A",
        "Age":"N/A",
        "Eye Color":"N/A",
        "Hair Color":"N/A",
        "Height":"N/A",
        "Race":"N/A",
        "Sex":"N/A",
        "Weight":"N/A",
        "Additional Description":"N/A",
        "Weapon":"N/A"
    }

    crime_type_match = re.search(r"Reported Offense:\s*([\w\s-]+?)(?:Date & Time|$)",email_body)
    if crime_type_match:
        parsed_data["Crime Type"] = crime_type_match.group(1).strip()
    
    date_time_match = re.search(r"Date & Time of Occurrence:\s*(\d{2}/\d{2}/\d{4})\s*(\d{1,2}:\d{2}\s*[apAP]\.?m\.?)?",email_body)
    if date_time_match:
        parsed_data["Date"] = date_time_match.group(1)
        parsed_data["Time"] = date_time_match.group(2) if date_time_match.group(2) else "N/A"

    location_match = re.search(r"Location:\s*(.+)",email_body)
    if location_match:
        location_text = location_match.group(1).strip()
        if "Report Number" in location_text:
            location_text = location_text.split("Report Number")[0].strip()
        parsed_data["Location"] = location_text
    
    incident_desc_match = re.search(r"Incident Description:\s*(.+)",email_body)
    if incident_desc_match:
        parsed_data["Incident Description"] = incident_desc_match.group(1).strip()

    vehicle_desc_match = re.search(r"Vehicle Description:\s*(.+)",email_body)
    if vehicle_desc_match:
        parsed_data["Vehicle Description"] = vehicle_desc_match.group(1).strip()

    details = {
        "Age":r"Age:\s*(\d{1,2})",
        "Eye Color":r"Eye Color:\s*(.+)",
        "Hair Color":r"Hair Color:\s*(.+)",
        "Height":r"Height:\s*(\d{1,2}\s*feet\s*\d{1,2}\s*inches)",
        "Race":r"Race:\s*(.+)",
        "Sex":r"Sex:\s*(Male|Female|Other)",
        "Weight":r"Weight:\s*(.+)",
        "Additional Description":r"Additional Description:\s*(.+)",
        "Weapon":r"Weapon:\s*(.+)"
    }

    for key, pattern in details.items():
        match = re.search(pattern,email_body)
        if match:
            parsed_data[key] = match.group(1).strip()

    parsed_data = {key: ("N/A" if value in ["None","Unknown"] else value) for key, value in parsed_data.items()}
    
    return parsed_data 

## Workflow

In [45]:
creds = auth_gmail()

## TOKEN SAVED TO /home/aly/Desktop/Auth/token.json ##


In [46]:
service = build('gmail','v1',credentials=creds)

In [53]:
emails = search_emails(service,"Crime Alert","uscpublicsafety@msg.adm.usc.edu")

155 SUBJECTS FOUND MATCHING "Crime Alert"


In [109]:
parsed_emails = []
for email in emails:
    temp = parse_emails(email['Body'])
    temp['From'] = email['From']
    temp['Subject'] = email['Subject']
    temp['Message ID'] = email['Message ID']
    parsed_emails.append(temp)

parsed_data = pd.DataFrame(parsed_emails)

In [110]:
parsed_data

Unnamed: 0,Crime Type,Date,Time,Location,Incident Description,Vehicle Description,Age,Eye Color,Hair Color,Height,Race,Sex,Weight,Additional Description,Weapon,From,Subject,Message ID
0,Aggravated Assault,03/02/2025,11:05 a.m.,Northeast corner of Jefferson Boulevard & McCl...,The suspect brandished a knife at the victim. ...,Teal bicycle,39,,Gray,5 feet 6 inches,,Male,Slim build,"Wearing a plaid shirt, black pants, black shoe...",Knife,USC Department of Public Safety <uscpublicsafe...,Timely Warning Crime Alert - Aggravated Assault,19558e0be8cd7c02
1,,02/27/2025,4:56 p.m.,At the intersection of 28th Street & Hoover St...,The suspect ran up behind the victim and grope...,,39,Brown,Short brown hair,5 feet 5 inches,Hispanic,Male,Medium build,Wearing black t-shirt and dark sweatpants or j...,,USC Department of Public Safety <uscpublicsafe...,Crime Alert - Sexual Battery/Fondling,1954a92157ad3085
2,Robbery,02/25/2025,8:49 a.m.,At the intersection of 30th Street & Hoover St...,The suspect snatched the victim's phone from t...,,25,,Dark short hair,5 feet 7 inches,Hispanic,Male,Slim build,Wearing a blue shirt and blue jeans.,,USC Department of Public Safety <uscpublicsafe...,Timely Warning Crime Alert - Robbery,1953e62c1b20aab1
3,Burglary,02/20/2025,3:55 a.m.,UPC Troy Hall at 3025 Royal Street,The Department of Public Safety received a del...,,25,Brown,Dark brown,,Hispanic,Male,Slim build,"Wearing beige cargo pants, white shoes, and a ...",,USC Department of Public Safety <uscpublicsafe...,Timely Warning Crime Alert - Burglary,19524e109810f11f
4,,02/19/2025,5:30 a.m.,Off UPC campus in the 1200 block of 30th Street,"While the victim was asleep, the suspect enter...",,,,,,Hispanic,Male,Medium build,"Wearing a black beanie, black sweatshirt, gray...",,USC Department of Public Safety <uscpublicsafe...,Crime Alert - Burglary & Motor Vehicle Theft,1951fc48caee7216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,Burglary,08/28/2022,,2600 block of Portland Street,Two suspects gained access to an apartment com...,,25,,Black,,Hispanic,Male,,Black hooded sweatshirt with white letters. On...,No weapon,USC Department of Public Safety <uscpublicsafe...,Timely Warning Crime Alert - Burglary,182e5aa22b77b0ac
151,Burglary,08/26/2022,10:30 pm,"2617 Menlo Ave. Los Angeles CA, 90007",The suspect(s) entered the rear of the residen...,,,,,,,,,,,USC Department of Public Safety <uscpublicsafe...,Crime Alert - Burglary,182e1ccbacf6c9c5
152,Robbery,08/22/2022,4:00 p.m.,Adjacent to campus at the southeast corner of ...,"The suspect, riding a skateboard, rode past a ...",,20,,unknown,,White,Male,,"Wearing a white T-shirt, jeans, black Converse...",,USC Department of Public Safety <uscpublicsafe...,Timely Warning Crime Alert - Robbery,182c8202fd7a6051
153,,08/21/2022,,2300 block of Portland Street,"A student, who was asleep in bed, was awakened...",,25,unknown,unknown,,Hispanic,Male,unknown,"Black long sleeve shirt, dark pants",,USC Department of Public Safety <uscpublicsafe...,Crime Alert - Sexual Battery/Fondling,182c17427dd24629
