# Python for Data Science
## Session 5 
### Basic Libraries II

---

## Outline

1. Json, pickle and parquet formats

2. Re library

3. Time and Datetime libraries

### Exercise


Reusing the same annotations we worked with in the previous session, answer the following items using the libraries we saw today: 

1. How many annotations do you have per month and year? Which month has the most annotation files?
2. Create a dictionary where each **key** is a month, and the corresponding **value** is a list containing the names of all the annotations whose date falls in that month. 
    a. Save it following the json format, and load it again to check that everything is ok.
    b. Save it this time using Pickle.
    c. Instead of storing a list of all the annotation names happening that month, let's create for each annotation a dictionary with keys: name and date (using a datetime object).
3. Print all the annotations from the second half of 2024, from the oldest to the newest. 

1. How many annotations do you have per month and year? Which month has the most annotation files?

In [127]:
import re
import glob
import os
from datetime import datetime


# Regex for the annotation file names. Groups, in order: date (YYYYMMDD),
# time (HHMMSS), the digits after "SN", an underscore-separated run of
# digits, and a trailing free-form token before the ".txt" extension.
pattern = r'(\d{8})_(\d{6})_SN(\d+)_QUICKVIEW_VISUAL_([\d_]+)_([A-Za-z0-9\-\_.]+)\.txt'

# Get list of annotation files (path is relative to the working directory)
annotations = glob.glob('session_4/annotations/*.txt')

# Group the file names by "YYYY-MM"; files whose name does not follow the
# pattern are ignored.
annotations_by_month = {}

for annotation in annotations:
    filename = os.path.basename(annotation)
    match = re.match(pattern, filename)
    if not match:
        continue
    # Only the first two groups (date and time) are needed here.
    date_str, time_str = match.group(1, 2)
    datetime_obj = datetime.strptime(date_str + time_str, "%Y%m%d%H%M%S")
    year_month = datetime_obj.strftime('%Y-%m')
    annotations_by_month.setdefault(year_month, []).append(filename)

# Number of annotation files per "YYYY-MM" key
annotations_count_by_month = {month: len(files) for month, files in annotations_by_month.items()}

# max() raises ValueError on an empty mapping, which happens whenever the
# folder is missing/empty or no file name matches the pattern, so that case
# is reported explicitly instead of crashing.
if annotations_count_by_month:
    most_annotations_month = max(annotations_count_by_month, key=annotations_count_by_month.get)
    print(f"The month with the most annotations is {most_annotations_month} with {annotations_count_by_month[most_annotations_month]} annotations.")
else:
    most_annotations_month = None
    print("No annotation files matching the expected name pattern were found.")


The month with the most annotations is 2024-06 with 52 annotations.


2. Create a dictionary where each **key** is a month, and the corresponding **value** is a list containing the names of all the annotations whose date falls in that month.

    a. Save it following the json format, and load it again to check that everything is ok.

    b. Save it this time using Pickle.
    
    c. Instead of storing a list of all the annotation names happening that month, let's create for each annotation a dictionary with keys: name and date (using a datetime object).

In [128]:
import os
import glob
import pickle
import re
import json
from datetime import datetime

# Folder holding the annotation files. It is relative to the working
# directory (same location as used for exercise 1) so the notebook does not
# depend on one particular user's home directory.
annotations_folder = os.path.join('session_4', 'annotations')
annotations = glob.glob(os.path.join(annotations_folder, '*.txt'))

# Groups, in order: date (YYYYMMDD), time (HHMMSS), the digits after "SN",
# an underscore-separated run of digits, and a trailing free-form token.
pattern = r'(\d{8})_(\d{6})_SN(\d+)_QUICKVIEW_VISUAL_([\d_]+)_([A-Za-z0-9\-\_.]+)\.txt'

# Create a dictionary: key = month ("YYYY-MM"), value = list of annotation
# filenames. Files whose name does not follow the pattern are ignored.
annotations_by_month = {}
for annotation in annotations:
    filename = os.path.basename(annotation)
    match = re.match(pattern, filename)
    if not match:
        continue
    date_str, time_str = match.group(1, 2)
    datetime_obj = datetime.strptime(date_str + time_str, "%Y%m%d%H%M%S")
    year_month = datetime_obj.strftime('%Y-%m')
    annotations_by_month.setdefault(year_month, []).append(filename)

# a. Save as JSON
json_file_path = os.path.join(annotations_folder, "annotations_by_month.json")
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(annotations_by_month, json_file, indent=4)

# Reload and actually compare with what was written: a dict of str -> list of
# str survives a JSON round trip unchanged, so any difference is an error.
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    loaded_annotations_by_month = json.load(json_file)

if loaded_annotations_by_month != annotations_by_month:
    raise ValueError("The dictionary reloaded from JSON differs from the one that was saved.")


# b. Save the dictionary in Pickle format (binary mode is required)
pickle_file_path = os.path.join(annotations_folder, "annotations_by_month.pkl")
with open(pickle_file_path, 'wb') as pickle_file:
    pickle.dump(annotations_by_month, pickle_file)


# c. Modify the dictionary: list of dictionaries with 'name' and 'date' keys.
# Every stored filename matched the pattern above, so its first two
# underscore-separated fields are the date and the time.
annotations_details_by_month = {
    month: [
        {'name': file, 'date': datetime.strptime(''.join(file.split('_')[:2]), "%Y%m%d%H%M%S")}
        for file in files
    ]
    for month, files in annotations_by_month.items()
}

# Save the detailed dictionary in JSON. datetime objects are not JSON
# serializable, so each date is written as a formatted string.
detailed_json_file_path = os.path.join(annotations_folder, "annotations_details_by_month.json")
serializable_details = {
    month: [
        {'name': detail['name'], 'date': detail['date'].strftime('%Y-%m-%d %H:%M:%S')}
        for detail in details
    ]
    for month, details in annotations_details_by_month.items()
}
with open(detailed_json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(serializable_details, json_file, indent=4)


3. Print all the annotations from the second half of 2024, from the oldest to the newest. 

In [129]:
# Collect a {'name', 'date'} record for every annotation whose "YYYY-MM" key
# falls in July-December 2024.
annotations_second_half_2024 = []

for period, entries in annotations_details_by_month.items():
    year, month_num = (int(part) for part in period.split('-'))
    # Skip every month outside the second half of 2024.
    if year != 2024 or month_num < 7:
        continue
    annotations_second_half_2024.extend(
        {'name': entry['name'], 'date': entry['date']} for entry in entries
    )

if not annotations_second_half_2024:
    print("No annotations were found for the second half of 2024.")
else:
    # Order a copy by the datetime stored under 'date', oldest first.
    annotations_second_half_2024_sorted = list(annotations_second_half_2024)
    annotations_second_half_2024_sorted.sort(key=lambda entry: entry['date'])

    print("Annotations from the second half of 2024 (sorted chronologically):")
    for entry in annotations_second_half_2024_sorted:
        print(f"Name: {entry['name']}, Date: {entry['date'].strftime('%Y-%m-%d %H:%M:%S')}")


No annotations were found for the second half of 2024.
