# Python for Data Science
## Session 5 
### Basic Libraries II

---

### ***Note Before Proceeding***
1. Download the "annotations" folder in the GitHub Repository
2. Make sure to update the "annotations" variable to the specific path where you downloaded the "annotations" folder
3. Make sure you are running Python 3.10 or later; the `match` statement used in this notebook requires it
4. The dataset contains annotation files only from the first half of the year. 

---

### **Question 1. How many annotations do you have per month and year? Which month has the most annotation files?**

In [51]:
""" This code will count how many annotations are in the folder 
per month and year and it also looks for which month has the most files"""

# Disclaimer: this cell uses the match statement, which requires Python 3.10 or later

import re
import glob
import os
from datetime import datetime
from collections import Counter

# Expected file name layout: 8 date digits, 6 time digits, then
# SN<digits>_QUICKVIEW_VISUAL_<...>.txt. The first two capture groups are the
# date digits and the time digits.
pattern = r'(\d{8})_(\d{6})_SN(\d+)_QUICKVIEW_VISUAL_([\d_]+)_([A-Za-z0-9\-_.]+)\.txt'

# Edit this path depending on where you saved the annotations folder
annotations = glob.glob('/Users/biancabaldonado/Desktop/ESADE/session_4/annotations/*.txt')

# Collect the (year, month) of every file whose name matches the pattern;
# files whose name does not match are skipped.
ann_datetime = []
for annotation in annotations:
    parsed = re.match(pattern, os.path.basename(annotation))
    if parsed is None:
        continue
    date_digits, time_digits = parsed.group(1, 2)
    timestamp = datetime.strptime(date_digits + time_digits, "%Y%m%d%H%M%S")
    ann_datetime.append((timestamp.year, timestamp.month))

# Split the pairs into separate year, month and (year, month) lists
total_years = [year for year, _ in ann_datetime]
total_months = [month for _, month in ann_datetime]
total_year_month = list(ann_datetime)

# Count files per year, month, and year-month pair
years_count = Counter(total_years)
months_count = Counter(total_months)
yearmonth_count = Counter(total_year_month)

# Define a function to convert the month numbers to the month name
def convert_month(month):
    """Return the English name of a month number.

    Args:
        month: Month number; 1 maps to "January" and 12 to "December".

    Returns:
        The month name, or the string "Error" when ``month`` is not equal
        to any number from 1 to 12.
    """
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    # Compare against 1..12 in order and stop at the first equal number
    for number, name in enumerate(month_names, start=1):
        if month == number:
            return name
    return "Error"

# Sort year-month pairs chronologically. The Counter keys are (year, month)
# tuples, so sorting on the key orders by year first and by month within a year.
sorted_year_months = sorted(yearmonth_count.items(), key=lambda item: item[0])

print("\nGiven that there is only 1 year (2024), we sort by Year and Month as follows:")
print("\n----Number of Files per Year and Month----")
for (year, month), count in sorted_year_months:
    print(f"Year & Month: ({year}, {month}), Count: {count}")


print("\n----Month with the Most Annotations----")
if yearmonth_count:
    # Finding the (year, month) pair with the most annotations
    most_annotations_month = max(yearmonth_count, key=yearmonth_count.get)
    most_annotations_count = yearmonth_count[most_annotations_month]

    # Converting the month number to the month name
    year, month = most_annotations_month
    month_name = convert_month(month)

    print(f'{month_name} is the month with the most annotations, as it has {most_annotations_count} files ')
else:
    # max() raises ValueError on an empty Counter, which is what we get when
    # no file name matched (for example when the glob path is wrong).
    most_annotations_month = None
    most_annotations_count = 0
    print("No annotation files were found - check the path used to build the 'annotations' list.")


Given that there is only 1 year (2024), we sort by Year and Month as follows:

----Number of Files per Year and Month----
Year & Month: (2024, 1), Count: 27
Year & Month: (2024, 2), Count: 45
Year & Month: (2024, 3), Count: 17
Year & Month: (2024, 4), Count: 25
Year & Month: (2024, 5), Count: 28
Year & Month: (2024, 6), Count: 52

----Month with the Most Annotations----
June is the month with the most annotations, as it has 52 files 


### **Question 2. Create a dictionary where each key is a month, and the corresponding value is a list containing all the annotation names whose date corresponds to that month.**

#### *a. Save it following the json format, and load it again to check that everything is ok.*

In [57]:
""" 
This code will create a dictionary that contains every year month as the key ('YYYY-MM') and the value pertains 
to a list of the annotations that correspond to that given year and month.
"""

import json
from datetime import datetime
from collections import defaultdict

# Dictionary keyed by 'YYYY-MM'; each value is the list of annotation file
# names dated in that month
annotations_per_month = {}

for annotation in annotations:
    filename = os.path.basename(annotation)
    parsed = re.match(pattern, filename)
    if parsed is None:
        # File names that do not follow the pattern are ignored
        continue

    # The first two groups hold the date and time digits of the file name
    timestamp = datetime.strptime(parsed.group(1) + parsed.group(2), "%Y%m%d%H%M%S")
    year_month = timestamp.strftime('%Y-%m')

    # setdefault creates the month's list the first time that month is seen
    annotations_per_month.setdefault(year_month, []).append(filename)


# Write the dictionary to a JSON file
with open('month_annotations.json', 'w') as out_file:
    json.dump(annotations_per_month, out_file, default=str)

# Read the JSON file back to check what was stored
with open('month_annotations.json', 'r') as in_file:
    json_annotations_per_month = json.load(in_file)

print("Annotations Per Month from JSON:")
print(json.dumps(json_annotations_per_month))

Annotations Per Month from JSON:
{"2024-01": ["20240102_185527_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_740_3850.txt", "20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3770.txt", "20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4164.txt", "20240102_185954_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_414_3786.txt", "20240104_220339_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_556_4178.txt", "20240115_213834_SN28_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_376_3722.txt", "20240126_173752_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_386_3722.txt", "20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3772.txt", "20240130_173903_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_366_3756.txt", "20240127_190620_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_500_3600.txt", "20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4162.txt", "20240127_190620_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_500_3602.txt", "20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_554_416

#### *b. Save it this time using Pickle.*

In [56]:
"""
This code creates a dictionary where each key represents a year month pair ('YYYY-MM'), 
and the value is a list of annotation filenames that correspond to that specific year month pair.
The dictionary is saved using a pickle file.
"""

import pickle

print("\nAnnotations Per Month from Pickle:")

# Saving the dictionary to a pickle file. A separate .pkl file is used so the
# binary pickle data does not overwrite month_annotations.json, which holds
# the JSON version of the dictionary.
with open('month_annotations.pkl', 'wb') as f:
    pickle.dump(json_annotations_per_month, f)

# Loading the pickle file to check
# NOTE: unpickling can execute arbitrary code, so only load pickle files you
# created yourself (as is the case here).
with open('month_annotations.pkl', 'rb') as f:
    pickle_annotations_per_month = pickle.load(f)

print(pickle_annotations_per_month)


Annotations Per Month from Pickle:
{'2024-01': [{'Name': '20240102_185527_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_740_3850.txt', 'Date': '2024-01-02 18:55:27'}, {'Name': '20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3770.txt', 'Date': '2024-01-01 17:43:01'}, {'Name': '20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4164.txt', 'Date': '2024-01-01 19:28:56'}, {'Name': '20240102_185954_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_414_3786.txt', 'Date': '2024-01-02 18:59:54'}, {'Name': '20240104_220339_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_556_4178.txt', 'Date': '2024-01-04 22:03:39'}, {'Name': '20240115_213834_SN28_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_376_3722.txt', 'Date': '2024-01-15 21:38:34'}, {'Name': '20240126_173752_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_386_3722.txt', 'Date': '2024-01-26 17:37:52'}, {'Name': '20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3772.txt', 'Date': '2024-01-01 17:43:01'}, {'Name': '20240130_173903_SN33_

#### *c. Instead of storing a list of all the annotation names happening that month, let's create for each annotation a dictionary with keys: name and date (using a datetime object). How to solve: {January: [file names],...} {Name: [], Date:[]}*

In [58]:
"""
This code creates a dictionary where each key is formatted as 'YYYY-MM',
and the value is a list of annotation file names further broken down into its Name and Date.
The dictionary is saved and loaded using a JSON file.
"""

# Dictionary keyed by 'YYYY-MM'; each value is a list of
# {'Name': file name, 'Date': datetime} records
annotations_per_month = {}

for annotation in annotations:
    filename = os.path.basename(annotation)
    parsed = re.match(pattern, filename)
    if parsed is None:
        # File names that do not follow the pattern are ignored
        continue

    # The first two groups hold the date and time digits of the file name
    timestamp = datetime.strptime(parsed.group(1) + parsed.group(2), "%Y%m%d%H%M%S")
    year_month = timestamp.strftime('%Y-%m')

    # One record per annotation, filed under its month
    record = {'Name': filename, 'Date': timestamp}
    annotations_per_month.setdefault(year_month, []).append(record)

# Write the dictionary to a JSON file; default=str converts each datetime to
# its str() form because JSON has no datetime type
with open('month_annotations.json', 'w') as out_file:
    json.dump(annotations_per_month, out_file, default=str)

# Read the JSON file back to check what was stored
with open('month_annotations.json', 'r') as in_file:
    json_annotations_per_month = json.load(in_file)
print("Annotations Per Month (Classified by Name and Date):")
print(json.dumps(json_annotations_per_month))


Annotations Per Month (Classified by Name and Date):
{"2024-01": [{"Name": "20240102_185527_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_740_3850.txt", "Date": "2024-01-02 18:55:27"}, {"Name": "20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3770.txt", "Date": "2024-01-01 17:43:01"}, {"Name": "20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4164.txt", "Date": "2024-01-01 19:28:56"}, {"Name": "20240102_185954_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_414_3786.txt", "Date": "2024-01-02 18:59:54"}, {"Name": "20240104_220339_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_556_4178.txt", "Date": "2024-01-04 22:03:39"}, {"Name": "20240115_213834_SN28_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_376_3722.txt", "Date": "2024-01-15 21:38:34"}, {"Name": "20240126_173752_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_386_3722.txt", "Date": "2024-01-26 17:37:52"}, {"Name": "20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3772.txt", "Date": "2024-01-01 17:43:01"}, {"Name": "2024

### **Question 3. Print all the annotations from the oldest to the newest during the second half of 2024.**

In [59]:
"""
This code processes annotation files, extracts date and time from filenames, 
filters for annotations for the second half of the year, which corresponds to July to December 2024, 
and sorts them in ascending chronological order. 
"""

# Pair every matching annotation file name with the datetime parsed from it
all_annotations = []

for annotation in annotations:
    filename = os.path.basename(annotation)
    parsed = re.match(pattern, filename)
    if parsed is None:
        # File names that do not follow the pattern are ignored
        continue

    # The first two groups hold the date and time digits of the file name
    timestamp = datetime.strptime(parsed.group(1) + parsed.group(2), "%Y%m%d%H%M%S")
    all_annotations.append((filename, timestamp))

# Keep only the second half of 2024, i.e. months 7 (July) through 12 (December)
second_half_annotations = [
    (filename, date)
    for filename, date in all_annotations
    if date.year == 2024 and date.month >= 7
]

# Order the kept annotations from oldest to newest
sorted_second_half_annotations = sorted(second_half_annotations, key=lambda entry: entry[1])

if not second_half_annotations:
    # Nothing falls in the second half of the year, so there is nothing to list
    print("----There are no annotations available for the second half of 2024 (July to December)----")
    print(f"We cannot arrange the annotations from the oldest ones to the newest during the second half of the year 2024 since there are {len(second_half_annotations)} files for the second half of 2024.")
else:
    print(f"Number of annotations for the second half of 2024: {len(second_half_annotations)}")
    print("Annotations from the oldest ones to the newest one during the second half of year 2024:")
    for filename, date in sorted_second_half_annotations:
        print(f"{date}: {filename}")


----There are no annotations available for the second half of 2024 (July to December)----
We cannot arrange the annotations from the oldest ones to the newest during the second half of the year 2024 since there are 0 files for the second half of 2024.
