In [2]:
import pandas as pd
import shutil

## Load the topics file

In [37]:
topics = pd.read_csv('data/topics_by_minute.csv')
topics = topics[['date', 'channel', 'minute', 'text', 'words_topic']]

# Filter out the 'tve' channel
topics = topics[topics['channel'] != 'tve']

topics.head()

Unnamed: 0,date,channel,minute,text,words_topic
18731,2024-02-24,a3,0.0,-No las habíamos visto hasta ahora. Son imágen...,"['bomberos', 'edificio', 'fuego', 'incendio', ..."
18732,2024-02-24,a3,1.0,-Uno de los dos bomberos heridos en el incendi...,"['incendio', 'edificio', 'bomberos', 'explosió..."
18733,2024-02-24,a3,2.0,Estamos atendiendo caso a caso de manera indiv...,"['bomberos', 'edificio', 'fuego', 'incendio', ..."
18734,2024-02-24,a3,3.0,venga de donde venga y caiga quien caiga. Es e...,"['ucrania', 'rusia', 'putin', 'guerra', 'ruso'..."
18735,2024-02-24,a3,4.0,lo que queda define semana. Temporal que afect...,"['borrasca', 'viento', 'lluvia', 'precipitacio..."


In [38]:
topics.shape

(50124, 5)

In [39]:
topics['date'] = pd.to_datetime(topics['date'])

# Running the code for November only to fill in previously missing data
topics = topics[(topics['date'].dt.month == 10) & (topics['date'].dt.year == 2023) & (topics['date'].dt.day == 7)]
topics.shape

(77, 5)

In [40]:
topics.channel.unique()

array(['a3', 'la6'], dtype=object)

In [41]:
def get_channel_name(channel):
    if channel == 'a3':
        return 'atres'
    elif channel == 'la6':
        return 'la6'
    elif channel == 't5':
        return 'telecinco'
    elif channel == 'tve':
        return 'tve'

topics['channel_fullname'] = topics['channel'].apply(get_channel_name)

In [42]:
# Filter the DataFrame
topics_ua = topics[topics['words_topic'].apply(lambda x: 'ucrania' in x)]
topics_ua.shape

(1, 6)

In [44]:
# Add a column for the filename
topics_ua['filename'] = topics_ua['channel_fullname'] + '_noche_' + topics_ua['date'].dt.strftime('%Y-%m-%d') + '.mp4'
topics_ua.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  topics_ua['filename'] = topics_ua['channel_fullname'] + '_noche_' + topics_ua['date'].dt.strftime('%Y-%m-%d') + '.mp4'


Unnamed: 0,date,channel,minute,text,words_topic,channel_fullname,filename
53372,2023-10-07,la6,1591.0,intentaba lanzar al Miura 1. En mayo las condi...,"['ucrania', 'rusia', 'putin', 'guerra', 'ruso'...",la6,la6_noche_2023-10-07.mp4


In [51]:
topics_ua.words_topic.unique()

array(["['ucrania', 'rusia', 'putin', 'guerra', 'ruso', 'moscú', 'rusos', 'kiev', 'ucraniano', 'zelenski']"],
      dtype=object)

In [45]:
topics_ua['channel'].unique()

array(['la6'], dtype=object)

## Extract and save frames (1 frame per second)

In [47]:
import cv2
import os

topics_ua_temp = topics_ua[topics_ua['channel'] == 'la6']

processed_files = []

# Iterate through the DataFrame
for index, row in topics_ua_temp.iterrows():
    filename = row['filename']
    print(filename)
    start_minute = row['minute']
    
    folder_name = row['channel_fullname']
    
    # Define video path
    video_path = os.path.join('data', 'videos', folder_name, filename)
    
    # Define the folder to save the frames
    output_folder = 'data/videos_processed'
    output_folder = os.path.join('data', 'frames', folder_name)
    
    
    # Open the video
    cap = cv2.VideoCapture(video_path)
    
    # Check if the video opened successfully
    if not cap.isOpened():
        print(f"Oops! We couldn't open the video: {filename}")
        continue
    
    # Set the frame position to the start of the specified minute
    start_frame = start_minute * 60 * cap.get(cv2.CAP_PROP_FPS)
    cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
    
    # Read frames for one minute after start time
    frame_count = 0
    
    while frame_count < cap.get(cv2.CAP_PROP_FPS) * 60:
        ret, frame = cap.read()
        if not ret:
            break
        
        # Get the current frame's timestamp in seconds
        current_time_sec = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000
        
        # Check if the current frame corresponds to the start of a new second
        if int(current_time_sec) % 1 == 0:
            # Save the frame as an image
            image_filename = os.path.join(output_folder, f"frame_{filename.replace('.mp4', '')}_sec_{int(current_time_sec)}.jpg") # - (start_minute * 60)
            cv2.imwrite(image_filename, frame)
        
        frame_count += 1
    
    # Release the video capture object
    cap.release()
    
    processed_files.append(filename)

la6_noche_2023-10-07.mp4


In [88]:
len(list(set(processed_files)))

288

In [90]:
output_folder

'data/frames/telecinco'

In [91]:
# Get a list of all files in the folder
file_names = os.listdir(output_folder)

# Print the list of file names
print(len(file_names))

45630


In [92]:
names = []
for n in file_names:
    names.append('_'.join(n.split('_')[1:-2]))
    
names = list(set(names))[1:]
    
print(len(names))

288


In [93]:
names = list(set(processed_files)) #[n + '.mp4' for n in names]
names

['telecinco_noche_2023-03-13.mp4',
 'telecinco_noche_2023-04-27.mp4',
 'telecinco_noche_2023-03-24.mp4',
 'telecinco_noche_2023-03-22.mp4',
 'telecinco_noche_2023-01-20.mp4',
 'telecinco_noche_2023-04-02.mp4',
 'telecinco_noche_2023-01-27.mp4',
 'telecinco_noche_2023-10-25.mp4',
 'telecinco_noche_2023-05-16.mp4',
 'telecinco_noche_2023-09-11.mp4',
 'telecinco_noche_2023-08-02.mp4',
 'telecinco_noche_2023-02-23.mp4',
 'telecinco_noche_2024-01-17.mp4',
 'telecinco_noche_2023-07-29.mp4',
 'telecinco_noche_2023-05-21.mp4',
 'telecinco_noche_2023-06-09.mp4',
 'telecinco_noche_2024-02-19.mp4',
 'telecinco_noche_2023-06-07.mp4',
 'telecinco_noche_2023-10-08.mp4',
 'telecinco_noche_2023-01-01.mp4',
 'telecinco_noche_2023-03-31.mp4',
 'telecinco_noche_2022-12-19.mp4',
 'telecinco_noche_2024-05-20.mp4',
 'telecinco_noche_2023-01-18.mp4',
 'telecinco_noche_2022-12-02.mp4',
 'telecinco_noche_2023-07-20.mp4',
 'telecinco_noche_2022-12-14.mp4',
 'telecinco_noche_2024-01-12.mp4',
 'telecinco_noche_20

In [94]:
# Add the file_missing column
topics_ua_temp['file_missing'] = topics_ua_temp['filename'].apply(lambda x: 0 if x in names else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  topics_ua_temp['file_missing'] = topics_ua_temp['filename'].apply(lambda x: 0 if x in names else 1)


In [96]:
filtered_telecinco = topics_ua_temp
filtered_telecinco.to_csv('data_telecinco.csv', index=False)

## Choose frames for labelling

In [15]:
import random
import os

In [19]:
file_names_telecinco = sorted(os.listdir('data/frames/telecinco'))
file_names_atres = sorted(os.listdir('data/frames/atres'))
file_names_la6 = sorted(os.listdir('data/frames/la6'))

print(len(file_names_telecinco))

45630


In [20]:
file_names_telecinco = [x for x in file_names_telecinco if x != '.DS_Store']
file_names_atres = [x for x in file_names_atres if x != '.DS_Store']
file_names_la6 = [x for x in file_names_la6 if x != '.DS_Store']

In [21]:
# Select every 20th item starting from index 0
selected_files_telecinco = file_names_telecinco[::20]
selected_files_atres = file_names_atres[::20]
selected_files_la6 = file_names_la6[::20]
len(selected_files_la6)

2800

In [26]:
# Select every 10th item
every_10th_telecinco = [file for i, file in enumerate(file_names_telecinco) if i % 10 == 0]
every_10th_atres = [file for i, file in enumerate(file_names_atres) if i % 10 == 0]
every_10th_la6 = [file for i, file in enumerate(file_names_la6) if i % 10 == 0]

# Remove items already selected in every 20th
selected_files_telecinco_new = [file for file in every_10th_telecinco if file not in selected_files_telecinco]
selected_files_atres_new = [file for file in every_10th_atres if file not in selected_files_atres]
selected_files_la6_new = [file for file in every_10th_la6 if file not in selected_files_la6]

# Check the length of the resulting list for la6
len(selected_files_telecinco_new)

2281

In [31]:
# Create the destination directory if it doesn't exist
destination_folder = 'data/data_to_label/telecinco_subset_final_2'
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# Copy the selected files to the destination folder
for file_name in selected_files_telecinco_new:
    source_path = os.path.join('data/frames/telecinco', file_name)
    destination_path = os.path.join(destination_folder, file_name)
    shutil.copyfile(source_path, destination_path)

print("Files copied successfully.")

Files copied successfully.
