In [1]:
import re
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup

# Root folder of the Takeout export being analysed. Every input path below is
# derived from it, so pointing the notebook at another profile is a one-line edit.
TAKEOUT_DIR = "google_takeout/c_profile4"

channel = f"{TAKEOUT_DIR}/channels/channel.csv"
comments = f"{TAKEOUT_DIR}/comments/comments.csv"
search_history = f"{TAKEOUT_DIR}/history/search-history.html"
watch_history = f"{TAKEOUT_DIR}/history/watch-history.html"

# The two CSV exports are loaded eagerly; the HTML histories are parsed later
# by get_historic_info().
channel_df = pd.read_csv(channel)
comments_df = pd.read_csv(comments)

In [2]:
def parse_date_time(date_str: str) -> tuple:
    """Split a history timestamp string into formatted date and time strings.

    The expected layout is ``<month> <day>, <year>, <h>:<m>:<s> <AM|PM>``,
    optionally followed by a timezone label, for example
    ``'May 31, 2025, 11:48:41 AM GMT-03:00'``. The gap before AM/PM may be
    empty, ordinary whitespace or a narrow no-break space (U+202F).

    Args:
        date_str: Timestamp text as it appears in the history HTML.

    Returns:
        A ``(date, time)`` tuple of strings formatted as ``'%d-%m-%Y'`` and
        ``'%H:%M:%S'`` (24-hour clock). Anything after the AM/PM marker is
        discarded, so the values are the wall-clock time exactly as written
        in ``date_str``; no timezone conversion is performed.

    Raises:
        ValueError: If the text does not match the expected layout.
    """
    # strptime's '%S%p' needs the AM/PM marker glued to the seconds, so the
    # narrow no-break space is removed first.
    cleaned = date_str.replace('\u202f', '').strip()
    # Keep the clock time plus its AM/PM marker and drop both the gap between
    # them and whatever trails the marker (' GMT-03:00', ' GMT+02:00', ' EST', ...).
    cleaned = re.sub(
        r'(\d{1,2}:\d{1,2}:\d{1,2})\s*([AaPp][Mm]).*$',
        r'\1\2',
        cleaned,
        count=1,
        flags=re.DOTALL,
    )

    # Accept both full ('January') and abbreviated ('Jan') month names; 'May'
    # is spelled the same in both, which can hide a wrong directive.
    for fmt in ('%B %d, %Y, %I:%M:%S%p', '%b %d, %Y, %I:%M:%S%p'):
        try:
            dt = datetime.strptime(cleaned, fmt)
            break
        except ValueError:
            continue
    else:
        raise ValueError(f"Unrecognized timestamp format: {date_str!r}")

    date_formatted = dt.strftime('%d-%m-%Y')  # e.g. 25-05-2025
    time_formatted = dt.strftime('%H:%M:%S')  # e.g. 01:03:32
    return date_formatted, time_formatted


In [3]:
def get_historic_info(hist_type='search'):
    """Extract rows from a YouTube history HTML export.

    Reads the file named by the module-level ``search_history`` or
    ``watch_history`` path, walks every outer activity cell and turns the
    text strings of its main content cell into a dict.

    Args:
        hist_type: ``'search'`` to read the search history, ``'watch'`` to
            read the watch history.

    Returns:
        A list of dicts, one per recognised entry. Search rows have the keys
        ``Action``, ``Query``, ``Date`` and ``Time``; watch rows have
        ``Action``, ``Title``, ``Video ID``, ``Author Channel``, ``Date`` and
        ``Time``. Cells that do not have the expected layout are skipped.

    Raises:
        ValueError: If ``hist_type`` is neither ``'search'`` nor ``'watch'``,
            or (propagated from ``parse_date_time``) if a timestamp cannot be
            parsed.
        OSError: If the history file cannot be opened.
    """
    if hist_type == 'search':
        filename = search_history
    elif hist_type == 'watch':
        filename = watch_history
    else:
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working.
        raise ValueError("Invalid hist_type.")

    with open(filename, 'r', encoding='utf-8') as f:
        html_content = f.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    cell_info = []
    for info in soup.find_all('div', class_='outer-cell mdl-cell mdl-cell--12-col mdl-shadow--2dp'):
        personal_info = info.find('div', class_='content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1')
        if personal_info is None:
            # No main content cell: nothing to extract from this entry.
            continue
        p_info = list(personal_info.stripped_strings)

        if hist_type == 'search':
            # Strings are read as [action, query, timestamp]; the length check
            # guards the p_info[2] lookup.
            if 'Searched for' in p_info and len(p_info) >= 3:
                date_part, time_part = parse_date_time(p_info[2])
                cell_info.append({
                    'Action': p_info[0].replace(' for', ''),
                    'Query': p_info[1],
                    'Date': date_part,
                    'Time': time_part
                })
            continue

        # Watch history: strings are read as [action, title, channel, timestamp].
        # Cells with three or fewer strings are skipped.
        if len(p_info) <= 3:
            continue

        google_info = info.find('div', class_='content-cell mdl-cell mdl-cell--12-col mdl-typography--caption')
        g_info = list(google_info.stripped_strings) if google_info is not None else []
        if 'From Google Ads' in g_info:
            author_channel = "Google Ads"
        else:
            author_channel = p_info[2]

        # The video ID comes from the first link in the cell; a cell without a
        # usable link keeps its row but gets no ID instead of raising.
        a_tag = personal_info.find('a')
        href = a_tag.get('href') if a_tag is not None else None
        if href is not None:
            video_id = href.replace('https://www.youtube.com/watch?v=', '')
        else:
            video_id = None

        date_part, time_part = parse_date_time(p_info[3])
        cell_info.append({
            'Action': p_info[0],
            'Title': p_info[1],
            'Video ID': video_id,
            'Author Channel': author_channel,
            'Date': date_part,
            'Time': time_part
        })
    return cell_info
        

In [4]:
# channel_df is expected to hold a single row, so .iloc[0] is enough to read
# the profile's title and ID.
comments_df['Profile Name/ID'] = (
    channel_df['Channel Title (Original)'].iloc[0] + " " + channel_df['Channel ID'].iloc[0]
)

prof4_df = comments_df[['Profile Name/ID', 'Video ID', 'Comment Create Timestamp', 'Comment Text']]
watch = get_historic_info(hist_type='watch')
search = get_historic_info(hist_type='search')

# Naming the columns explicitly keeps 'Date' and 'Time' present even when a
# history yields no rows; pd.DataFrame([]) would have no columns at all and
# the Datetime step below would raise KeyError.
watch_df = pd.DataFrame(
    watch, columns=['Action', 'Title', 'Video ID', 'Author Channel', 'Date', 'Time']
)
print(watch_df.head())

search_df = pd.DataFrame(search, columns=['Action', 'Query', 'Date', 'Time'])
print(search_df.head())

# Step 1: build a sortable Datetime column on each frame
watch_df['Datetime'] = pd.to_datetime(watch_df['Date'] + ' ' + watch_df['Time'], format='%d-%m-%Y %H:%M:%S')
search_df['Datetime'] = pd.to_datetime(search_df['Date'] + ' ' + search_df['Time'], format='%d-%m-%Y %H:%M:%S')

# Step 2: stack the two frames
# Step 3: order chronologically; a stable sort keeps watch rows ahead of search
#         rows that share the same timestamp, so the result is deterministic
# Step 4 (optional): drop the helper Datetime column
combined_df = (
    pd.concat([watch_df, search_df], ignore_index=True)
    .sort_values(by='Datetime', kind='stable')
    .reset_index(drop=True)
    .drop(columns=['Datetime'])
)

# Show the result
print(combined_df)

    Action                                              Title     Video ID  \
0  Watched  TOP 50: JOGOS QUE VOCÊ PRECISA JOGAR URGENTEMENTE  kyj_FL1doyU   
1  Watched  VALORANT ARES21 RETAKECINEMATIC 16x9 60 PT BR v02  8xXC7ced_g4   
2  Watched  JOGUEI 5 JOGOS DO TUNG TUNG TUNG SAHUR PRA TEN...  JDDZek9qCRE   
3  Watched                      Lula | O Criminoso Paz e Amor  JXSvcGdFP-E   
4  Watched                     Magalu - Quem Lacra, Não Lucra  urNk6N4Djhs   

         Author Channel        Date      Time  
0  O Meu Canal de Games  31-05-2025  11:48:41  
1            Google Ads  31-05-2025  11:48:17  
2             Jazzghost  31-05-2025  11:47:28  
3          Daniel Penin  30-05-2025  20:14:50  
4          Daniel Penin  30-05-2025  20:14:48  
     Action     Query        Date      Time
0  Searched     jogos  31-05-2025  11:47:08
1  Searched  politica  20-05-2025  18:19:44
2  Searched     jogos  20-05-2025  17:39:54
3  Searched  politica  20-05-2025  17:39:44
      Action           