<a href="https://colab.research.google.com/github/cemvardar/pragmatic_google_colab/blob/main/dslab_colab_utility_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q pymongo
import json
import requests
import pandas as pd
import random
import folium
from pymongo import MongoClient, UpdateOne
import urllib.parse
from datetime import datetime
from google.oauth2 import service_account
import mimetypes
import os
from google.cloud import storage
from oauth2client.service_account import ServiceAccountCredentials
import httplib2

from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
from google.colab import sheets
from IPython.display import HTML, display


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.4/1.4 MB[0m [31m43.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/313.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:


def read_df_from_sheet_gspread(sheet_url, worksheet_name=None):
    """Read a Google Sheet into pandas through gspread.

    Args:
        sheet_url: Spreadsheet URL; the spreadsheet ID is the path segment
            that follows ``/d/``.
        worksheet_name: Title of the worksheet to read, or ``None``.

    Returns:
        A single DataFrame when ``worksheet_name`` is given, or when it is
        ``None`` and the spreadsheet has exactly one worksheet. Otherwise a
        dict mapping every worksheet title to its DataFrame.
    """
    def _worksheet_to_df(worksheet):
        # The first row is used as the header. A worksheet that returns no
        # rows at all becomes an empty DataFrame instead of an IndexError.
        values = worksheet.get_all_values()
        if not values:
            return pd.DataFrame()
        return pd.DataFrame(values[1:], columns=values[0])

    creds, _ = default()
    gc = gspread.authorize(creds)

    # Extract the sheet ID from the URL
    sheet_id = sheet_url.split('/d/')[1].split('/')[0]
    sheet = gc.open_by_key(sheet_id)

    if worksheet_name:
        return _worksheet_to_df(sheet.worksheet(worksheet_name))

    # Fetch the worksheet list once and reuse it for both remaining cases.
    worksheets = sheet.worksheets()
    if worksheet_name is None and len(worksheets) == 1:
        return _worksheet_to_df(worksheets[0])

    return {worksheet.title: _worksheet_to_df(worksheet) for worksheet in worksheets}


def post_to_rest_api(payload, url):
    """POST ``payload`` to ``url`` as a JSON-encoded body.

    The payload is serialized with ``json.dumps`` and sent with JSON
    ``Content-type`` and ``Accept`` headers. The response object is returned
    as-is; the status code is not checked here.
    """
    json_headers = {'Content-type': 'application/json', 'Accept': 'application/json'}
    body = json.dumps(payload)
    return requests.post(url, data=body, headers=json_headers)

def get_df_from_sheet(key, sheet_name):
    """Fetch a sheet through the ``get_sheet_json`` REST endpoint.

    Posts ``key`` and ``sheet_name`` to the endpoint and builds a DataFrame
    from the JSON body of the response.
    """
    endpoint = 'http://decisionsciencelab.com/api/v1.0/get_sheet_json'
    response = post_to_rest_api({'key': key, 'sheet_name': sheet_name}, endpoint)
    return pd.DataFrame(response.json())


def get_mongodb_url():
    """Build the MongoDB connection URI from the module-level ``secrets``.

    Reads ``mongodb_user`` and ``mongodb_password`` from ``secrets``. Both
    are percent-escaped so that reserved characters such as ``@``, ``:`` or
    ``/`` in either value cannot break the URI.

    Returns:
        The ``mongodb+srv://`` URI string.
    """
    userid = urllib.parse.quote_plus(secrets['mongodb_user'])
    password = urllib.parse.quote_plus(secrets['mongodb_password'])
    return (
        f"mongodb+srv://{userid}:{password}"
        "@location-selection.vfmji.gcp.mongodb.net/location_selection"
        "?retryWrites=true&w=majority"
    )


def get_document_list_from_mongodb(db_name, collection_name):
    """Return every document of a MongoDB collection as a list.

    Args:
        db_name: The name of the database.
        collection_name: The name of the collection.

    Returns:
        A list with all documents returned by an unfiltered ``find()``.
    """
    # The cursor is fully materialized inside the ``with`` block, so the
    # client can be closed before returning instead of being left open.
    with MongoClient(get_mongodb_url(), retryWrites=False) as client:
        return list(client[db_name][collection_name].find())


def get_df_from_mongodb(db_name, collection_name):
    """Load all documents of a MongoDB collection into a DataFrame.

    Args:
        db_name: The name of the database.
        collection_name: The name of the collection.
    """
    return pd.DataFrame(get_document_list_from_mongodb(db_name, collection_name))


def get_collection(db_name, collection_name):
    """Return a handle to ``collection_name`` inside ``db_name``.

    A new ``MongoClient`` (created with ``retryWrites=False``) is opened on
    every call; the returned collection stays bound to that client.
    """
    mongo_client = MongoClient(get_mongodb_url(), retryWrites=False)
    return mongo_client[db_name][collection_name]


def insert(db_name, collection_name, json_doc):
    """Insert a single document into a MongoDB collection.

    Args:
        db_name: The name of the database.
        collection_name: The name of the collection.
        json_doc: The document to insert.
    """
    get_collection(db_name, collection_name).insert_one(json_doc)

def insert_many(db_name, collection_name, json_docs):
    """Inserts multiple JSON documents into a MongoDB collection.

    An empty input is a no-op: no connection is opened and nothing is
    written, matching how ``upsert_many`` treats an empty batch.

    Args:
        db_name: The name of the database.
        collection_name: The name of the collection.
        json_docs: A list (or other iterable) of JSON documents to insert.
    """
    # Materialize once so that generators are supported and emptiness can be
    # checked before opening a connection.
    docs = list(json_docs)
    if not docs:
        return
    get_collection(db_name, collection_name).insert_many(docs)


def upsert(db_name, collection_name, query, doc_to_upsert):
    """Update the first document matching ``query``, or insert one.

    The fields of ``doc_to_upsert`` are applied with ``$set`` and the
    operation runs with ``upsert=True``.

    Args:
        db_name: The name of the database.
        collection_name: The name of the collection.
        query: Filter selecting the document to update.
        doc_to_upsert: Fields to set on the matched or newly created document.
    """
    target = get_collection(db_name, collection_name)
    set_fields = {'$set': doc_to_upsert}
    target.update_one(query, set_fields, upsert=True)

def upsert_many(db_name, collection_name, json_docs, filter_key='_id'):
    """Upserts multiple JSON documents into a MongoDB collection.

    Each document becomes one ``UpdateOne`` with ``$set`` and ``upsert=True``.
    When a document lacks ``filter_key``, the whole document is used as the
    filter. An empty input is a no-op and opens no connection.

    Args:
        db_name: The name of the database.
        collection_name: The name of the collection.
        json_docs: A list of JSON documents to upsert.
        filter_key: The key to use for filtering existing documents.
                    Defaults to '_id'.
    """
    # Local names deliberately avoid ``requests`` (the imported HTTP module)
    # and ``filter`` (the builtin).
    operations = [
        UpdateOne(
            {filter_key: doc[filter_key]} if filter_key in doc else doc,
            {'$set': doc},
            upsert=True,
        )
        for doc in json_docs
    ]
    if not operations:
        return

    result = get_collection(db_name, collection_name).bulk_write(operations)
    print(f"Upserted {result.upserted_count} documents, "
          f"modified {result.modified_count} documents.")


def now():
    """Return the current local date and time as a naive ``datetime``."""
    current_moment = datetime.now()
    return current_moment


def get_gcp_bucket_credentials():
    """Build service-account credentials for the GCP bucket.

    The static service-account fields are hard-coded here; the private key
    id and private key are read from the module-level ``secrets`` dict.

    Returns:
        The object produced by
        ``service_account.Credentials.from_service_account_info``.
    """
    # The stored key carries literal backslash-n sequences; turn them into
    # real newlines before handing the key to the credentials factory.
    private_key = secrets['gcp_private_key'].replace('\\n', '\n')
    service_account_info = {
        "type": "service_account",
        "project_id": "cem-k8-test",
        "private_key_id": secrets['gcp_private_key_id'],
        "private_key": private_key,
        "client_email": "dslab-gcp-bucket@cem-k8-test.iam.gserviceaccount.com",
        "client_id": "101834349465593903398",
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "token_uri": "https://oauth2.googleapis.com/token",
        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
        "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/dslab-gcp-bucket%40cem-k8-test.iam.gserviceaccount.com",
    }
    return service_account.Credentials.from_service_account_info(service_account_info)


def upload_file_to_gcp_generic_mime_type(file_name, sub_folder_path, delete_file_from_local = False):
    """Upload a local file to the ``decision-science-lab-bucket`` bucket.

    The content type is guessed from ``file_name`` and falls back to
    ``application/octet-stream``. Spaces in ``file_name`` become underscores
    in the object name, which is placed under ``sub_folder_path``.

    Args:
        file_name: Local file to upload; also the basis of the object name.
        sub_folder_path: Prefix inside the bucket.
        delete_file_from_local: When true, remove the local file after the
            upload (a confirmation line is printed only in that case).

    Returns:
        The ``storage.googleapis.com`` link of the uploaded object.
    """
    gcs_client = storage.Client(project='cem-k8-test',
                                credentials=get_gcp_bucket_credentials())

    guessed_type = mimetypes.guess_type(file_name)[0]
    content_type = 'application/octet-stream' if guessed_type is None else guessed_type

    object_name = file_name.replace(' ', '_')
    target_blob = gcs_client.bucket('decision-science-lab-bucket').blob(
        f"{sub_folder_path}/{object_name}")
    target_blob.upload_from_filename(file_name, content_type=content_type)

    # Optional clean-up of the local copy.
    if delete_file_from_local and os.path.exists(file_name):
        os.remove(file_name)
        print(f"{file_name} successfully uploaded to GCP and deleted from local")

    gcp_link = ('https://storage.googleapis.com/decision-science-lab-bucket/'
                f'{sub_folder_path}/{object_name}')
    print(gcp_link)
    return gcp_link


def get_all_urls_in_gcp_sub_folder_path(sub_folder_path):
    """List the URLs of all objects stored under a bucket prefix.

    Args:
        sub_folder_path: Prefix inside ``decision-science-lab-bucket``; a
            trailing ``/`` is appended before listing.

    Returns:
        A list of ``https://storage.googleapis.com/...`` URLs, one per blob
        whose name starts with ``sub_folder_path + '/'``.
    """
    bucket_name = 'decision-science-lab-bucket'
    project_id = 'cem-k8-test'
    client = storage.Client(project=project_id, credentials=get_gcp_bucket_credentials())
    blobs = client.list_blobs(bucket_name, prefix=sub_folder_path + '/')
    url_prefix = 'https://storage.googleapis.com/decision-science-lab-bucket/'
    return [url_prefix + blob.name for blob in blobs]


def get_secrets():
    """Load the key/value secrets from the ``dev`` worksheet.

    Returns:
        A dict mapping each entry of the worksheet's ``key`` column to the
        entry of its ``value`` column on the same row.
    """
    secrets_df = read_df_from_sheet_gspread(
        'https://docs.google.com/spreadsheets/d/1mLwdiSnTi0KoB8Zg6kMclTXXm3f_JavMX-5lAUp-Ry0/edit#gid=0',
        worksheet_name='dev',
    )
    return dict(zip(secrets_df['key'], secrets_df['value']))


# Loaded once when this cell runs. get_mongodb_url() and
# get_gcp_bucket_credentials() read their credentials from this dict.
secrets = get_secrets()

In [3]:
import shutil

def upload_gdrive_file_to_gcp_generic_mime_type(file_path, gcp_path):
    """Copy a file into ``/content`` and upload it to the GCP bucket.

    Args:
        file_path: Path of the source file (e.g. on a mounted drive).
        gcp_path: Sub-folder path inside the bucket.

    Returns:
        The link returned by ``upload_file_to_gcp_generic_mime_type``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        # Stop here: continuing would either fail inside the upload or push
        # (and then delete) a stale same-named file from the working directory.
        raise FileNotFoundError(f"Error: Source file '{file_path}' not found.")

    video_filename = file_path.split('/')[-1]
    destination_path = f'/content/{video_filename}'

    # Skip the copy when the source already is the destination; copy2 would
    # raise SameFileError in that case.
    if os.path.abspath(file_path) != os.path.abspath(destination_path):
        shutil.copy2(file_path, destination_path)  # copy2 preserves metadata
        print(f"File '{file_path}' copied successfully to '{destination_path}'")

    # NOTE(review): the upload receives the bare file name, so it is resolved
    # against the current working directory; this relies on that being /content.
    video_url = upload_file_to_gcp_generic_mime_type(video_filename, gcp_path, delete_file_from_local=True)
    return video_url


# sub_folder_path = 'liplips/roma_silvers_products'
# urls = get_all_urls_in_gcp_sub_folder_path(sub_folder_path)
# urls[:5]
# len(urls)

In [4]:
def get_file_name_for_export_with_date_time(file_name_header, file_extenstion):
    """Build a timestamped export file name.

    The result has the form ``<header>_<MM_DD_YYYY_HH_MM_SS>.<extension>``,
    with every space replaced by an underscore.

    Args:
        file_name_header: Leading part of the file name.
        file_extenstion: Extension without the leading dot.
    """
    stamp = now().strftime("%m_%d_%Y_%H_%M_%S")
    return f"{file_name_header}_{stamp}.{file_extenstion}".replace(' ', '_')

In [5]:
def get_time_stamp_string():
    """Return the current time formatted as ``MM_DD_YYYY_HH_MM_SS``."""
    stamp_format = "%m_%d_%Y_%H_%M_%S"
    return now().strftime(stamp_format)

# get_time_stamp_string()

In [6]:
from pandas import json_normalize

def sum_last_3_views(performance_metrics, field):
    """Sum ``field`` over the last (up to) three metric records.

    Args:
        performance_metrics: A list of metric records (dict-like, supporting
            ``.get``).
        field: Key to read from each record; a record without it counts as 0.

    Returns:
        The sum over the last three records, or over all of them when there
        are fewer than three. Returns 0 when the input cannot be sliced or
        the values cannot be summed (TypeError / IndexError).
    """
    try:
        recent_records = performance_metrics[-3:]
        return sum(record.get(field, 0) for record in recent_records)
    except (TypeError, IndexError):
        # Missing data (e.g. None / NaN) or non-numeric values.
        return 0

def get_start_end_dates(performance_metrics):
    """Return the date window covered by the last (up to) three records.

    Args:
        performance_metrics: A list of metric records with ``start_date``
            and ``end_date`` keys.

    Returns:
        A tuple ``(start, end)`` converted with ``pd.to_datetime``: the
        ``start_date`` of the third-from-last record (or of the first record
        when there are fewer than three) and the ``end_date`` of the last
        record. ``(None, None)`` on TypeError, IndexError or KeyError.
    """
    try:
        lookback = min(3, len(performance_metrics))
        window_start = performance_metrics[-lookback]['start_date']
        window_end = performance_metrics[-1]['end_date']
        return pd.to_datetime(window_start), pd.to_datetime(window_end)
    except (TypeError, IndexError, KeyError):
        # Non-list input, empty list, or records without the date keys.
        return None, None

def get_tag_list(df_row):
    """Collect the non-empty tags of a row.

    Reads ``tag_1`` through ``tag_13`` from ``df_row`` and keeps, in order,
    every value that is not null and has a length greater than zero.
    """
    tag_columns = (f'tag_{position}' for position in range(1, 14))
    return [
        df_row[column]
        for column in tag_columns
        if pd.notnull(df_row[column]) and len(df_row[column]) > 0
    ]

def get_current_etsy_data_df():
    """Load the Etsy listings and derive summary columns.

    Reads ``location_selection.etsy_listings`` from MongoDB and adds:
    ``performance_metrics_last_read`` (last metric record, or None when the
    cell is not a list), ``total_<field>_last_3`` for views / favourites /
    order / revenue, ``start_date`` / ``end_date`` of the last-three window,
    ``metric_days`` (days between them) and ``tag_list``.

    Returns:
        The enriched DataFrame.
    """
    def _latest_record(metrics):
        return metrics[-1] if type(metrics) is list else None

    def _date_window(metrics):
        return pd.Series(get_start_end_dates(metrics))

    listings = get_df_from_mongodb('location_selection', 'etsy_listings')
    metrics_column = listings['performance_metrics']

    listings['performance_metrics_last_read'] = metrics_column.apply(_latest_record)
    for metric_name in ('views', 'favourites', 'order', 'revenue'):
        listings[f'total_{metric_name}_last_3'] = metrics_column.apply(sum_last_3_views, field=metric_name)

    listings[['start_date', 'end_date']] = metrics_column.apply(_date_window)
    for date_column in ('start_date', 'end_date'):
        listings[date_column] = pd.to_datetime(listings[date_column])
    listings['metric_days'] = (listings['end_date'] - listings['start_date']).dt.days

    listings['tag_list'] = listings.apply(get_tag_list, axis=1)
    return listings

# df = get_current_etsy_data_df()
# df[['listing_id', 'total_views_last_3',
#                   'total_favourites_last_3','total_revenue_last_3',
#                   'total_order_last_3',
#                   'start_date', 'end_date', 'metric_days' ]]

In [8]:
from moviepy.editor import VideoFileClip

def get_video_duration(video_path):
    """Gets the duration of a video file in seconds.

    Args:
        video_path: The path to the video file.

    Returns:
        The duration of the video in seconds, or None if the clip cannot be
        opened or read (the error is printed).
    """
    try:
        clip = VideoFileClip(video_path)
        try:
            return clip.duration
        finally:
            # Release the clip's resources even if reading the duration fails.
            clip.close()
    except Exception as e:
        print(f"Error getting video duration for {video_path}: {e}")
        return None

# get_video_duration("/content/Fenomenlik_aslanın_ağzında.mp4")

44.17

In [9]:
def cut_video(video_path, start_time, end_time, output_path):
    """Cuts a video file from start_time to end_time and saves the output.

    Errors are printed rather than raised; the function returns None either
    way.

    Args:
        video_path: The path to the input video file.
        start_time: The start time in seconds.
        end_time: The end time in seconds.
        output_path: The path to save the cut video.
    """
    try:
        clip = VideoFileClip(video_path)
        try:
            clip.subclip(start_time, end_time).write_videofile(output_path)
        finally:
            # Close the source clip even when cutting or writing fails.
            clip.close()
        print(f"Video successfully cut and saved to {output_path}")
    except Exception as e:
        print(f"Error cutting video {video_path}: {e}")

# cut_video("Fenomenlik_aslanın_ağzında.mp4", 2, 5, "Fenomenlik_aslanın_ağzında_kisa.mp4")

Moviepy - Building video Fenomenlik_aslanın_ağzında_kisa.mp4.
MoviePy - Writing audio in Fenomenlik_aslanın_ağzında_kisaTEMP_MPY_wvf_snd.mp3




MoviePy - Done.
Moviepy - Writing video Fenomenlik_aslanın_ağzında_kisa.mp4





Moviepy - Done !
Moviepy - video ready Fenomenlik_aslanın_ağzında_kisa.mp4
Video successfully cut and saved to Fenomenlik_aslanın_ağzında_kisa.mp4


In [12]:
def download_file_from_url(url, filename=None):
    """Downloads a file from a URL, streaming it to disk in 8 KiB chunks.

    Args:
        url: The URL of the file to download.
        filename: The local filename to save the downloaded file as. When
            None, the last segment of the URL path is used; percent-escapes
            are kept as they appear, query string and fragment are ignored.

    Returns:
        The local filename on success, or None when the request fails (the
        error is printed).
    """
    if filename is None:
        # Take the name from the path only so that "?query" / "#fragment"
        # never end up in the local file name.
        filename = os.path.basename(urllib.parse.urlsplit(url).path)

    try:
        # The context manager releases the streamed connection in all cases.
        with requests.get(url, stream=True) as response:
            response.raise_for_status()  # Raise an exception for bad status codes
            with open(filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        return None

    print(f"File downloaded successfully as {filename}")
    return filename

# url = 'https://storage.googleapis.com/decision-science-lab-bucket/derin_seinfeld/tvittvitanadolu/bolum_edits/domates_kamyonu/shorts/Ak%C5%9Fam_yeme%C4%9Finde_domat_var..mp4'
# download_file_from_url(url, '/content/sample_data/testing.mp4')

File downloaded successfully as /content/sample_data/testing.mp4


'/content/sample_data/testing.mp4'

In [17]:
def save_audio(video_file_path, gcp_folder_path):
    """Extract the audio track of an .mp4 file to .mp3 and upload it.

    The .mp3 is written next to the video, copied into the current working
    directory when it is not already there, and uploaded under its base name.

    Args:
        video_file_path: Path to the input ``.mp4`` file.
        gcp_folder_path: Sub-folder path inside the bucket.

    Returns:
        The link returned by ``upload_file_to_gcp_generic_mime_type``.

    Raises:
        ValueError: If the input is not an ``.mp4`` file, or the clip exposes
            no audio track.
    """
    root, extension = os.path.splitext(video_file_path)
    if extension.lower() != '.mp4':
        # Without this check the function would fail further down on an
        # undefined audio file name.
        raise ValueError(f"save_audio expects an .mp4 file, got '{video_file_path}'")
    # Swap only the extension; directory names containing ".mp4" stay intact.
    audio_file_name = root + '.mp3'

    video_clip = VideoFileClip(video_file_path)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"'{video_file_path}' has no audio track")
        audio_clip.write_audiofile(audio_file_name, codec='mp3')
    finally:
        video_clip.close()

    # The upload helper uses the file name as the object name, so it is given
    # the bare name of a copy in the working directory.
    file_name = os.path.basename(audio_file_name)
    if os.path.abspath(audio_file_name) != os.path.abspath(file_name):
        shutil.copy(audio_file_name, file_name)
    saved_audio_url = upload_file_to_gcp_generic_mime_type(file_name, gcp_folder_path, delete_file_from_local=False)
    return saved_audio_url

def save_audio_from_url(url, gcp_folder_path):
    """Download a video from ``url`` and upload its audio track.

    Args:
        url: URL of the video file.
        gcp_folder_path: Sub-folder path inside the bucket.

    Returns:
        The link returned by ``save_audio``.

    Raises:
        RuntimeError: If the download did not produce a local file.
    """
    file_name = download_file_from_url(url)
    if file_name is None:
        # download_file_from_url reports request failures by returning None.
        raise RuntimeError(f"Could not download '{url}'; no audio was saved.")
    return save_audio(file_name, gcp_folder_path)

# gcp_folder_path = f'derin_seinfeld/tvittvitanadolu/test'
# save_audio('/content/sample_data/testing.mp4', gcp_folder_path)
# url = 'https://storage.googleapis.com/decision-science-lab-bucket/derin_seinfeld/tvittvitanadolu/bolum_edits/domates_kamyonu/shorts/Bir_hedef_u%C4%9Fruna_yek_v%C3%BCcut.mp4'
# save_audio_from_url(url, gcp_folder_path)



File downloaded successfully as Bir_hedef_u%C4%9Fruna_yek_v%C3%BCcut.mp4
MoviePy - Writing audio in Bir_hedef_u%C4%9Fruna_yek_v%C3%BCcut.mp3




MoviePy - Done.
https://storage.googleapis.com/decision-science-lab-bucket/derin_seinfeld/tvittvitanadolu/test/Bir_hedef_u%C4%9Fruna_yek_v%C3%BCcut.mp3


'https://storage.googleapis.com/decision-science-lab-bucket/derin_seinfeld/tvittvitanadolu/test/Bir_hedef_u%C4%9Fruna_yek_v%C3%BCcut.mp3'

In [18]:
def list_files_and_folders(folder_path):
    """Split the entries of a directory into files and folders.

    Args:
        folder_path: The path to the directory.

    Returns:
        A tuple ``(files, folders)`` of entry names (not full paths), each in
        the order ``os.listdir`` reported them. Entries that are neither a
        file nor a directory are left out.
    """
    entry_names = os.listdir(folder_path)
    files = [name for name in entry_names
             if os.path.isfile(os.path.join(folder_path, name))]
    folders = [name for name in entry_names
               if os.path.isdir(os.path.join(folder_path, name))]
    return files, folders

# list_files_and_folders('/content')

(['Bir_hedef_u%C4%9Fruna_yek_v%C3%BCcut.mp3',
  'Fenomenlik_aslanın_ağzında.mp4',
  'Ak%C5%9Fam_yeme%C4%9Finde_domat_var..mp4',
  'Fenomenlik_aslanın_ağzında_kisa.mp4',
  'testing.mp3',
  'Bir_hedef_u%C4%9Fruna_yek_v%C3%BCcut.mp4'],
 ['.config', '.ipynb_checkpoints', 'sample_data'])

In [20]:
from google.colab import sheets
def get_transcript_from_audio(audio_url):
    """Request utterances for an audio file and expose them as a sheet.

    Posts the audio URL to the ``load_utterances`` endpoint, prints the
    returned utterances, and builds a DataFrame of ``start`` / ``end`` /
    ``transcript`` sorted by ``start``.

    Args:
        audio_url: URL of the audio file; its last path segment is used in
            the episode title sent to the API.

    Returns:
        A tuple ``(response_data, df_sheet)``: the decoded JSON response and
        a ``sheets.InteractiveSheet`` built from the utterance DataFrame.

    Raises:
        RuntimeError: If the API answers with a status code other than 200.
    """
    audio_filename = audio_url.split('/')[-1]
    api_url = 'https://www.decisionsciencelab.com/load_utterances'
    data = {
        'audio_url': audio_url,
        'podcast_name': 'tvittvitanadolu',
        'episode_title': f'manual_episode {audio_filename}',
        'current_speaker_mapping': {}  # No existing speaker mapping is sent.
    }
    response = requests.post(api_url, json=data)

    if response.status_code != 200:
        # Fail with the status code instead of continuing with undefined data.
        raise RuntimeError(f'Error loading utterances: {response.status_code}')

    print('Utterances loaded successfully!')
    response_data = response.json()
    utterances = response_data['utterances']
    print(utterances)

    utterance_rows = [
        [utterance['start'], utterance['end'], utterance['transcript']]
        for utterance in utterances.values()
    ]
    df = pd.DataFrame(utterance_rows, columns=['start', 'end', 'transcript'])
    df = df.sort_values(by='start')
    df_sheet = sheets.InteractiveSheet(df=df, display=False)
    return response_data, df_sheet

# audio_url = 'https://storage.googleapis.com/decision-science-lab-bucket/derin_seinfeld/tvittvitanadolu/test/testing.mp3'
# response_data, df_sheet = get_transcript_from_audio(audio_url)

Utterances loaded successfully!
{'0': {'end': 2.82, 'id': '0', 'speaker': 0, 'start': 0.08, 'transcript': 'Akşam yemeği yiyemeyecek kadar yedim, diyor.'}, '1': {'end': 10.179999, 'id': '1', 'speaker': 0, 'start': 5.2799997, 'transcript': 'Bedava da olsa bir insan akşam yemeği yiyemeyecek kadar domates yiyebilir mi?'}, '2': {'end': 14.171062, 'id': '2', 'speaker': 0, 'start': 10.24, 'transcript': 'Yani İmkansız bir şey Yani imkansız.'}, '3': {'end': 20.506124, 'id': '3', 'speaker': 0, 'start': 14.231062, 'transcript': 'Böyle bir senaryoyu ancak gerçekten bizim ana akım medyamız icat edebilir'}}
https://docs.google.com/spreadsheets/d/1Ifp4Tlw6qxMVGrTyopx7MhUIV93bMEVmtLSYWafrg7Q/edit#gid=0
