# Data Exploration

### Loading the downloaded JSON data

In [1]:
import pandas as pd
import json
import urllib.request
import cv2
import os
import time
from tqdm import tqdm

In [2]:
import json

# Load the downloaded response into a Python object.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the result
# does not depend on the platform's default text encoding.
with open('curl_output.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# print(data)

In [3]:
# print(json.dumps(data, indent=2))

In [4]:
# Inspect the top-level container type of the parsed JSON.
type(data)

dict

In [5]:
# Tabular preview of the 'rows' entries (one record per line of the table).
pd.DataFrame(data['rows'])

Unnamed: 0,row_idx,row,truncated_cells
0,0,"{'videoid': 21179416, 'contentUrl': 'https://a...",[]
1,1,"{'videoid': 5629184, 'contentUrl': 'https://ak...",[]
2,2,"{'videoid': 1063125190, 'contentUrl': 'https:/...",[]
3,3,"{'videoid': 1039695998, 'contentUrl': 'https:/...",[]
4,4,"{'videoid': 9607838, 'contentUrl': 'https://ak...",[]
...,...,...,...
95,95,"{'videoid': 9005344, 'contentUrl': 'https://ak...",[]
96,96,"{'videoid': 31040395, 'contentUrl': 'https://a...",[]
97,97,"{'videoid': 24421982, 'contentUrl': 'https://a...",[]
98,98,"{'videoid': 16522138, 'contentUrl': 'https://a...",[]


In [6]:
# Keep only the 'row' column, which holds the per-video dict; the other columns are dropped.
data_intermediate = pd.DataFrame(data['rows']).loc[:, 'row']

## DataFrame creation

In [7]:
# Flatten each per-video dict down to the three fields used downstream.
df = pd.DataFrame([
    {key: item[key] for key in ('videoid', 'contentUrl', 'name')}
    for item in data_intermediate
])

df.head()

Unnamed: 0,videoid,contentUrl,name
0,21179416,https://ak.picdn.net/shutterstock/videos/21179...,Aerial shot winter forest
1,5629184,https://ak.picdn.net/shutterstock/videos/56291...,Senior couple looking through binoculars on sa...
2,1063125190,https://ak.picdn.net/shutterstock/videos/10631...,A beautiful cookie with oranges lies on a gree...
3,1039695998,https://ak.picdn.net/shutterstock/videos/10396...,Japanese highrise office skyscrapers tokyo square
4,9607838,https://ak.picdn.net/shutterstock/videos/96078...,"Zrenjanin,serbia march 21 2015: fans watching ..."


In [11]:
# Spot-check a single record (positional row 50); the same row is used as the
# sample for the video-extraction cells below.
print(df.iloc[50])

videoid                                              1021208680
contentUrl    https://ak.picdn.net/shutterstock/videos/10212...
name          Circa 1950s - natural gas is used as power in ...
Name: 50, dtype: object


### video extraction

In [12]:
# Download one sample video (positional row 50) into the local 'videos' folder.
video_info = df.iloc[50]
url_link = video_info['contentUrl']

# exist_ok=True keeps the cell re-runnable once the folder already exists.
os.makedirs('videos', exist_ok=True)

import urllib.request
# Double quotes outside, single quotes inside: reusing the same quote type
# inside an f-string is a SyntaxError before Python 3.12.
urllib.request.urlretrieve(url_link, f"videos/{video_info['videoid']}.mp4")

('videos/1021208680.mp4', <http.client.HTTPMessage at 0x12d3b6de0>)

In [13]:
import cv2

# Naive approach: decode frames sequentially until the middle one is reached.

# cv2.imwrite returns False instead of raising when the target folder is
# missing, so make sure it exists before writing.
os.makedirs('images', exist_ok=True)

vidcap = cv2.VideoCapture(f"videos/{video_info['videoid']}.mp4")
total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
print(total_frames)

count = 0  # index of the frame returned by the next read()
try:
    while True:
        success, image = vidcap.read()
        if not success:
            # End of stream (or decode error): image is None, nothing to save.
            break
        if count == total_frames // 2:
            cv2.imwrite(f"images/{video_info['videoid']}.jpg", image)  # save frame as JPEG file
            break  # the remaining frames are not needed
        count += 1
finally:
    # Always free the decoder handle, even if reading or writing fails.
    vidcap.release()


1904


## Dataset creation function

In [14]:
def extract_mid_frame(video_path, output_path):
    """Save the middle frame of a video as an image file.

    Args:
        video_path: Path of the video file to read.
        output_path: Path of the image file to write. Missing parent folders
            are created.

    Raises:
        IOError: If the video cannot be opened or the image cannot be written.
        ValueError: If the video reports no frames or the middle frame cannot
            be read.
    """
    vidcap = cv2.VideoCapture(video_path)
    try:
        if not vidcap.isOpened():
            raise IOError(f"Cannot open video: {video_path}")

        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            raise ValueError(f"No frames found in: {video_path}")

        # Jump directly to the middle frame instead of reading all frames.
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
        success, image = vidcap.read()
    finally:
        # Release the capture on every path, including the raises above.
        vidcap.release()

    if not success:
        raise ValueError("Failed to read middle frame")

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # cv2.imwrite signals a failed write through its return value, so check it
    # rather than silently producing no file.
    if not cv2.imwrite(output_path, image):
        raise IOError(f"Cannot write image: {output_path}")

In [15]:
# Function check: run the helper on the sample video downloaded earlier.
# The f-strings use double quotes outside and single quotes inside, because
# reusing the same quote type inside an f-string is a SyntaxError before Python 3.12.

extract_mid_frame(f"videos/{video_info['videoid']}.mp4", f"images/{video_info['videoid']}.jpg")

In [20]:
def getData(json_file, limit=5):
    """Download the videos listed in a JSON file and save each one's middle frame.

    Args:
        json_file: Path of the JSON file. It is expected to hold a ``rows``
            list whose entries each carry a ``row`` dict with ``videoid``,
            ``contentUrl`` and ``name`` keys.
        limit: Maximum number of rows to process. Defaults to 5 (a quick test
            run); pass ``None`` to process every row.

    Returns:
        DataFrame with the columns ``videoid``, ``contentUrl``, ``name``,
        ``video_path`` and ``image_path``. A path is missing (``None``) when
        that download or frame extraction failed. The same table is written
        to ``webvid_extended_dataset.csv``.
    """
    # reading json and creating DataFrame
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Slice the raw list (not the DataFrame) so the columns added below are
    # assigned on a fresh frame rather than on a slice of another one.
    records = [
        {
            'videoid': entry['row']['videoid'],
            'contentUrl': entry['row']['contentUrl'],
            'name': entry['row']['name']
        }
        for entry in data['rows'][:limit]
    ]
    df = pd.DataFrame(records, columns=['videoid', 'contentUrl', 'name'])

    # Create the output folders when they do not exist yet.
    os.makedirs("videos", exist_ok=True)
    os.makedirs("images", exist_ok=True)

    # video and frame extraction
    video_paths = []
    image_paths = []
    for row in tqdm(df.itertuples(index=False), total=len(df)):
        video_id = row.videoid
        video_path = os.path.join('videos', f'{video_id}.mp4')
        image_path = os.path.join('images', f'{video_id}.jpg')

        # downloading videos
        try:
            urllib.request.urlretrieve(row.contentUrl, video_path)
            time.sleep(0.5)  # prevent 429 errors
        except Exception as e:
            print(f"Failed: {video_id} - {e}")
            # Without a video there is nothing to extract a frame from.
            video_paths.append(None)
            image_paths.append(None)
            continue
        video_paths.append(video_path)

        # extracting frames; one unreadable video must not abort the whole run
        try:
            extract_mid_frame(video_path, image_path)
        except (IOError, ValueError) as e:
            print(f"Failed: {video_id} - {e}")
            image_path = None
        image_paths.append(image_path)

    # appending to dataframe
    df['video_path'] = video_paths
    df['image_path'] = image_paths

    df.to_csv("webvid_extended_dataset.csv", index=False)

    return df

In [21]:
# Build the dataset (downloads, frame extraction, CSV export) and keep the resulting DataFrame.
final_df = getData('curl_output.json')

100%|██████████| 5/5 [00:03<00:00,  1.56it/s]


## Uploading to AWS S3

In [None]:
import boto3
import os
from boto3.session import Session

# Upload settings
bucket_name = 'it-ml-bucket'
base_dirs = ['images', 'videos']  # local folders whose files are uploaded
s3_prefix = 'ivc_data/'           # key prefix inside the bucket

# The profile is named explicitly: solution to signature mismatch.
session = Session(profile_name="default")
s3 = session.client("s3")

for base_dir in base_dirs:
    for entry in os.listdir(base_dir):
        local_path = os.path.join(base_dir, entry)

        # Only regular files are uploaded; anything else is skipped.
        if not os.path.isfile(local_path):
            continue

        object_key = s3_prefix + base_dir + "/" + entry  # e.g., ivc_data/images/xxx.jpg
        destination = f"s3://{bucket_name}/{object_key}"
        try:
            print("Uploading to:", destination)
            s3.upload_file(local_path, bucket_name, object_key)
            print(f" Uploaded: {local_path} → {destination}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print(f" Failed: {local_path} - {e}")


Uploading to: s3://it-ml-bucket/ivc_data/images/21179416.jpg
 Uploaded: images/21179416.jpg → s3://it-ml-bucket/ivc_data/images/21179416.jpg
Uploading to: s3://it-ml-bucket/ivc_data/images/1063125190.jpg
 Uploaded: images/1063125190.jpg → s3://it-ml-bucket/ivc_data/images/1063125190.jpg
Uploading to: s3://it-ml-bucket/ivc_data/images/1039695998.jpg
 Uploaded: images/1039695998.jpg → s3://it-ml-bucket/ivc_data/images/1039695998.jpg
Uploading to: s3://it-ml-bucket/ivc_data/images/9607838.jpg
 Uploaded: images/9607838.jpg → s3://it-ml-bucket/ivc_data/images/9607838.jpg
Uploading to: s3://it-ml-bucket/ivc_data/images/5629184.jpg
 Uploaded: images/5629184.jpg → s3://it-ml-bucket/ivc_data/images/5629184.jpg
Uploading to: s3://it-ml-bucket/ivc_data/videos/1039695998.mp4
 Uploaded: videos/1039695998.mp4 → s3://it-ml-bucket/ivc_data/videos/1039695998.mp4
Uploading to: s3://it-ml-bucket/ivc_data/videos/5629184.mp4
 Uploaded: videos/5629184.mp4 → s3://it-ml-bucket/ivc_data/videos/5629184.mp4
Uplo

The `default` profile is specified explicitly in the notebook (via `Session(profile_name="default")`) because `AWS_PROFILE` is not set in the environment.

In [60]:
import os
# print(os.path.expanduser("~/.aws/credentials"))
# AWS_PROFILE is not set in this environment, so the lookup yields None rather than "default".
print(os.getenv("AWS_PROFILE"))

None
