# Codes 

Part 1 - Labels 

In [1]:
import pandas
import boto3
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import io
import time

s3 = boto3.resource('s3')
bucket = s3.Bucket('bpbucket24')

print(bucket)

session = boto3.Session()
client = session.client('rekognition')

#I am creating an empty dictionary here
output = {"videoname":[], "labels":[], "confidencescores":[], "timestamps":[]}

for file in bucket.objects.filter(Prefix="videos"):
    #Print image name for each video in the S3 bucket
    print(file.key)
    response = client.start_label_detection(
    Video={
        "S3Object": {
            "Bucket": "bpbucket24",
            "Name": file.key
        }})
    job_id = response["JobId"]
    print(f"{job_id=}")
    while True:
        response = client.get_label_detection(JobId=job_id)
        print(f"{response['JobStatus']=}")
        if response['JobStatus'] == 'IN_PROGRESS':
            print("time.sleep(10)")
            time.sleep(10)
        else:
            break

    next_token = None

    while True:
        if next_token is None:
            response = client.get_label_detection(JobId=job_id)
        else:
            response = client.get_label_detection(JobId=job_id, NextToken=next_token)
        next_token = response.get("NextToken")
        labels = response["Labels"]

        if next_token is None:
            break

    for labelDetection in response['Labels']:
        label = labelDetection['Label']
        if label['Confidence']> 90.00:
            if label['Name'] not in output["labels"]:
                print("Video Name: "+ file.key)
                output["videoname"].append(file.key)
                print("Label: " + label['Name'])
                output["labels"].append(label['Name'])
                print("Confidence: " + str(label['Confidence']))
                output["confidencescores"].append(label['Confidence'])
                print("Timestamp: " + str(labelDetection['Timestamp']))
                output["timestamps"].append(labelDetection['Timestamp'])


#Save results into a CSV file
#The code below fixes the issue of array lengths do not match issue 
results = pandas.DataFrame.from_dict(output, orient="index")
results = results.transpose()
results.to_csv('example3results.csv', index=True, index_label="Index")

print("done")


Matplotlib is building the font cache; this may take a moment.


s3.Bucket(name='bpbucket24')
videos/
job_id='5cbc8cff44bced30eff07bc205e81a19dff836553fc769619db49610d7e193eb'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='FAILED'
videos/Kim_1.mp4
job_id='3b489cdfa84d651598377c9649c01f7ad71f8de251026036cd4fdd28278c0137'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='SUCCEEDED'
Video Name: videos/Kim_1.mp4
Label: Male
Confidence: 90.29597473144531
Timestamp: 38700
Video Name: videos/Kim_1.mp4
Label: Man
Confidence: 90.08468627929688
Timestamp: 38700
Video Name: videos/Kim_1.mp4
Label: Person
Confidence: 96.89376831054688
Timestamp: 38700
Video Name: videos/Kim_1.mp4
Label: Adult
Confidence: 97.5428466796875
Timestamp: 39200
Video Name: videos/Kim_1.mp4
Label: Female
Confidence: 92.63230895996094
Timestamp: 39200
Video Name: videos/Kim_1.mp4
Label: Woman
Confidence: 92.63230895996094
Timestamp: 392

Part 2 - Text

In [2]:
import pandas
import boto3
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import io
import time

s3 = boto3.resource('s3')
bucket = s3.Bucket('bpbucket24')

print(bucket)

session = boto3.Session()
client = session.client('rekognition')

#I am creating an empty dictionary here
output = {"videoID":[], "text":[]}

for file in bucket.objects.filter(Prefix="videotext"):
    videoID_list = []
    text_list = []
    #Print image name for each video in the S3 bucket
    print(file.key)
    response = client.start_text_detection(
    Video={
        "S3Object": {
            "Bucket": "bpbucket24",
            "Name": file.key
        }})
    job_id = response["JobId"]
    print(f"{job_id=}")
    while True:
        response = client.get_text_detection(JobId=job_id)
        print(f"{response['JobStatus']=}")
        if response['JobStatus'] == 'IN_PROGRESS':
            print("time.sleep(10)")
            time.sleep(10)
        else:
            break

    next_token = None

    while True:
        if next_token is None:
            response = client.get_text_detection(JobId=job_id)
        else:
            response = client.get_text_detection(JobId=job_id, NextToken=next_token)
            next_token = response.get("NextToken")
        if next_token is None:
            break

    for textDetection in response['TextDetections']:
        text = textDetection['TextDetection']
        #print("Timestamp: " + str(textDetection['Timestamp']))
        if text['Confidence']> 90.00:
            #print("Text Detected: " + text['DetectedText'])
            text_list.append(text['DetectedText'])
            #output["text"].append(text['DetectedText'])
            #print("Video ID:", file.key)
            videoID_list.append(file.key)
            #output["videoID"].append(file.key)
            
    output["text"].append(text_list)
    output["videoID"].append(videoID_list)

#Save results into a CSV file
#The code below fixes the issue of array lengths do not match issue 
results = pandas.DataFrame.from_dict(output)
#Without this you will get a blank first row 
results = results.iloc[1:, :]
results.to_csv('textresult.csv', index=True, index_label="Index")
print("done")


s3.Bucket(name='bpbucket24')
videotext/
job_id='bd0d6e81e8f932012699bd29a0e94f60e5e0a1a075f044f84eec06a003d4635f'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='FAILED'
videotext/Kim_1.mp4
job_id='c04da9cfd1098348759ace4c8aaf578a19165e6710b3c18c2b2f777725d203e7'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='SUCCEEDED'
videotext/Kim_2.mp4
job_id='a55a77ff153e0c519fb876b99b910e0ba6a304aa25f8df6a8d50265b220dcf05'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='SUCCEEDED'
videotext/Kim_3.mp4
job_id='3f54bbf1f7cd0db6144eb12c029f50b49cfaf107ded99899188822146f90a457'
response['JobStatus']='IN_PROGRESS'
time.sleep

Part 3 - Celebrity 

In [3]:
import pandas
import boto3
import time

session = boto3.Session()
client = session.client('rekognition')

#I am creating an empty dictionary here
#output = {"videoID""celebrity":[], "furtherinfo":[]}

output = {"videoID":[], "celebrity":[], "furtherinfo":[]}

for file in bucket.objects.filter(Prefix="videotext"):
    
    videoID_list = []

    #I am creating a list her that will contain duplicate celebrity names extracted from the video
    celebritylist =[]

    #This is for the further information data
    furtherinfolist = []

    #Print image name for each video in the S3 bucket
    print(file.key)

    response = client.start_celebrity_recognition(
        Video={
            "S3Object": {
                "Bucket": "bpbucket24"
                , "Name": file.key
            }
        }
    )

    job_id = response["JobId"]
    print(f"{job_id=}")

    while True:
        response = client.get_celebrity_recognition(JobId=job_id)
        print(f"{response['JobStatus']=}")

        if response['JobStatus'] == 'IN_PROGRESS':
            print("time.sleep(10)")
            time.sleep(10)
        else:
            break

    next_token = None

    while True:
        if next_token is None:
            response = client.get_celebrity_recognition(JobId=job_id)
        else:
            response = client.get_celebrity_recognition(JobId=job_id, NextToken=next_token)
        next_token = response.get("NextToken")

        if next_token is None:
            break

    for celebrityRecognition in response['Celebrities']:
        #I used this print to find what data is inside the  response['Celebrities']. I am searching for an ID number
        #print('Celebrity: ' + str(celebrityRecognition['Celebrity']))
        #FYI there is a nested data structure here: celebrityRecognition['Celebrity']['Name']
        print('Celebrity: ' + str(celebrityRecognition['Celebrity']['Name']))
        print('Celebrity: ' + str(celebrityRecognition['Celebrity']['Id']))
        celebrityid = celebrityRecognition['Celebrity']['Id']
        response2 = client.get_celebrity_info(Id=celebrityid)
        print('Further information (if available):')
        for url in response2['Urls']:
            print(url)
            furtherinfolist.append(url)
        celebritylist.append(celebrityRecognition['Celebrity']['Name'])
        #Remove duplicate celebrity names in the list
        celebritylist = list(set(celebritylist))
        #Remove duplicates from the further info list
        furtherinfolist = list(set(furtherinfolist))
        print()

    output["celebrity"].append(celebritylist)
    output["furtherinfo"].append(furtherinfolist)

#Save results into a CSV file
results = pandas.DataFrame.from_dict(output)
results.to_csv('celebrity.csv', index=True, index_label="Index")
print("done")


videotext/
job_id='c3535c731ec58c2b71d0da4c75e5aec09eec76386e93c9a83c7e1c558cac6e43'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='FAILED'
videotext/Kim_1.mp4
job_id='7e07e77a4158e86df7130af8ecbfbfae768e12591beeb6a0a92631e42c6cbccd'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN

ValueError: All arrays must be of the same length

In [11]:
import pandas
import boto3
import time

session = boto3.Session()
client = session.client('rekognition')

#I am creating an empty dictionary here
#output = {"videoID""celebrity":[], "furtherinfo":[]}

output = {"videoID":[], "celebrity":[], "furtherinfo":[]}

for file in bucket.objects.filter(Prefix="videotext"):
    
    videoID_list = []

    #I am creating a list her that will contain duplicate celebrity names extracted from the video
    celebritylist =[]

    #This is for the further information data
    furtherinfolist = []

    #Print image name for each video in the S3 bucket
    print(file.key)

    response = client.start_celebrity_recognition(
        Video={
            "S3Object": {
                "Bucket": "bpbucket24"
                , "Name": file.key
            }
        }
    )

    job_id = response["JobId"]
    print(f"{job_id=}")

    while True:
        response = client.get_celebrity_recognition(JobId=job_id)
        print(f"{response['JobStatus']=}")

        if response['JobStatus'] == 'IN_PROGRESS':
            print("time.sleep(10)")
            time.sleep(10)
        else:
            break

    next_token = None

    while True:
        if next_token is None:
            response = client.get_celebrity_recognition(JobId=job_id)
        else:
            response = client.get_celebrity_recognition(JobId=job_id, NextToken=next_token)
        next_token = response.get("NextToken")

        if next_token is None:
            break

    for celebrityRecognition in response['Celebrities']:
        #I used this print to find what data is inside the  response['Celebrities']. I am searching for an ID number
        #print('Celebrity: ' + str(celebrityRecognition['Celebrity']))
        #FYI there is a nested data structure here: celebrityRecognition['Celebrity']['Name']
        print('Celebrity: ' + str(celebrityRecognition['Celebrity']['Name']))
        print('Celebrity: ' + str(celebrityRecognition['Celebrity']['Id']))
        celebrityid = celebrityRecognition['Celebrity']['Id']
        response2 = client.get_celebrity_info(Id=celebrityid)
        print('Further information (if available):')
        for url in response2['Urls']:
            print(url)
            furtherinfolist.append(url)
        celebritylist.append(celebrityRecognition['Celebrity']['Name'])
        #Remove duplicate celebrity names in the list
        celebritylist = list(set(celebritylist))
        #Remove duplicates from the further info list
        furtherinfolist = list(set(furtherinfolist))
        print()
        
    output["videoID"].append(videoID_list)
    output["celebrity"].append(celebritylist)
    output["furtherinfo"].append(furtherinfolist)

#Save results into a CSV file
results = pandas.DataFrame.from_dict(output)
results.to_csv('celebrity.csv', index=True, index_label="Index")
print("done")


videotext/
job_id='168bd23fb15cc6f2263436e3a1880ac9dbbd74431daa176ed6ee14eee820bdcf'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='FAILED'
videotext/Kim_1.mp4
job_id='87edb910a648d315bb5b595ca3436dfd2ec4d722fd8a40245365efecf2514ea4'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='SUCCEEDED'
Celebrity: Laura Cremaschi
Celebrity: d880ca
Further information (if available):
www.wikidata.org/wiki/Q65966463
www.imdb.com/name/nm6713058

Celebrity: Laura Cremaschi
Celebrity: d880ca
Further information (if available):
www.wikidata.org/wiki/Q

Part 8 - Explicit Review

In [4]:
import pandas
import boto3
import time

session = boto3.Session()
client = session.client('rekognition')

#I am creating an empty dictionary here
output = {"celebrity":[], "furtherinfo":[]}

videoID_list = []

#I am creating a list her that will contain duplicate celebrity names extracted from the video
celebritylist =[]

#This is for the further information data
furtherinfolist = []

response = client.start_content_moderation(
    Video={
        "S3Object": {
            "Bucket": "bpbucket24"
            , "Name": "Kim_1.mp4"
        }
    }
)

job_id = response["JobId"]
print(f"{job_id=}")

while True:
    response = client.get_content_moderation(JobId=job_id)
    print(f"{response['JobStatus']=}")
    if response['JobStatus'] == 'IN_PROGRESS':
        print("time.sleep(10)")
        time.sleep(10)
    else:
        break

next_token = None

while True:
    if next_token is None:
        response = client.get_content_moderation(JobId=job_id)
    else:
        response = client.get_content_moderation(JobId=job_id, NextToken=next_token)
    next_token = response.get("NextToken")
    if next_token is None:
        break

#print(response['ModerationLabels'])
if len(response['ModerationLabels']) == 0:
    print("No Problem Detected")
else:
    for contentModerationDetection in response['ModerationLabels']:
        print(contentModerationDetection)        
        print('Label: ' + str(contentModerationDetection['ModerationLabel']['Name']))
        print('Confidence: ' + str(contentModerationDetection['ModerationLabel']['Confidence']))
        print('Parent category: ' + str(contentModerationDetection['ModerationLabel']['ParentName']))
        print('Timestamp: ' + str(contentModerationDetection['Timestamp']))
        print()
print("done")


job_id='979a2e116426f285be8ccbfb85c36a62ed5efc55d86c67aa6d7a06179d580f5f'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='SUCCEEDED'
{'Timestamp': 0, 'ModerationLabel': {'Confidence': 92.21143341064453, 'Name': 'Explicit', 'ParentName': '', 'TaxonomyLevel': 1}, 'ContentTypes': []}
Label: Explicit
Confidence: 92.21143341064453
Parent category: 
Timestamp: 0

{'Timestamp': 0, 'ModerationLabel': {'Confidence': 92.21143341064453, 'Name': 'Explicit Nudity', 'ParentName': 'Explicit', 'TaxonomyLevel': 2}, 'ContentTypes': []}
Label: Explicit Nudity
Confidence: 92.21143341064453
Parent category: Explicit
Timestamp: 0

{'Timestamp': 0, 'ModerationLabel': {'Confidence': 85.16895294189453, 'Name': 'Exposed Female Genitalia', 'ParentName': 'Explicit Nudity', 'TaxonomyLevel': 3}, 'ContentTypes': []}
Label: Exposed Female Genitalia
Confidence: 85.16895294189453
Parent category: Explicit Nudity
Timestamp: 0

{'Timestamp': 0, 

In [5]:
import pandas
import boto3
import time

session = boto3.Session()
client = session.client('rekognition')

#I am creating an empty dictionary here
output = {"celebrity":[], "furtherinfo":[]}

videoID_list = []

#I am creating a list her that will contain duplicate celebrity names extracted from the video
celebritylist =[]

#This is for the further information data
furtherinfolist = []

response = client.start_content_moderation(
    Video={
        "S3Object": {
            "Bucket": "bpbucket24"
            , "Name": "Kim_2.mp4"
        }
    }
)

job_id = response["JobId"]
print(f"{job_id=}")

while True:
    response = client.get_content_moderation(JobId=job_id)
    print(f"{response['JobStatus']=}")
    if response['JobStatus'] == 'IN_PROGRESS':
        print("time.sleep(10)")
        time.sleep(10)
    else:
        break

next_token = None

while True:
    if next_token is None:
        response = client.get_content_moderation(JobId=job_id)
    else:
        response = client.get_content_moderation(JobId=job_id, NextToken=next_token)
    next_token = response.get("NextToken")
    if next_token is None:
        break

#print(response['ModerationLabels'])
if len(response['ModerationLabels']) == 0:
    print("No Problem Detected")
else:
    for contentModerationDetection in response['ModerationLabels']:
        print(contentModerationDetection)        
        print('Label: ' + str(contentModerationDetection['ModerationLabel']['Name']))
        print('Confidence: ' + str(contentModerationDetection['ModerationLabel']['Confidence']))
        print('Parent category: ' + str(contentModerationDetection['ModerationLabel']['ParentName']))
        print('Timestamp: ' + str(contentModerationDetection['Timestamp']))
        print()
print("done")


job_id='b6facfed7f0f67e25e455f9dc1774436db7406694bf340917e6f2edc59e4f3ec'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='SUCCEEDED'
{'Timestamp': 0, 'ModerationLabel': {'Confidence': 88.92416381835938, 'Name': 'Non-Explicit Nudity', 'ParentName': 'Non-Explicit Nudity of Intimate parts and Kissing', 'TaxonomyLevel': 2}, 'ContentTypes': []}
Label: Non-Explicit Nudity
Confidence: 88.92416381835938
Parent category: Non-Explicit Nudity of Intimate parts and Kissing
Timestamp: 0

{'Timestamp': 0, 'ModerationLabel': {'Confidence': 88.92416381835938, 'Name': 'Non-Explicit Nudity of Intimate parts and Kissing', 'ParentName': '', 'TaxonomyLevel': 1}, 'ContentTypes': []}
Label: Non-Explicit Nudity of Intimate parts and Kissing
Confidence: 88.92416381835938
Parent category: 
Timestamp: 0

{'Timestamp': 0, 'ModerationLabel': {'Confidence': 88.92416381835938, 'Name': 'Part

In [6]:
import pandas
import boto3
import time

session = boto3.Session()
client = session.client('rekognition')

#I am creating an empty dictionary here
output = {"celebrity":[], "furtherinfo":[]}

videoID_list = []

#I am creating a list her that will contain duplicate celebrity names extracted from the video
celebritylist =[]

#This is for the further information data
furtherinfolist = []

response = client.start_content_moderation(
    Video={
        "S3Object": {
            "Bucket": "bpbucket24"
            , "Name": "Kim_3.mp4"
        }
    }
)

job_id = response["JobId"]
print(f"{job_id=}")

while True:
    response = client.get_content_moderation(JobId=job_id)
    print(f"{response['JobStatus']=}")
    if response['JobStatus'] == 'IN_PROGRESS':
        print("time.sleep(10)")
        time.sleep(10)
    else:
        break

next_token = None

while True:
    if next_token is None:
        response = client.get_content_moderation(JobId=job_id)
    else:
        response = client.get_content_moderation(JobId=job_id, NextToken=next_token)
    next_token = response.get("NextToken")
    if next_token is None:
        break

#print(response['ModerationLabels'])
if len(response['ModerationLabels']) == 0:
    print("No Problem Detected")
else:
    for contentModerationDetection in response['ModerationLabels']:
        print(contentModerationDetection)        
        print('Label: ' + str(contentModerationDetection['ModerationLabel']['Name']))
        print('Confidence: ' + str(contentModerationDetection['ModerationLabel']['Confidence']))
        print('Parent category: ' + str(contentModerationDetection['ModerationLabel']['ParentName']))
        print('Timestamp: ' + str(contentModerationDetection['Timestamp']))
        print()
print("done")


job_id='7d9c58afc1bb01e3178a4218f8beb5eeabd8521e8543433b4c79a1686093640e'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='SUCCEEDED'
{'Timestamp': 8008, 'ModerationLabel': {'Confidence': 81.43177795410156, 'Name': 'Female Swimwear or Underwear', 'ParentName': 'Swimwear or Underwear', 'TaxonomyLevel': 2}, 'ContentTypes': []}
Label: Female Swimwear or Underwear
Confidence: 81.43177795410156
Parent category: Swimwear or Underwear
Timestamp: 8008

{'Timestamp': 8008, 'ModerationLabel': {'Confidence': 85.3780517578125, 'Name': 'Non-Explicit Nudity', 'ParentName': 'Non-Explicit Nudity of Intimate parts and Kissing', 'TaxonomyLevel': 2}, 'ContentTypes': []}
Label: Non-Explicit Nudity
Confidence: 85.3780517578125
Parent category: Non-Explicit Nudity of Intimate parts and Kissing
Timestamp: 8008

{'Timestamp': 8008, 'ModerationLabel': {'Confidence': 85.3780517578125, 'Name': 'Non-Explicit Nudity of Intimate parts and K

In [7]:
import pandas
import boto3
import time

session = boto3.Session()
client = session.client('rekognition')

#I am creating an empty dictionary here
output = {"celebrity":[], "furtherinfo":[]}

videoID_list = []

#I am creating a list her that will contain duplicate celebrity names extracted from the video
celebritylist =[]

#This is for the further information data
furtherinfolist = []

response = client.start_content_moderation(
    Video={
        "S3Object": {
            "Bucket": "bpbucket24"
            , "Name": "Won_1.mp4"
        }
    }
)

job_id = response["JobId"]
print(f"{job_id=}")

while True:
    response = client.get_content_moderation(JobId=job_id)
    print(f"{response['JobStatus']=}")
    if response['JobStatus'] == 'IN_PROGRESS':
        print("time.sleep(10)")
        time.sleep(10)
    else:
        break

next_token = None

while True:
    if next_token is None:
        response = client.get_content_moderation(JobId=job_id)
    else:
        response = client.get_content_moderation(JobId=job_id, NextToken=next_token)
    next_token = response.get("NextToken")
    if next_token is None:
        break

#print(response['ModerationLabels'])
if len(response['ModerationLabels']) == 0:
    print("No Problem Detected")
else:
    for contentModerationDetection in response['ModerationLabels']:
        print(contentModerationDetection)        
        print('Label: ' + str(contentModerationDetection['ModerationLabel']['Name']))
        print('Confidence: ' + str(contentModerationDetection['ModerationLabel']['Confidence']))
        print('Parent category: ' + str(contentModerationDetection['ModerationLabel']['ParentName']))
        print('Timestamp: ' + str(contentModerationDetection['Timestamp']))
        print()
print("done")


job_id='84d8ebe88a48a8601f7d174745f5b310df64d6afbabff1dae7656e52faa2c942'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='SUCCEEDED'
{'Timestamp': 516, 'ModerationLabel': {'Confidence': 62.72012710571289, 'Name': 'Explicit', 'ParentName': '', 'TaxonomyLevel': 1}, 'ContentTypes': []}
Label: Explicit
Confidence: 62.72012710571289
Parent category: 
Timestamp: 516

{'Timestamp': 516, 'ModerationLabel': {'Confidence': 62.72012710571289, 'Name': 'Explicit Nudity', 'ParentName': 'Explicit', 'TaxonomyLevel': 2}, 'ContentTypes': []}
Label: Explicit Nudity
Confidence: 62.72012710571289
Parent category: Explicit
Timestamp: 516

{'Timestamp': 516, 'ModerationLabel': {'Confidence': 62.72012710571289, 'Name': 'Exposed Female Nipple', 'ParentName': 'Explicit Nudity', 'TaxonomyLevel': 3}, 'ContentTypes': []}
Label: Exposed Female Nipple
Confidence: 62.72012710571289
Parent category: Explicit Nudity
Timestamp: 516

{'Timestamp

In [8]:
import pandas
import boto3
import time

session = boto3.Session()
client = session.client('rekognition')

#I am creating an empty dictionary here
output = {"celebrity":[], "furtherinfo":[]}

videoID_list = []

#I am creating a list her that will contain duplicate celebrity names extracted from the video
celebritylist =[]

#This is for the further information data
furtherinfolist = []

response = client.start_content_moderation(
    Video={
        "S3Object": {
            "Bucket": "bpbucket24"
            , "Name": "Won_2.mp4"
        }
    }
)

job_id = response["JobId"]
print(f"{job_id=}")

while True:
    response = client.get_content_moderation(JobId=job_id)
    print(f"{response['JobStatus']=}")
    if response['JobStatus'] == 'IN_PROGRESS':
        print("time.sleep(10)")
        time.sleep(10)
    else:
        break

next_token = None

while True:
    if next_token is None:
        response = client.get_content_moderation(JobId=job_id)
    else:
        response = client.get_content_moderation(JobId=job_id, NextToken=next_token)
    next_token = response.get("NextToken")
    if next_token is None:
        break

#print(response['ModerationLabels'])
if len(response['ModerationLabels']) == 0:
    print("No Problem Detected")
else:
    for contentModerationDetection in response['ModerationLabels']:
        print(contentModerationDetection)        
        print('Label: ' + str(contentModerationDetection['ModerationLabel']['Name']))
        print('Confidence: ' + str(contentModerationDetection['ModerationLabel']['Confidence']))
        print('Parent category: ' + str(contentModerationDetection['ModerationLabel']['ParentName']))
        print('Timestamp: ' + str(contentModerationDetection['Timestamp']))
        print()
print("done")


job_id='85be678e91a59d2e60f7ecab3b49392a65ff69e19d56deb502a34a9fb99998cc'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='SUCCEEDED'
No Problem Detected
done


In [9]:
import pandas
import boto3
import time

session = boto3.Session()
client = session.client('rekognition')

#I am creating an empty dictionary here
output = {"celebrity":[], "furtherinfo":[]}

videoID_list = []

#I am creating a list her that will contain duplicate celebrity names extracted from the video
celebritylist =[]

#This is for the further information data
furtherinfolist = []

response = client.start_content_moderation(
    Video={
        "S3Object": {
            "Bucket": "bpbucket24"
            , "Name": "Won_3.mp4"
        }
    }
)

job_id = response["JobId"]
print(f"{job_id=}")

while True:
    response = client.get_content_moderation(JobId=job_id)
    print(f"{response['JobStatus']=}")
    if response['JobStatus'] == 'IN_PROGRESS':
        print("time.sleep(10)")
        time.sleep(10)
    else:
        break

next_token = None

while True:
    if next_token is None:
        response = client.get_content_moderation(JobId=job_id)
    else:
        response = client.get_content_moderation(JobId=job_id, NextToken=next_token)
    next_token = response.get("NextToken")
    if next_token is None:
        break

#print(response['ModerationLabels'])
if len(response['ModerationLabels']) == 0:
    print("No Problem Detected")
else:
    for contentModerationDetection in response['ModerationLabels']:
        print(contentModerationDetection)        
        print('Label: ' + str(contentModerationDetection['ModerationLabel']['Name']))
        print('Confidence: ' + str(contentModerationDetection['ModerationLabel']['Confidence']))
        print('Parent category: ' + str(contentModerationDetection['ModerationLabel']['ParentName']))
        print('Timestamp: ' + str(contentModerationDetection['Timestamp']))
        print()
print("done")


job_id='d4410439b79a629092df04b392a3ea0ce8010d0b64294b2f8d086f8576a4a8ec'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='SUCCEEDED'
No Problem Detected
done


In [10]:
import pandas
import boto3
import time

session = boto3.Session()
client = session.client('rekognition')

#I am creating an empty dictionary here
output = {"celebrity":[], "furtherinfo":[]}

videoID_list = []

#I am creating a list her that will contain duplicate celebrity names extracted from the video
celebritylist =[]

#This is for the further information data
furtherinfolist = []

response = client.start_content_moderation(
    Video={
        "S3Object": {
            "Bucket": "bpbucket24"
            , "Name": "Won_4.mp4"
        }
    }
)

job_id = response["JobId"]
print(f"{job_id=}")

while True:
    response = client.get_content_moderation(JobId=job_id)
    print(f"{response['JobStatus']=}")
    if response['JobStatus'] == 'IN_PROGRESS':
        print("time.sleep(10)")
        time.sleep(10)
    else:
        break

next_token = None

while True:
    if next_token is None:
        response = client.get_content_moderation(JobId=job_id)
    else:
        response = client.get_content_moderation(JobId=job_id, NextToken=next_token)
    next_token = response.get("NextToken")
    if next_token is None:
        break

#print(response['ModerationLabels'])
if len(response['ModerationLabels']) == 0:
    print("No Problem Detected")
else:
    for contentModerationDetection in response['ModerationLabels']:
        print(contentModerationDetection)        
        print('Label: ' + str(contentModerationDetection['ModerationLabel']['Name']))
        print('Confidence: ' + str(contentModerationDetection['ModerationLabel']['Confidence']))
        print('Parent category: ' + str(contentModerationDetection['ModerationLabel']['ParentName']))
        print('Timestamp: ' + str(contentModerationDetection['Timestamp']))
        print()
print("done")


job_id='d9995b27883dacaa21ecbef3cb8daff1441bae1156e78b2ee4b696bd50ee34c8'
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='IN_PROGRESS'
time.sleep(10)
response['JobStatus']='SUCCEEDED'
{'Timestamp': 4500, 'ModerationLabel': {'Confidence': 85.9253921508789, 'Name': 'Exposed Male Nipple', 'ParentName': 'Non-Explicit Nudity', 'TaxonomyLevel': 3}, 'ContentTypes': []}
Label: Exposed Male Nipple
Confidence: 85.9253921508789
Parent category: Non-Explicit Nudity
Timestamp: 4500

{'Timestamp': 4500, 'ModerationLabel': {'Confidence': 85.9253921508789, 'Name': 'Non-Explicit Nudity', 'ParentName': 'Non-Explicit Nudity of Intimate parts and Kissing', 'TaxonomyLevel': 2}, 'ContentTypes': []}
Label: Non-Explicit Nudity
Confidence: 85.9253921508789
Parent category: Non-Explicit Nudity of Intimate parts and Kissing
Timestamp: 4500

{'Timestamp': 4500, 'ModerationLabel': {'Confidence': 85.92549896240234, 'Name': 'Non-Explicit Nudity of Intimate parts and Kissing', 'ParentName': 

# Analysis

## Question 1

Please see file "textresult"

## Question 2 

Please see files "labels"

## Question 3

#### Sampled video background
I sampled 3 videos from media influencer Kim Kardashian and 4 videos from K-pop star Wonho. In both of these samples, I chose some that included on-screen texts and suggestive content to review the appropriate algorithm. 



### Part 1 - Labels 

In [46]:
!pip install pandas 

import pandas as pd 





In [6]:
df = pd.read_csv('C:\\Users\\Brian\\Desktop\\Tiktok videos\\labels.csv')

df

Unnamed: 0,Index,videoname,labels,confidencescores,timestamps
0,0,videos/Kim_1.mp4,Male,90.295975,38700
1,1,videos/Kim_1.mp4,Man,90.084686,38700
2,2,videos/Kim_1.mp4,Person,96.893768,38700
3,3,videos/Kim_1.mp4,Adult,97.542847,39200
4,4,videos/Kim_1.mp4,Female,92.632309,39200
5,5,videos/Kim_1.mp4,Woman,92.632309,39200
6,6,videos/Kim_1.mp4,Body Part,90.241516,40200
7,7,videos/Kim_1.mp4,Torso,90.221718,40200
8,8,videos/Kim_1.mp4,Clothing,92.472626,41200
9,9,videos/Kim_2.mp4,Face,92.200043,33766


In [9]:
# Check if any common features among videos. If any common features, value_counts for any listed should be > 1 

df['labels'].value_counts()

Male                1
Home Decor          1
Baby                1
Fitness             1
Gym                 1
Gym Weights         1
Sport               1
Working Out         1
Bench Press         1
Coat                1
Cushion             1
Indoors             1
Man                 1
Interior Design     1
Shirt               1
Boy                 1
Romantic            1
Performer           1
Solo Performance    1
Happy               1
Laughing            1
Eating              1
Wedding             1
Hair                1
Blonde              1
Jewelry             1
Person              1
Adult               1
Female              1
Woman               1
Body Part           1
Torso               1
Clothing            1
Face                1
Head                1
Underwear           1
Swimwear            1
Lingerie            1
Bra                 1
Photography         1
Portrait            1
Nature              1
Outdoors            1
Accessories         1
Earring             1
Food      

#### Label Review 

For labels, the videos do not seem to have any common labels. According to AWS Rekognition, each video is very distinct. 

### Part 2 - Text 



In [23]:
df = pd.read_csv('C:\\Users\\Brian\\Desktop\\Tiktok videos\\textresult.csv')
df

Unnamed: 0,Index,videoID,text
0,1,"['videotext/Kim_1.mp4', 'videotext/Kim_1.mp4',...","['TikTok', '@kimkardashian', 'TikTok', '@kimka..."
1,2,"['videotext/Kim_2.mp4', 'videotext/Kim_2.mp4',...","['ل TikTok', '@kimkardashian', 'TikTok', '@kim..."
2,3,"['videotext/Kim_3.mp4', 'videotext/Kim_3.mp4',...","['ل', 'TikTok', '@kimkardashian', 'ل', 'TikTok..."
3,4,"['videotext/Won_1.mp4', 'videotext/Won_1.mp4',...","['()', 'TikTok', '@wonhosexy', '()', 'TikTok',..."
4,5,"['videotext/Won_2.mp4', 'videotext/Won_2.mp4',...","['Wonho demonstrating how', 'he fell between t..."
5,6,"['videotext/Won_3.mp4', 'videotext/Won_3.mp4',...","['ل TikTok', '@wonhosexy', 'On this day', '11/..."
6,7,"['videotext/Won_4.mp4', 'videotext/Won_4.mp4',...","['Me walking around with the', 'most unhinged ..."
7,8,"['videotext/Won_5.mp4', 'videotext/Won_5.mp4',...","['STAGE', 'WONHO', '1:32', 'ل TikTok', '@wonho..."


In [25]:
# Create sets to par down on duplicated texts within each video 
df['sets'] = df['text'].apply(lambda x: set(x.split()))
df

Unnamed: 0,Index,videoID,text,sets
0,1,"['videotext/Kim_1.mp4', 'videotext/Kim_1.mp4',...","['TikTok', '@kimkardashian', 'TikTok', '@kimka...","{['TikTok',, '@kimkardashian'], 'ل, TikTok',, ..."
1,2,"['videotext/Kim_2.mp4', 'videotext/Kim_2.mp4',...","['ل TikTok', '@kimkardashian', 'TikTok', '@kim...","{'@kimkardashian'], ['ل, 'ل, TikTok',, 'TikTok..."
2,3,"['videotext/Kim_3.mp4', 'videotext/Kim_3.mp4',...","['ل', 'TikTok', '@kimkardashian', 'ل', 'TikTok...","{'@kimkardashian'], 'LOTION',, 'SUNTA',, 'ل, '..."
3,4,"['videotext/Won_1.mp4', 'videotext/Won_1.mp4',...","['()', 'TikTok', '@wonhosexy', '()', 'TikTok',...","{'()',, '@wonhosexy'], 'ل, ['()',, 'TikTok',, ..."
4,5,"['videotext/Won_2.mp4', 'videotext/Won_2.mp4',...","['Wonho demonstrating how', 'he fell between t...","{'ل, 'Wonho, 'he, 'the',, ['Wonho, 'ل',, 'demo..."
5,6,"['videotext/Won_3.mp4', 'videotext/Won_3.mp4',...","['ل TikTok', '@wonhosexy', 'On this day', '11/...","{'this',, ['ل, 'On, this, day',, '11/16/2021',..."
6,7,"['videotext/Won_4.mp4', 'videotext/Won_4.mp4',...","['Me walking around with the', 'most unhinged ...","{'Me, 'walking',, unhinged, the',, 'around',, ..."
7,8,"['videotext/Won_5.mp4', 'videotext/Won_5.mp4',...","['STAGE', 'WONHO', '1:32', 'ل TikTok', '@wonho...","{'STAGE',, 'this',, '3/8/2022'], '1:32',, 'On,..."


In [26]:
# Compare every pair of rows and find common strings
for i in range(len(df)):
    for j in range(i + 1, len(df)):
        common_strings = df['sets'][i].intersection(df['sets'][j])
        if common_strings:
            print(f"Common strings between row {i} and row {j}: {common_strings}")

Common strings between row 0 and row 1: {"'@kimkardashian']", "'ل", "'TikTok',", "TikTok',", "'@kimkardashian',", "'ل',"}
Common strings between row 0 and row 2: {"'@kimkardashian']", "'ل", "'TikTok',", "TikTok',", "'@kimkardashian',", "'ل',"}
Common strings between row 0 and row 3: {"'ل", "TikTok',", "'TikTok',"}
Common strings between row 0 and row 4: {"'ل", "'ل',", "'TikTok',", "TikTok',"}
Common strings between row 0 and row 5: {"'ل", "'ل',", "'TikTok',", "TikTok',"}
Common strings between row 0 and row 6: {"'ل", "'TikTok',", "TikTok',"}
Common strings between row 0 and row 7: {"'ل", "'ل',", "'TikTok',", "TikTok',"}
Common strings between row 1 and row 2: {"'@kimkardashian']", "'ل", "'TikTok',", "TikTok',", "'@kimkardashian',", "'ل',"}
Common strings between row 1 and row 3: {"'ل", "TikTok',", "'TikTok',"}
Common strings between row 1 and row 4: {"'ل", "'ل',", "'TikTok',", "TikTok',"}
Common strings between row 1 and row 5: {"['ل", "'ل", "TikTok',", "'TikTok',", "'ل',"}
Common stri

#### Text Review

Based on AWS, it looks like the most appeared word is Tiktok, likely from the Tiktok logo tagged in each video. Besides, that the username of the poster the commonly appeared texts. 

### Part 3 - Celebrity

In [27]:
df = pd.read_csv('C:\\Users\\Brian\\Desktop\\Tiktok videos\\celebrity.csv')
df

Unnamed: 0,Index,videoID,celebrity,furtherinfo
0,0,[],[],[]
1,1,[],['Laura Cremaschi'],"['www.wikidata.org/wiki/Q65966463', 'www.imdb...."
2,2,[],['Huda Kattan'],['www.wikidata.org/wiki/Q42966319']
3,3,[],"['Judy Landers', 'Jourdan Dunn']","['www.imdb.com/name/nm5435984', 'www.wikidata...."
4,4,[],['Nie Yuan'],"['www.imdb.com/name/nm1983064', 'www.wikidata...."
5,5,[],['Qi Guangpu'],['www.wikidata.org/wiki/Q955639']
6,6,[],[],[]
7,7,[],[],[]
8,8,[],"['Jungkook', 'Bai Jinshi']","['www.wikidata.org/wiki/Q22338877', 'www.wikid..."


In [28]:
# Populate value_count to see if any celebrity appears more than once. If so, suggests commonality between videos 
df['celebrity'].value_counts()

[]                                  3
['Laura Cremaschi']                 1
['Huda Kattan']                     1
['Judy Landers', 'Jourdan Dunn']    1
['Nie Yuan']                        1
['Qi Guangpu']                      1
['Jungkook', 'Bai Jinshi']          1
Name: celebrity, dtype: int64

#### Celebrity Review

From the value_counts() report, '[]' seems to have 3 instances. This suggests that AWS was not able to detect any celebrities in those video. However, in manually reviewing the outputs, each of the celebrity suggested by Rekognition are incorrect. This is because the sampled videos are from Kim Kardashian and Wonho (see usertags from Part 2 - text Review). It not only misses the correct celebrity but also describes a different celebrity. 

### Part 4 - Explicit Content detection 

Below is the summarized result of the explicit content review referencing the output results from the code used earlier. If Tag has N/A, this suggests that explicit tag are available but the confidence failed to reach at least 90% threshold. If no problems were detected, 'No Problem Detected' is listed. 

| Video  | Tags                  | Confidence above 90% |
|--------|-----------------------|-----------------------|
| Kim_1  | Explicit               | 92.21                 |
| Kim_1  | Explicit Nudity        | 92.21                 |
| Kim_2  | N/A                    | N/A                   |
| Kim_3  | N/A                    | N/A                   |
| Won_1  | N/A                    | N/A                   |
| Won_2  | No Problem Detected    | N/A                   |
| Won_3  | No Problem Detected    | N/A                   |
| Won_4  | N/A                    | N/A                   |


#### Explicit content review

Based on the algorithm, all the videos except the first one are moderate to mild explicity. 


### Question 3 Analysis

Based on the findings above, the videos between the two content creators are very different. This seems to suggest and reflect the different target audience between the two celebrities. 

| Algorithm       | Similarities                          | Differences                                                   |
|------------------|---------------------------------------|--------------------------------------------------------------|
| Labels           | N/A                                   | Content very different. Kim has labels such as underwear, lingerie, outdoor, jewelry, blonde, hair. Wonho has labels such as Indoor, Performer, Happy, Laughing. |
| Text             | Tiktok logo                           | Username                                                     |
| Celebrity        | Algorithm could not produce accurate result | Different celebrities generated                               |
| Explicit Review   | Mildly suggestive content             | Kim has one video marked explicit                                       |



## Question 4



In [66]:
df = pd.read_csv('C:\\Users\\Brian\\Desktop\\Tiktok videos\\Tiktok_numbers.csv')

df

Unnamed: 0,Video,Follower_Count,Likes,Comment_Count,Bookmarks,Comments
0,Kim_1,9700000,129900,1261,5404,"""That’s how I dress for the gym too"",""Is this ..."
1,Kim_2,9700000,119700,2300,8241,"""Salma Hayek Nailed it"",""beautiful salma"",""Omh..."
2,Kim_3,9700000,87000,668,3104,"""I am a really a big fan of yours Kim Kardashi..."
3,Won_1,55400,10600,83,1632,"""My name is Regina"",""Beautiful 🔥🔥🔥🥰🥰🥰🥰😍😍😍😍"",""e..."
4,Won_2,55400,3585,31,599,"""HE KNOWS WHAT HES DOING AND I APPRECIATE IT"",..."
5,Won_3,55400,791,20,172,"""this felt illegal to watch n for him to post ..."
6,Won_4,55400,123500,660,11300,"""CROP"",""When I tell you my jaw DROPPED"",""I hav..."
7,Won_5,55400,1377,18,209,"""Can I have that job🤭"",""how many people applie..."


In [67]:
# To make the comparison fair, we will adjust for follower count by dividing the likes, comments, and bookmarks. 

df['Likes '] = df['Likes ']/df['Follower_Count']
df['Comment_Count'] = df['Comment_Count']/df['Follower_Count']
df['Bookmarks'] = df['Bookmarks']/df['Follower_Count']

df

Unnamed: 0,Video,Follower_Count,Likes,Comment_Count,Bookmarks,Comments
0,Kim_1,9700000,0.013392,0.00013,0.000557,"""That’s how I dress for the gym too"",""Is this ..."
1,Kim_2,9700000,0.01234,0.000237,0.00085,"""Salma Hayek Nailed it"",""beautiful salma"",""Omh..."
2,Kim_3,9700000,0.008969,6.9e-05,0.00032,"""I am a really a big fan of yours Kim Kardashi..."
3,Won_1,55400,0.191336,0.001498,0.029458,"""My name is Regina"",""Beautiful 🔥🔥🔥🥰🥰🥰🥰😍😍😍😍"",""e..."
4,Won_2,55400,0.064711,0.00056,0.010812,"""HE KNOWS WHAT HES DOING AND I APPRECIATE IT"",..."
5,Won_3,55400,0.014278,0.000361,0.003105,"""this felt illegal to watch n for him to post ..."
6,Won_4,55400,2.229242,0.011913,0.203971,"""CROP"",""When I tell you my jaw DROPPED"",""I hav..."
7,Won_5,55400,0.024856,0.000325,0.003773,"""Can I have that job🤭"",""how many people applie..."


In [68]:
label_df = pd.read_csv('C:\\Users\\Brian\\Desktop\\Tiktok videos\\labels.csv')
label_df.head(5)

Unnamed: 0,Index,videoname,labels,confidencescores,timestamps
0,0,videos/Kim_1.mp4,Male,90.295975,38700
1,1,videos/Kim_1.mp4,Man,90.084686,38700
2,2,videos/Kim_1.mp4,Person,96.893768,38700
3,3,videos/Kim_1.mp4,Adult,97.542847,39200
4,4,videos/Kim_1.mp4,Female,92.632309,39200


In [69]:
# Isolate name of each video 
label_df['Video'] = label_df['videoname'].str.extract(r'/(.*)\.mp4')
label_df.head(5)

Unnamed: 0,Index,videoname,labels,confidencescores,timestamps,Video
0,0,videos/Kim_1.mp4,Male,90.295975,38700,Kim_1
1,1,videos/Kim_1.mp4,Man,90.084686,38700,Kim_1
2,2,videos/Kim_1.mp4,Person,96.893768,38700,Kim_1
3,3,videos/Kim_1.mp4,Adult,97.542847,39200,Kim_1
4,4,videos/Kim_1.mp4,Female,92.632309,39200,Kim_1


In [56]:
# Aggregate labels

df_combined_label = label_df.groupby('Video').agg({
    'videoname': 'first',
    'labels': ', '.join,
    'confidencescores': lambda x: ', '.join(x.astype(str)),
    'timestamps': lambda x: ', '.join(x.astype(str))
}).reset_index()

df_combined_label.head(5)

Unnamed: 0,Video,videoname,labels,confidencescores,timestamps
0,Kim_1,videos/Kim_1.mp4,"Male, Man, Person, Adult, Female, Woman, Body ...","90.29597473144533, 90.08468627929688, 96.89376...","38700, 38700, 38700, 39200, 39200, 39200, 4020..."
1,Kim_2,videos/Kim_2.mp4,"Face, Head, Underwear, Swimwear, Lingerie, Bra...","92.20004272460938, 92.6380386352539, 92.070762...","33766, 33766, 35766, 36266, 36766, 37266, 3826..."
2,Kim_3,videos/Kim_3.mp4,"Accessories, Earring, Jewelry, Blonde, Hair, W...","97.35279083251952, 97.29650115966795, 97.29650...","21521, 21521, 21521, 23523, 23523, 23523"
3,Won_1,videos/Won_1.mp4,"Baby, Fitness, Gym, Gym Weights, Sport, Workin...","90.79158020019533, 90.12174224853516, 90.12174...","2533, 4516, 4516, 4516, 4516, 4516, 5516"
4,Won_2,videos/Won_2.mp4,"Coat, Cushion, Home Decor, Indoors, Interior D...","90.61336517333984, 91.1993179321289, 91.516601...","59933, 59933, 59933, 60433, 60433, 60933"


In [70]:
print(df.columns)
print(df_combined_label.columns)


Index(['Video ', 'Follower_Count', 'Likes ', 'Comment_Count', 'Bookmarks',
       'Comments'],
      dtype='object')
Index(['Video', 'videoname', 'labels', 'confidencescores', 'timestamps'], dtype='object')


In [71]:
df.rename(columns={'Video ':'Video'},inplace=True)
df.columns

Index(['Video', 'Follower_Count', 'Likes ', 'Comment_Count', 'Bookmarks',
       'Comments'],
      dtype='object')

In [72]:
# Merging dataframes between the label analysis and Tiktok data 

main_df = pd.merge(df, df_combined_label, on=['Video'], how='inner')
main_df

Unnamed: 0,Video,Follower_Count,Likes,Comment_Count,Bookmarks,Comments,videoname,labels,confidencescores,timestamps
0,Kim_1,9700000,0.013392,0.00013,0.000557,"""That’s how I dress for the gym too"",""Is this ...",videos/Kim_1.mp4,"Male, Man, Person, Adult, Female, Woman, Body ...","90.29597473144533, 90.08468627929688, 96.89376...","38700, 38700, 38700, 39200, 39200, 39200, 4020..."
1,Kim_2,9700000,0.01234,0.000237,0.00085,"""Salma Hayek Nailed it"",""beautiful salma"",""Omh...",videos/Kim_2.mp4,"Face, Head, Underwear, Swimwear, Lingerie, Bra...","92.20004272460938, 92.6380386352539, 92.070762...","33766, 33766, 35766, 36266, 36766, 37266, 3826..."
2,Kim_3,9700000,0.008969,6.9e-05,0.00032,"""I am a really a big fan of yours Kim Kardashi...",videos/Kim_3.mp4,"Accessories, Earring, Jewelry, Blonde, Hair, W...","97.35279083251952, 97.29650115966795, 97.29650...","21521, 21521, 21521, 23523, 23523, 23523"
3,Won_1,55400,0.191336,0.001498,0.029458,"""My name is Regina"",""Beautiful 🔥🔥🔥🥰🥰🥰🥰😍😍😍😍"",""e...",videos/Won_1.mp4,"Baby, Fitness, Gym, Gym Weights, Sport, Workin...","90.79158020019533, 90.12174224853516, 90.12174...","2533, 4516, 4516, 4516, 4516, 4516, 5516"
4,Won_2,55400,0.064711,0.00056,0.010812,"""HE KNOWS WHAT HES DOING AND I APPRECIATE IT"",...",videos/Won_2.mp4,"Coat, Cushion, Home Decor, Indoors, Interior D...","90.61336517333984, 91.1993179321289, 91.516601...","59933, 59933, 59933, 60433, 60433, 60933"
5,Won_3,55400,0.014278,0.000361,0.003105,"""this felt illegal to watch n for him to post ...",videos/Won_3.mp4,"Boy, Romantic","90.18084716796876, 90.68704223632812","2033, 12500"
6,Won_4,55400,2.229242,0.011913,0.203971,"""CROP"",""When I tell you my jaw DROPPED"",""I hav...",videos/Won_4.mp4,"Performer, Solo Performance","99.99828338623048, 99.99828338623048","0, 0"
7,Won_5,55400,0.024856,0.000325,0.003773,"""Can I have that job🤭"",""how many people applie...",videos/Won_5.mp4,"Happy, Laughing, Eating, Food","95.04277801513672, 94.8965072631836, 91.308601...","520, 520, 5520, 5520"


In [76]:
# Sort table with most likes at the top and least likes at the bottom

main_df = main_df.sort_values(by='Likes ', ascending=False)  # Sort in descending order for highest Likes first
main_df

Unnamed: 0,Video,Follower_Count,Likes,Comment_Count,Bookmarks,Comments,videoname,labels,confidencescores,timestamps
6,Won_4,55400,2.229242,0.011913,0.203971,"""CROP"",""When I tell you my jaw DROPPED"",""I hav...",videos/Won_4.mp4,"Performer, Solo Performance","99.99828338623048, 99.99828338623048","0, 0"
3,Won_1,55400,0.191336,0.001498,0.029458,"""My name is Regina"",""Beautiful 🔥🔥🔥🥰🥰🥰🥰😍😍😍😍"",""e...",videos/Won_1.mp4,"Baby, Fitness, Gym, Gym Weights, Sport, Workin...","90.79158020019533, 90.12174224853516, 90.12174...","2533, 4516, 4516, 4516, 4516, 4516, 5516"
4,Won_2,55400,0.064711,0.00056,0.010812,"""HE KNOWS WHAT HES DOING AND I APPRECIATE IT"",...",videos/Won_2.mp4,"Coat, Cushion, Home Decor, Indoors, Interior D...","90.61336517333984, 91.1993179321289, 91.516601...","59933, 59933, 59933, 60433, 60433, 60933"
7,Won_5,55400,0.024856,0.000325,0.003773,"""Can I have that job🤭"",""how many people applie...",videos/Won_5.mp4,"Happy, Laughing, Eating, Food","95.04277801513672, 94.8965072631836, 91.308601...","520, 520, 5520, 5520"
5,Won_3,55400,0.014278,0.000361,0.003105,"""this felt illegal to watch n for him to post ...",videos/Won_3.mp4,"Boy, Romantic","90.18084716796876, 90.68704223632812","2033, 12500"
0,Kim_1,9700000,0.013392,0.00013,0.000557,"""That’s how I dress for the gym too"",""Is this ...",videos/Kim_1.mp4,"Male, Man, Person, Adult, Female, Woman, Body ...","90.29597473144533, 90.08468627929688, 96.89376...","38700, 38700, 38700, 39200, 39200, 39200, 4020..."
1,Kim_2,9700000,0.01234,0.000237,0.00085,"""Salma Hayek Nailed it"",""beautiful salma"",""Omh...",videos/Kim_2.mp4,"Face, Head, Underwear, Swimwear, Lingerie, Bra...","92.20004272460938, 92.6380386352539, 92.070762...","33766, 33766, 35766, 36266, 36766, 37266, 3826..."
2,Kim_3,9700000,0.008969,6.9e-05,0.00032,"""I am a really a big fan of yours Kim Kardashi...",videos/Kim_3.mp4,"Accessories, Earring, Jewelry, Blonde, Hair, W...","97.35279083251952, 97.29650115966795, 97.29650...","21521, 21521, 21521, 23523, 23523, 23523"


#### Analysis of Likes

Based on the most number of likes, the top video with an average of 2.2 likes per follower seems to be about a performance as suggested by the labels. Contrast to this, the video with least amount of likes (0.009) seems to be about showing off fashion or jewelry with labels such as accessories, earrings, jewelry, blonde, hair. 

This implies that performance's wow factor impresses a lot of people, as opposed to showing off a personal fashion statement. 

In [78]:
# Sort table with most comments at the top and least likes at the bottom

main_df = main_df.sort_values(by='Comment_Count', ascending=False)  # Sort in descending order for highest Likes first
main_df

Unnamed: 0,Video,Follower_Count,Likes,Comment_Count,Bookmarks,Comments,videoname,labels,confidencescores,timestamps
6,Won_4,55400,2.229242,0.011913,0.203971,"""CROP"",""When I tell you my jaw DROPPED"",""I hav...",videos/Won_4.mp4,"Performer, Solo Performance","99.99828338623048, 99.99828338623048","0, 0"
3,Won_1,55400,0.191336,0.001498,0.029458,"""My name is Regina"",""Beautiful 🔥🔥🔥🥰🥰🥰🥰😍😍😍😍"",""e...",videos/Won_1.mp4,"Baby, Fitness, Gym, Gym Weights, Sport, Workin...","90.79158020019533, 90.12174224853516, 90.12174...","2533, 4516, 4516, 4516, 4516, 4516, 5516"
4,Won_2,55400,0.064711,0.00056,0.010812,"""HE KNOWS WHAT HES DOING AND I APPRECIATE IT"",...",videos/Won_2.mp4,"Coat, Cushion, Home Decor, Indoors, Interior D...","90.61336517333984, 91.1993179321289, 91.516601...","59933, 59933, 59933, 60433, 60433, 60933"
5,Won_3,55400,0.014278,0.000361,0.003105,"""this felt illegal to watch n for him to post ...",videos/Won_3.mp4,"Boy, Romantic","90.18084716796876, 90.68704223632812","2033, 12500"
7,Won_5,55400,0.024856,0.000325,0.003773,"""Can I have that job🤭"",""how many people applie...",videos/Won_5.mp4,"Happy, Laughing, Eating, Food","95.04277801513672, 94.8965072631836, 91.308601...","520, 520, 5520, 5520"
1,Kim_2,9700000,0.01234,0.000237,0.00085,"""Salma Hayek Nailed it"",""beautiful salma"",""Omh...",videos/Kim_2.mp4,"Face, Head, Underwear, Swimwear, Lingerie, Bra...","92.20004272460938, 92.6380386352539, 92.070762...","33766, 33766, 35766, 36266, 36766, 37266, 3826..."
0,Kim_1,9700000,0.013392,0.00013,0.000557,"""That’s how I dress for the gym too"",""Is this ...",videos/Kim_1.mp4,"Male, Man, Person, Adult, Female, Woman, Body ...","90.29597473144533, 90.08468627929688, 96.89376...","38700, 38700, 38700, 39200, 39200, 39200, 4020..."
2,Kim_3,9700000,0.008969,6.9e-05,0.00032,"""I am a really a big fan of yours Kim Kardashi...",videos/Kim_3.mp4,"Accessories, Earring, Jewelry, Blonde, Hair, W...","97.35279083251952, 97.29650115966795, 97.29650...","21521, 21521, 21521, 23523, 23523, 23523"


In [97]:
# Remove truncate effect of jupyter
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', None)     # Show all rows


print("\n Most engagement comments \n")

comments_series = main_df.iloc[0,5:6].values[0]
comments_list = comments_series.strip('"').split('","')
comments_list = [comment.strip('"') for comment in comments_list]

for comments in comments_list:
    print(comments)
    
    
    
print("\n Least engagement comments \n")

comments_series = main_df.iloc[7,5:6].values[0]
comments_list = comments_series.strip('"').split('","')
comments_list = [comment.strip('"') for comment in comments_list]

for comments in comments_list:
    print(comments)


 Most engagement comments 

CROP
When I tell you my jaw DROPPED
I have the one where he just wears a towel wrapped around his waist.
He's so beautiful 😭🫶🏾
Aww miss his ig posts 😩 military service killing us all

 Least engagement comments 

I am a really a big fan of yours Kim Kardashian yep
📞 still in 2022? 👀🎾🕊
KAREN WHEELER KIM ?????
Giving Kim in the 80s vibes 😍🤍
slayed the 80s vibes


#### Analysis of Comments
The reel with the most comments and the least are also the same for likes. Top video has 0.011913 comments per follower, while worst video has 0.000069	per follower. The video with most engagement, as expected, has comments about the performance regarding the wow factor using words like "jaw dropped" and a series of other fan sentiments. With the least engaging video, most comments involve guessing the style with words like "2022 and 80's vibe." 

Comments like these may suggest that a more dynamic video is likely to prompt more engagement from users. 

In [98]:
# Sort table with most Bookmarks at the top and least likes at the bottom

main_df = main_df.sort_values(by='Bookmarks', ascending=False)  # Sort in descending order for highest Likes first
main_df

Unnamed: 0,Video,Follower_Count,Likes,Comment_Count,Bookmarks,Comments,videoname,labels,confidencescores,timestamps
6,Won_4,55400,2.229242,0.011913,0.203971,"""CROP"",""When I tell you my jaw DROPPED"",""I have the one where he just wears a towel wrapped around his waist."",""He's so beautiful 😭🫶🏾"",""Aww miss his ig posts 😩 military service killing us all""",videos/Won_4.mp4,"Performer, Solo Performance","99.99828338623048, 99.99828338623048","0, 0"
3,Won_1,55400,0.191336,0.001498,0.029458,"""My name is Regina"",""Beautiful 🔥🔥🔥🥰🥰🥰🥰😍😍😍😍"",""either watch again after so many times or stare big titty chest probably both"",""@Larry-Shit OMGGGG"",""I have... I- I have THOUGHTS""",videos/Won_1.mp4,"Baby, Fitness, Gym, Gym Weights, Sport, Working Out, Bench Press","90.79158020019533, 90.12174224853516, 90.12174224853516, 90.12174224853516, 90.12174224853516, 90.12174224853516, 93.14005279541016","2533, 4516, 4516, 4516, 4516, 4516, 5516"
4,Won_2,55400,0.064711,0.00056,0.010812,"""HE KNOWS WHAT HES DOING AND I APPRECIATE IT"",""I am but a humble watermelon to be crushed"",""I feel I should be paying for this 😳"",""Yes. We’re all paying attention to the demonstration…"",""QUADS 🤯""",videos/Won_2.mp4,"Coat, Cushion, Home Decor, Indoors, Interior Design, Shirt","90.61336517333984, 91.1993179321289, 91.5166015625, 97.29479217529295, 97.29479217529295, 93.06817626953124","59933, 59933, 59933, 60433, 60433, 60933"
7,Won_5,55400,0.024856,0.000325,0.003773,"""Can I have that job🤭"",""how many people applied for that job"",""Ooo my"",""I miss it! 😀"",""El trabajo que necesito 😭""",videos/Won_5.mp4,"Happy, Laughing, Eating, Food","95.04277801513672, 94.8965072631836, 91.30860137939452, 91.30860137939452","520, 520, 5520, 5520"
5,Won_3,55400,0.014278,0.000361,0.003105,"""this felt illegal to watch n for him to post that n went to jail happily after watching this 😌"",""I was literally ab to post this 🙏😭"",""🥵🥵🥵🥵🥵🥵"",""💋💋💋💋💋💋🥰🥰💘"",""The reason was to distress us""",videos/Won_3.mp4,"Boy, Romantic","90.18084716796876, 90.68704223632812","2033, 12500"
1,Kim_2,9700000,0.01234,0.000237,0.00085,"""Salma Hayek Nailed it"",""beautiful salma"",""Omh"",""Yesssss"",""wow""",videos/Kim_2.mp4,"Face, Head, Underwear, Swimwear, Lingerie, Bra, Photography, Portrait, Nature, Outdoors","92.20004272460938, 92.6380386352539, 92.07076263427734, 90.20240783691406, 92.28104400634766, 94.37992095947266, 90.3376235961914, 90.3376235961914, 95.35426330566406, 96.16068267822266","33766, 33766, 35766, 36266, 36766, 37266, 38266, 38266, 48266, 48266"
0,Kim_1,9700000,0.013392,0.00013,0.000557,"""That’s how I dress for the gym too"",""Is this your Britney Spears era"",""I wore that same thing to the gym yesterday"",""This a breakup revenge video what did he do!!???"",""She’s an energy vampire she fed off of Pete and is becoming more powerful and beautiful""",videos/Kim_1.mp4,"Male, Man, Person, Adult, Female, Woman, Body Part, Torso, Clothing","90.29597473144533, 90.08468627929688, 96.89376831054688, 97.5428466796875, 92.63230895996094, 92.63230895996094, 90.24151611328124, 90.22171783447266, 92.47262573242188","38700, 38700, 38700, 39200, 39200, 39200, 40200, 40200, 41200"
2,Kim_3,9700000,0.008969,6.9e-05,0.00032,"""I am a really a big fan of yours Kim Kardashian yep"",""📞 still in 2022? 👀🎾🕊"",""KAREN WHEELER KIM ?????"",""Giving Kim in the 80s vibes 😍🤍"",""slayed the 80s vibes""",videos/Kim_3.mp4,"Accessories, Earring, Jewelry, Blonde, Hair, Wedding","97.35279083251952, 97.29650115966795, 97.29650115966795, 97.02468872070312, 97.02468872070312, 93.59627532958984","21521, 21521, 21521, 23523, 23523, 23523"


#### Analysis of bookmarks

As demonstrated with the other engagement performance indicators, bookmarks are highest for the video that has a performance and lowest for the fashion statement. 

This may suggest that the performance video has more re-watch value and merits a bookmark to return to it. In contrast, fashion videos are more of a passing interest and viewers do not find so much value in keeping it. 

### Conclusion

Based on the tags pulled from AWS Sagemaker and Tiktok metrics, videos involving performances have a wow factor that draws more engagements and has re-watch values for users. This is for when the data is controlled for other factors like reach, where Kim Kardashian has a larger audience than Wonho. However, when reach is not controlled, Kim will have more actual engagement than Wonho. 

This might suggest that there is a larger market for media influencers like Kim Kardashian, who has built her career on social media and press. Whereas Wonho is a celebrity for a niche music market, which may have less of a draw because of its specialized market. 

## Question 5

Results from Explicit Content review

| Video  | Tags                  | Confidence above 90% |
|--------|-----------------------|-----------------------|
| Kim_1  | Explicit               | 92.21                 |
| Kim_1  | Explicit Nudity        | 92.21                 |
| Kim_2  | N/A                    | N/A                   |
| Kim_3  | N/A                    | N/A                   |
| Won_1  | N/A                    | N/A                   |
| Won_2  | No Problem Detected    | N/A                   |
| Won_3  | No Problem Detected    | N/A                   |
| Won_4  | N/A                    | N/A                   |


In [107]:
# Extracting comments from Kim_1 (Explicit video)
main_df.loc[[0], :]


Unnamed: 0,Video,Follower_Count,Likes,Comment_Count,Bookmarks,Comments,videoname,labels,confidencescores,timestamps
0,Kim_1,9700000,0.013392,0.00013,0.000557,"""That’s how I dress for the gym too"",""Is this your Britney Spears era"",""I wore that same thing to the gym yesterday"",""This a breakup revenge video what did he do!!???"",""She’s an energy vampire she fed off of Pete and is becoming more powerful and beautiful""",videos/Kim_1.mp4,"Male, Man, Person, Adult, Female, Woman, Body Part, Torso, Clothing","90.29597473144533, 90.08468627929688, 96.89376831054688, 97.5428466796875, 92.63230895996094, 92.63230895996094, 90.24151611328124, 90.22171783447266, 92.47262573242188","38700, 38700, 38700, 39200, 39200, 39200, 40200, 40200, 41200"


In [119]:
print("\n Explicit flagged comments \n")

comments_series = main_df.iloc[6,5:6].values[0]
comments_list = comments_series.strip('"').split('","')
comments_list = [comment.strip('"') for comment in comments_list]

for comments in comments_list:
    print(comments)
    
print("\n Explicit tags \n")

labels_series = main_df.iloc[6,7:8].values[0]
labels_list = labels_series.strip('"').split('","')
labels_list = [tag.strip('"') for tag in labels_list]

for tag in labels_list:
    print(tag)
    


 Explicit flagged comments 

That’s how I dress for the gym too
Is this your Britney Spears era
I wore that same thing to the gym yesterday
This a breakup revenge video what did he do!!???
She’s an energy vampire she fed off of Pete and is becoming more powerful and beautiful

 Explicit tags 

Male, Man, Person, Adult, Female, Woman, Body Part, Torso, Clothing


### Review

#### Analysis of flagged content
Only one video of the eight were tagged as explicit, with the algorithm finding confidence levels above 90% for labels like "Male, Man, Female, Woman, Body Part, Torso, Clothing." This seems to be very suggestive. Looking at a few of the comments, it seems to rather suggest that this might be more related to gym with comments like "how I dress for gym, wore that same thing to the gym" and words like "breakup revenge" and "powerful and beautiful". The latter two aligning with the common sentiment of a glow-up, of which gyms and exercise are essential. 

Based on these comments and the various tags, this seems to be more of a fitness video with possibly minimal clothing. While it is not necessarily nudity, the algorithm has flagged it as so with 92% confidence. Since this video is still uploaded and public on TikTok, it seems that TikTok is fair in its judgment of explicit content and not too restrictive. This seems to be a false positive, meaning that the algorithm has passed it for nudity when it isn't. 

Below, I decided to take it one step further and check for false negatives. Parsing through the comments, I decided to pick one video that might seem to have a suggestive draw. 


In [121]:
#### Review of Video Won_1 (potential suggestive draw)

print("\n Not-flagged comments \n")

comments_series = main_df.iloc[1,5:6].values[0]
comments_list = comments_series.strip('"').split('","')
comments_list = [comment.strip('"') for comment in comments_list]

for comments in comments_list:
    print(comments)
    
print("\n Not-flagged tags \n")

labels_series = main_df.iloc[1,7:8].values[0]
labels_list = labels_series.strip('"').split('","')
labels_list = [tag.strip('"') for tag in labels_list]

for tag in labels_list:
    print(tag)
    


 Not-flagged comments 

My name is Regina
Beautiful 🔥🔥🔥🥰🥰🥰🥰😍😍😍😍
either watch again after so many times or stare big titty chest probably both
@Larry-Shit OMGGGG
I have... I- I have THOUGHTS

 Not-flagged tags 

Baby, Fitness, Gym, Gym Weights, Sport, Working Out, Bench Press


#### Analysis of potential false negative 

In this video, we get comments like "🔥🔥🔥", "stare big titty chest", "OMGGGG" and "I- I have THOUGHTS." These are highly reactive comments and seems to suggest inappopriate or suggestive content. However, looking at the tags from Sagemaker, it looks like the video has various tags related to gym such as "Fitness, Gym, Gym Weights, Sport, Working Out, Bench Press" with confidence scores in the 90% range. This implies that, while it might have elicited a strong reaction from the fans and display some evocative images, it is not an inappropriate video and does not have any explicit contents. As such, it is a true negative. 

As a general conclusion, Tik Tok is doing a good job with its moderation of explicit content and based on the videos sampled, does not have any false negatives or false positives. 

## Question 6

Kim Kardashian, who has a big social media presence already, could take further steps to improve her engagement. While she is doing well by raw metrics like aggregate likes, views, or comments, it is much lower if adjusted for following size. Compared to her followings, she has a relatively low engagement. To improve her engagements, she may upload more active or entertaining videos. This could be like a makeup tutorial or a review of a luxury brand. Videos like those have more re-watch value because users can mimic makeup or research the luxury brand, for instance. As such, more users will bookmark and engage. They may comment questions like "What is that brand" or press like to get more quality shorts like that. 

Wonho, having a smaller social media presence, could focus on increasing his media presence. While he has very good engagments already with his current followers, he can further improve by having more collaboration with other celebrities active on social media. This will be a good way to expand reach since the algorithm will introduce him to a new group of audience that previously do not know of him. 

# Question 7
See powerpoint. 