# The first step is preprocessing. We use the [SSBD Dataset](https://rolandgoecke.net/research/datasets/ssbd/) in order to get the data for hand flapping and spinning. 

## The dataset does contain 75 URLs (although we'll ignore all headbanging videos) to youtube videos. All the data is nicely stored in XML files that I will read to get the youtube videos and also the time stamps of when the behavior (hand flapping or spinning) occurs. Then I will use pytube to download the youtube videos to .mp4 and moviepy to cut the .mp4 videos into the areas of interest. Finally, because some of those areas of interest clips are more than a few seconds long (which is all you need to detect spinning or headbanging) I will take those areas that are > 8 seconds and split them into many clips (that way we have more data.) Also in sections of the video where no behavior is used I will take them as videos as control data. 

In [None]:
# first change the directory over to ssbd release
import os 
import xml.etree.ElementTree as ET

In [None]:
# next step would be to get the hand flapping and spinning data 

tree = ET.parse("ssbd-release/Annotations/v_ArmFlapping_07.xml")
root = tree.getroot()
for child in root:
    # for each child in the root 
    if child.tag == "url":
        print(child.text)
    if child.tag == "behaviours":
        for behavior in child: # go through each reported behavior 
            for tag in behavior: # tag is just the attribute of the behavior 
                if tag.tag == "time":
                    print(tag.text)
                if tag.tag == "intensity":
                    print(tag.text)
                if tag.tag == "category":
                    print(tag.text)

In [None]:
def convert_to_second(time : str) -> int:
    # this will take in a time like "0125" or 1:25 and make it 85 (60 + 25)
    overall_seconds = 0 
    for i, time_char in enumerate(reversed(time)):
        if i == 0:
            overall_seconds += int(time_char)
        if i == 1:
            overall_seconds += int(time_char) * 10
        if i == 2:
            overall_seconds += int(time_char) * 60 
        if i == 3:
            overall_seconds += int(time_char) * 600 
    return overall_seconds 

assert convert_to_second('2345') == 23 * 60 + 45  

def consecutive(data, stepsize=1):
    '''groups up elements in an array that are continous with each other (useful to create sections where none 
    of the behaviors are shown.)'''
    return np.split(data, np.where(np.diff(data) != stepsize)[0]+1)

In [None]:
from collections import defaultdict
import math 
import numpy as np

NUM_SECONDS_TO_RECOGNIZE = 8 # hypothesis: takes this many seconds seconds to recognize handflapping + spinning 

URLS_TO_DOWNLOAD = set() # contains all youtube videos to download 
links_to_times = {} # data will be stored here like {link : {'category' : [(start, end)], 'another cat' : [(start, end)]}


for i, file_name in enumerate(os.listdir('ssbd-release/Annotations/')):
    # parse this file 
    tree = ET.parse('ssbd-release/Annotations/' + file_name)
    root = tree.getroot() 
    
    # everything we need to store
    URL = ""
    
    for child in root:
        
        if child.tag == "url":
            URL = child.text # store URL
            URLS_TO_DOWNLOAD.add(URL)
            links_to_times[URL] = defaultdict(list) 
        
        if child.tag == "duration":
            duration = int(child.text[:-1])
            all_times = list(range(duration))
        
        if child.tag == "behaviours": # this child has the list of behaviors 
            for reported_behavior in child: 
                for info in reported_behavior:
                    # gather the start time, end time, and category for this youtube link 
                    if info.tag == "time":
                        # the time will be start:end 
                        times = str(info.text) # contains the string 
                        if times.count(":"): 
                            divider_index = times.index(":")
                        elif times.count("-"):
                            divider_index = times.index("-")
                        else:
                            break # invalid then 
                        actual_start_time, actual_end_time = convert_to_second(times[:divider_index]), convert_to_second(times[divider_index + 1:])
                        START_TIMES, END_TIMES = [], []
                        times = np.array(range(actual_start_time, actual_end_time +1))
                        for time in times:
                            try:
                                all_times.remove(time)
                            except Exception as e:
                                pass 
                        split_times = np.array_split(times, math.ceil(times.shape[0] / NUM_SECONDS_TO_RECOGNIZE))
                        for time in split_times:
                            START_TIMES.append(time[0])
                            END_TIMES.append(time[-1])
                    if info.tag == "category":
                        # this is the label 
                        LABEL = info.text 
                
                # create an entry for this reported behavior 
                for START_TIME, END_TIME in zip(START_TIMES, END_TIMES):
                    links_to_times[URL][LABEL].append((START_TIME, END_TIME))
                    
            idle_times = consecutive(np.array(all_times))
            num_contributed = 0 # each video can only give 4 control clips (because otherwise it takes WAY too long)
            for control_times in idle_times:
                if num_contributed >= 4: 
                    break 
                # times maybe > NUM_SECONDS_TO_RECOGNIZE so split if that is the case 
                if len(control_times) <= NUM_SECONDS_TO_RECOGNIZE:
                    START_TIME, END_TIME = control_times[0], control_times[-1]
                    links_to_times[URL]['control'].append((START_TIME, END_TIME))
                    num_contributed += 1
                else:
                    # needs to be split 
                    control_times_split = np.array_split(control_times, math.ceil(len(control_times) / NUM_SECONDS_TO_RECOGNIZE))
                    for control_time in control_times_split:  
                        START_TIME, END_TIME = control_time[0], control_time[-1]
                        links_to_times[URL]['control'].append((START_TIME, END_TIME))
                        num_contributed += 1
                        if num_contributed >= 4: 
                            break 

In [None]:
links_to_times

In [None]:
# pickle this just incase 
import pickle 
with open("links_to_times.pkl", 'wb') as f:
    pickle.dump(links_to_times, f)

In [None]:
# RARELY RUN THIS - DELETES ALL VIDEOS IN ALL BEHAVIOR_DATA DIRS

for behavior in os.listdir("behavior_data"): 
    if behavior == ".DS_Store": continue 
    for file in os.listdir("behavior_data/" + behavior): 
        os.remove("behavior_data/" + behavior + "/" + file)

In [None]:
import cv2
import numpy as np
from moviepy.video.io.VideoFileClip import VideoFileClip
import pytube
FPS = 30 
i = 0 
for vid, (url, category_times) in enumerate((links_to_times.items())): 
    print(f"staring the {vid+1}th file")
    # download the video 
    print(url)
    try: 
        print("This is url: ", url)
        y = pytube.YouTube(url)
        video = y.streams.get_highest_resolution()
        video.download()
    except Exception as e:
        print(f"annoying url: {url}")
        print(e)
        continue 
        
    for category, times in category_times.items(): 
        folder_path = "behavior_data/" + category + "/"
        
        for start_time, end_time in times:
            try:
                input_file = y.streams.get_highest_resolution().default_filename
                output_file = folder_path + f"{i}.mp4"
                print(os.listdir(folder_path))
                if f"{i}.mp4" not in os.listdir(folder_path):
                    print("adding file")
                    with VideoFileClip(input_file) as video:
                        new = video.subclip(start_time, end_time)
                        new.write_videofile(output_file, audio_codec='aac')
                        i += 1 
            except Exception as e:
                print(f"failed on {i}")
                i += 1 
    os.remove(y.streams.get_highest_resolution().default_filename)

# Directory Structure  

## We have one folder inside of this AnishMachineLearning folder called "behavior_data" that has the "armflapping" and "spinning" folders. There all of the sliced .mp4 files with the behavior of interest are located. 

### We will process headbanging videos even if we are not going to use it because we still want it as a negative case for training the arm flapping & spinning. 

In [None]:
# for spinning it is just the hand positions that matter right?

import cv2
import mediapipe as mp 
import numpy as np
mp_hands = mp.solutions.hands


cap = cv2.VideoCapture("/Users/anish/Documents/Machine Learning Env/AnishMachineLearning/behavior_data/spinning/37.mp4")
#cap = cv2.VideoCapture(0)
# Initiate holistic model

#capcv2.VideoCapture(0)

hands = mp_hands.Hands(min_detection_confidence = 0.5, min_tracking_confidence = 0.5)

while cap.isOpened():
    ret, image = cap.read() 
    if not ret:break 

    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False 
    results = hands.process(image)

    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    
    height, width, _ = image.shape

    white_image = np.zeros_like(image)
    white_image.fill(255.0)
    
    #check for hand results 
    if results.multi_hand_landmarks:
        for hand_landmark in results.multi_hand_landmarks:
            for i in range(0, 21):
                landmark = hand_landmark.landmark[i]
                x = int(landmark.x * width)
                y = int(landmark.y * height)
                cv2.circle(white_image, (x, y), 5, (100, 100, 0), -1)

    cv2.imshow("", white_image)

    if cv2.waitKey(1) == ord("q"):
        break 

cap.release() 
cv2.destroyAllWindows()

In [None]:
list(range(21))

# We can try using the y values for each of the hand flapping videos and graph them to see if there is a noticeable difference of the y-values (we'll use the mean of all y-values for all 21 hand landmarks and then graph them).

In [None]:
import matplotlib.pyplot as plt
import os 

# first all hand flapping videos 
for hand_flap_video in os.listdir("behavior_data/armflapping"):
    video = "behavior_data/armflapping/" + hand_flap_video
    cap = cv2.VideoCapture(video)
    
    hands = mp_hands.Hands(min_detection_confidence = 0.5, min_tracking_confidence = 0.5)
    
    all_YS = [] 
    
    while cap.isOpened():
        ret, image = cap.read() 
        if not ret:break 

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False 
        results = hands.process(image)

        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        height, width, _ = image.shape

        #check for hand results 
        y_s = []
        
        if results.multi_hand_landmarks:
            for hand_landmark in results.multi_hand_landmarks:
                for i in range(0, 21):
                    landmark = hand_landmark.landmark[i]
                    x = int(landmark.x * width)
                    y = int(landmark.y * height)
                    y_s.append(y)
        
        all_YS.append(np.mean(y_s))

    plt.plot(range(len(all_YS)), all_YS, color = "green")
    plt.show()

In [None]:
# now for spinning 

import matplotlib.pyplot as plt
import os 

# first all hand flapping videos 
for hand_flap_video in os.listdir("behavior_data/spinning"):
    video = "behavior_data/spinning/" + hand_flap_video
    cap = cv2.VideoCapture(video)
    
    hands = mp_hands.Hands(min_detection_confidence = 0.5, min_tracking_confidence = 0.5)
    
    all_YS = [] 
    
    while cap.isOpened():
        ret, image = cap.read() 
        if not ret:break 

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False 
        results = hands.process(image)

        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        height, width, _ = image.shape

        #check for hand results 
        y_s = []
        
        if results.multi_hand_landmarks:
            for hand_landmark in results.multi_hand_landmarks:
                for i in range(0, 21):
                    landmark = hand_landmark.landmark[i]
                    x = int(landmark.x * width)
                    y = int(landmark.y * height)
                    y_s.append(y)
        
        all_YS.append(np.mean(y_s))

    plt.plot(range(len(all_YS)), all_YS, color = "green")
    plt.show()

In [None]:
# next for headbanging 

# now for spinning 

import matplotlib.pyplot as plt
import os 

# first all hand flapping videos 
for hand_flap_video in os.listdir("behavior_data/headbanging"):
    video = "behavior_data/headbanging/" + hand_flap_video
    cap = cv2.VideoCapture(video)
    
    hands = mp_hands.Hands(min_detection_confidence = 0.5, min_tracking_confidence = 0.5)
    
    all_YS = [] 
    
    while cap.isOpened():
        ret, image = cap.read() 
        if not ret:break 

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False 
        results = hands.process(image)

        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        height, width, _ = image.shape

        #check for hand results 
        y_s = []
        
        if results.multi_hand_landmarks:
            for hand_landmark in results.multi_hand_landmarks:
                for i in range(0, 21):
                    landmark = hand_landmark.landmark[i]
                    x = int(landmark.x * width)
                    y = int(landmark.y * height)
                    y_s.append(y)
        
        all_YS.append(np.mean(y_s))

    plt.plot(range(len(all_YS)), all_YS, color = "green")
    plt.show()

# First let's get the frames for every arm flapping and control video. If the number of frames is less than 100 frames we will not take it. 

In [None]:
ARMFLAPPING_VIDEOS = []
CONTROL_VIDEOS = []

for video_name in os.listdir('behavior_data/armflapping'): 
    cap = cv2.VideoCapture('behavior_data/armflapping/' + video_name)  
    FRAMES = [] # frames for this video 
    while True: 
        _, image = cap.read() 
        if not _ : break 
        
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        FRAMES.append(image)
    if len(FRAMES) >= 100: 
        # ignore any .DS_Store files
        ARMFLAPPING_VIDEOS.append(FRAMES)

ARMFLAPPING_LABELS = np.ones(len(ARMFLAPPING_VIDEOS))

for video_name in os.listdir('behavior_data/control'): 
    cap = cv2.VideoCapture('behavior_data/control/' + video_name)  
    FRAMES = [] # frames for this video 
    while True: 
        _, image = cap.read() 
        if not _ : break 
        
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        FRAMES.append(image)
        
    if len(FRAMES) >= 100: 
        CONTROL_VIDEOS.append(FRAMES)

CONTROL_LABELS = np.zeros(len(CONTROL_VIDEOS))

In [None]:
# shuffle and then balance the amount of videos
amount_of_videos = min([len(CONTROL_VIDEOS), len(ARMFLAPPING_VIDEOS)])

ARMFLAPPING_VIDEOS = np.array(ARMFLAPPING_VIDEOS)
CONTROL_VIDEOS = np.array(CONTROL_VIDEOS)
import numpy as np
control_permutation = np.random.permutation(CONTROL_LABELS.shape[0])
CONTROL_VIDEOS, CONTROL_LABELS = CONTROL_VIDEOS[control_permutation], CONTROL_LABELS[control_permutation]

armflapping_permutation = np.random.permutation(ARMFLAPPING_LABELS.shape[0])
ARMFLAPPING_VIDEOS, ARMFLAPPING_LABELS = ARMFLAPPING_VIDEOS[armflapping_permutation], ARMFLAPPING_LABELS[armflapping_permutation]

ARMFLAPPING_VIDEOS, ARMFLAPPING_LABELS = ARMFLAPPING_VIDEOS[:amount_of_videos], ARMFLAPPING_LABELS[:amount_of_videos]
CONTROL_VIDEOS, CONTROL_LABELS = CONTROL_VIDEOS[:amount_of_videos], CONTROL_LABELS[:amount_of_videos]

In [None]:
assert len(ARMFLAPPING_VIDEOS) == len(CONTROL_VIDEOS)

# Great we have gotten 75 videos of armflapping and 75 control videos. Every single video has a minimum of 100 frames. We have set the predetermined amount of frames that go into the LSTM at 100, because we don't want the model to overfit or care about the length of the video. For videos with more than 100 frames, we will only collect the first 100 frames. 

## When we get the x and y locations for where on the 21 hand landmarks, note that they will be based on the width / height of the video which varies from each video . One way to deal with this would be to simply make all frames the same width and height, however that may make it tough for mediapipe to actually find the landmarks. Because the average frame has roughly a height and width of 400x600 we will take whatever x and y values given for a frame and adjust them based on the frame's height/width divided by 400 /600. 

In [None]:
armflapping_videos_shapes = [] 
for video in ARMFLAPPING_VIDEOS: 
    armflapping_videos_shapes.append(list(np.array(video).shape)) # (frames, height, width, num channels)

In [None]:
np.mean(armflapping_videos_shapes, axis = 0) # average height and width for the armflapping frames is 403 x 562 

In [None]:
control_videos_shapes = [] 
for video in CONTROL_VIDEOS:
    control_videos_shapes.append(list(np.array(video).shape)) # (frames, height, width, num channels)

In [None]:
np.mean(control_videos_shapes, axis = 0) # average height and width for the armflapping frames is 419 x 581 

In [None]:
y = np.arange(35).reshape(5,7)
y

In [None]:
y[0::1]

In [None]:
selected_armflapping_frames = [] # the dimensions of this will be (75, 100, height, width, channels)
selected_control_frames = [] 

for FRAMES in ARMFLAPPING_VIDEOS: 
    selected_armflapping_frames.append(FRAMES[:100])

for FRAMES in CONTROL_VIDEOS: 
    selected_control_frames.append(FRAMES[:100])

In [None]:
selected_armflapping_frames = np.array(selected_armflapping_frames)
selected_control_frames = np.array(selected_control_frames)

In [None]:
# the final data should be (75, 100, 42)
import matplotlib.pyplot as plt 
import mediapipe as mp

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(min_detection_confidence = 0.3, min_tracking_confidence = 0.3)

for image in selected_armflapping_frames[22] :
    height, width, _ = image.shape
    image.flags.writeable = False 
    results = hands.process(image)
    
    y_s = []
    x_s = []
    
    if results.multi_hand_landmarks:
        for hand_landmark in results.multi_hand_landmarks:
            for i in range(0, 21):
                landmark = hand_landmark.landmark[i]
                x = int(landmark.x * width)
                y = int(landmark.y * height)
                y_s.append(y)
                x_s.append(x)
                
                image = cv2.circle(image, (x, y), 5, (255, 0, 0), 2)
    
    print(y_s)
    print(x_s)
    
    plt.imshow(image)
    plt.show()

In [1]:
import cv2
import mediapipe as mp
mp_hands = mp.solutions.hands

cap = cv2.VideoCapture('/Users/anish/Downloads/all_videos/armflapping/499.mp4')
mp_drawing = mp.solutions.drawing_utils
with mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5) as hands: 
    while cap.isOpened():
        ret, frame = cap.read()
        print(ret)
        # BGR 2 RGB
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        # Flip on horizontal
        image = cv2.flip(image, 1)
        
        # Set flag
        image.flags.writeable = False
        
        # Detections
        results = hands.process(image)
        
        # Set flag to true
        image.flags.writeable = True
        
        # RGB 2 BGR
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        
        # Detections
        print(results)
        
        # Rendering results
        if results.multi_hand_landmarks:
            for num, hand in enumerate(results.multi_hand_landmarks):
                mp_drawing.draw_landmarks(image, hand, mp_hands.HAND_CONNECTIONS, 
                                        mp_drawing.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
                                        mp_drawing.DrawingSpec(color=(250, 44, 250), thickness=2, circle_radius=2),
                                         )
            
        # Save our image    
        cv2.imwrite(os.path.join('Output Images', '{}.jpg'.format(uuid.uuid1())), image)
        cv2.imshow('Hand Tracking', image)

        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()

# Let's create our custom layer for image augmentations. We will be using a transformation of brightness, rotation, height and width, and maybe even shear. 

In [None]:
import keras 
import numpy as np

class AugLayer(keras.Layers.Layer): 
    def __init__(self, height_shift_range = 0.3, width_shift_range = 0.3, brightness_range = 0.2, rotation_range = 35): 
        self.height_shift_range = height_shift_range
        self.width_shift_range = width_shift_range
        self.brightness_range = brightness_range
        self.rotation_range = rotation_range
    def call(self, X, training=True): 
        if training: 
            # do augmentations
            height_shift = np.random.uniform(-self.height_shift_range, self.height_shift_range)
            
        else: 
            

In [11]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(
    featurewise_center=True,
    featurewise_std_normalization=True,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    validation_split=0.2)

In [13]:
img = cv2.imread("/Users/anish/Documents/CP + Programming Fun/web dev **practice**/wedding/assets/proposal/square_proposal.jpg")

In [18]:
datagen.flow([[img, img]])

IndexError: list index out of range