#### This notebook performs two tasks:
#### 1) Splits videos into frames and saves the frames into a newly created directory. A separate directory is created for each video.
#### 2) Using previously collected numerical values of the Avg. Wind and Wind Gusts for the given date and time, we interpolate the values corresponding to the video file that was just split. Then a new csv file is created that contains a table with the following columns: *name of the video*, *Avg. Wind*, *Wind Gusts*. This file will be used later to get labels for training the model.

In [3]:
import os
import pandas as pd
import numpy as np
import cv2
import datetime
from PIL import Image
import matplotlib.pyplot as plt
import timeit
%matplotlib inline

#### The working directory that contains our video and csv files

In [4]:
# Root folder of the training data; it is walked recursively to find the video ('flv') and csv files.
working_dir='../Data_samples/train data/'#path to train dataset

#### Creating lists of the names of the video and csv files in this directory

In [5]:
# Walk the working directory and sort every file path into the matching list:
#   video_names - paths of all files whose name ends in 'flv'
#   csv_names   - paths of all files whose name ends in 'csv'
video_names, csv_names = [], []
for folder, _subdirs, entries in os.walk(working_dir):
    for entry in entries:
        full_path = folder+'/'+entry
        if entry.endswith('flv'):
            video_names.append(full_path)
        if entry.endswith('csv'):
            csv_names.append(full_path)

##### Checking if everything is ok

In [6]:
# Display both path lists to verify that the expected files were found
video_names, csv_names

(['../Data_samples/train data/ion-club-valdevaqueros/2018-07-11-18-34.flv',
  '../Data_samples/train data/ion-club-valdevaqueros/2018-07-11-18-39.flv',
  '../Data_samples/train data/ion-club-valdevaqueros/2018-07-11-18-44.flv',
  '../Data_samples/train data/ion-club-valdevaqueros/2018-07-11-18-49.flv',
  '../Data_samples/train data/ion-club-valdevaqueros/2018-07-11-18-55.flv',
  '../Data_samples/train data/valdevaqueros-tangana/2018-07-11-18-34.flv',
  '../Data_samples/train data/valdevaqueros-tangana/2018-07-11-18-39.flv',
  '../Data_samples/train data/valdevaqueros-tangana/2018-07-11-18-44.flv',
  '../Data_samples/train data/valdevaqueros-tangana/2018-07-11-18-49.flv',
  '../Data_samples/train data/valdevaqueros-tangana/2018-07-11-18-55.flv'],
 ['../Data_samples/train data//2018-05-01.csv',
  '../Data_samples/train data//2018-05-02.csv',
  '../Data_samples/train data//2018-05-03.csv',
  '../Data_samples/train data//2018-05-04.csv',
  '../Data_samples/train data//2018-05-05.csv',
  '.

#### Function that splits video from *pathIn* into frames and saves them to *pathOut*

In [97]:
def extract_images(pathIn, pathOut, max_frames=10):
    """Split the video at *pathIn* into frames and save them as JPEG files in *pathOut*.

    Frames are written as ``fr-<index>.jpg`` with indices starting at 0.

    Parameters
    ----------
    pathIn : str
        Path of the video file to read.
    pathOut : str
        Directory that receives the frames; it is not created here.
    max_frames : int or None, optional
        Upper bound on the number of frames to save. The default of 10 keeps
        the previous hard-coded limit; pass ``None`` to extract all the frames.
    """
    vidcap = cv2.VideoCapture(pathIn)
    count = 0
    try:
        while max_frames is None or count < max_frames:
            success, image = vidcap.read()
            if not success:
                # No more frames (or the video could not be read)
                break
            cv2.imwrite(pathOut + "/fr-%d.jpg" % count, image)     # save frame as JPEG file
            count = count + 1
    finally:
        # Always free the capture handle, even if reading or writing fails
        vidcap.release()

#### Function that converts the file names of the videos into datetime objects

In [98]:
def vn_to_time(vn):
    """Parse the timestamp encoded in a video file name.

    The last '/'-separated component of *vn* is taken, everything from its
    first '.' onwards is dropped, and the remainder is parsed with the
    format ``'%Y-%m-%d-%H-%M'``.

    Returns a ``datetime.datetime``.
    """
    file_name = vn.rsplit('/', 1)[-1]
    stamp = file_name.partition('.')[0]
    return datetime.datetime.strptime(stamp, '%Y-%m-%d-%H-%M')

##### Creation of one general DataFrame of numerical values from csv files for different dates

In [7]:
# The labels file written at the end of this notebook also ends in 'csv' and lives
# under the working directory, so it must not be read back as wind measurements.
labels_csv='../Data_samples/train data/frames/wind_data_tg.csv'

# Read every csv once and concatenate in a single call: growing a DataFrame with
# pd.concat inside the loop copies all previously read rows on each iteration.
tables=[pd.read_csv(path,parse_dates=False) for path in csv_names if path != labels_csv]

# pd.concat raises on an empty list, so fall back to an empty frame when no csv was found
wind_data=pd.concat(tables,ignore_index=True) if tables else pd.DataFrame()

##### Checking the dataframe

In [8]:
# Show the last rows of the combined table as a sanity check
wind_data.tail()

Unnamed: 0,date,Avg. wind,wind gusts
2879,2018-7-15-10-35,10,13.0
2880,2018-7-15-10-40,9,13.0
2881,2018-7-15-10-45,9,13.0
2882,2018-7-15-10-50,8,12.0
2883,2018-7-15-10-55,10,13.0


##### Function that produces the numerical values for a given video. It finds the two data points closest in time and interpolates between them. It also takes care of some edge cases.

In [101]:
def wind_for_video(vn,wind_data):
    """Estimate the average wind and the wind gusts at the time encoded in a video name.

    The two measurements closest in time to the video are selected and the
    values are linearly interpolated to the video time.

    Parameters
    ----------
    vn : str
        Path of the video; its timestamp is obtained with ``vn_to_time``.
    wind_data : pandas.DataFrame
        Measurements with a ``'date'`` column of strings in the format
        ``'%Y-%m-%d-%H-%M'`` and the columns ``'Avg. wind'`` and ``'wind gusts'``.
        The frame is left unmodified.

    Returns
    -------
    list or None
        ``[avg_wind, wind_gust]``, or ``None`` when there are no measurements or
        the closest one is more than 500 seconds away from the video time.
        When only one measurement is usable (a single row, or the second
        closest is more than 1000 seconds away) the values of the closest
        measurement are returned without interpolation.
    """
    if wind_data.empty:
        return None

    vid_time=vn_to_time(vn)
    # Signed offset in seconds of every measurement relative to the video time
    time_diff=wind_data['date'].apply(
        lambda x: (datetime.datetime.strptime(x,'%Y-%m-%d-%H-%M')-vid_time).total_seconds())
    # assign() returns a new frame, so the helper columns never leak into the caller's data
    wd=(wind_data.assign(time_diff=time_diff,time_dif_abs=time_diff.abs())
        .sort_values(['time_dif_abs']).head(2).reset_index(drop=True))

    # After sorting, row 0 is the closest measurement and row 1 (if any) the second closest
    if wd['time_dif_abs'].loc[0]>500:
        return None
    if len(wd)<2 or wd['time_dif_abs'].loc[1]>1000:
        # Second point missing or too far away: use the closest measurement as is.
        # (Previously these values were computed but then overwritten by the interpolation.)
        return [wd['Avg. wind'].loc[0],wd['wind gusts'].loc[0]]

    t0,t1=wd['time_diff'].loc[0],wd['time_diff'].loc[1]
    dt=t1-t0
    if abs(dt)<1:
        # Both measurements share (almost) the same timestamp: average them to avoid dividing by ~0
        awind=(wd['Avg. wind'].loc[0]+wd['Avg. wind'].loc[1])/2
        gwind=(wd['wind gusts'].loc[0]+wd['wind gusts'].loc[1])/2
    else:
        # Value at offset 0 of the straight line through (t0, w0) and (t1, w1)
        awind=(wd['Avg. wind'].loc[0]*t1 - wd['Avg. wind'].loc[1]*t0)/dt
        gwind=(wd['wind gusts'].loc[0]*t1 - wd['wind gusts'].loc[1]*t0)/dt
    return [awind,gwind]
    

#### Running the loop through all the videos and finding the numerical values for each given video. If numerical values are available, the video is split into frames and the frames are saved into a directory. The corresponding table of numerical values is then saved into a csv file

In [105]:
frame_dir='../Data_samples/train data/frames/'
os.makedirs(frame_dir,exist_ok=True)#create dir if doesn't exist

wind_dict={}
for i,vn in enumerate(video_names): #looping through all the video names
    # The prefix of the frame directory is chosen from the video path
    if 'tangana' in vn:
        prefix='tg'
    elif 'ion' in vn:
        prefix='ion'
    else:
        # Without this fallback an unknown path would raise a NameError on the first
        # video or silently reuse the prefix of the previous one
        prefix='unk'
    video_dir='video_{}{}'.format(prefix,i)

    pathout=frame_dir+video_dir+'/' #path to store the frames of a given video

    try:
        winds=wind_for_video(vn,wind_data)#function that finds corresponding wind strength for the given video
        if winds:
            wind_dict[video_dir]=winds #create dict for wind strengths, the key is the video_dir
            os.makedirs(pathout,exist_ok=True)
            extract_images(vn,pathout) #function to split video and write the frames to the pathout dir
    except Exception as err:
        # Keep processing the remaining videos, but report the failure instead of hiding it
        print('Failed to process {}: {!r}'.format(vn,err))

    if video_dir in wind_dict:
        print(vn,wind_dict[video_dir],i)

# One row per processed video, indexed by the name of its frame directory
labels_df=(pd.DataFrame.from_dict(wind_dict,orient='index')
           .rename(columns={0: "avg. wind", 1: "wind gust"}))
labels_df.to_csv(path_or_buf=frame_dir+'wind_data_tg.csv')

../Data_samples/train data/ion-club-valdevaqueros/2018-07-11-18-34.flv [14.2, 18.2] 0
../Data_samples/train data/ion-club-valdevaqueros/2018-07-11-18-39.flv [14.2, 17.8] 1
../Data_samples/train data/ion-club-valdevaqueros/2018-07-11-18-44.flv [14.8, 19.0] 2
../Data_samples/train data/ion-club-valdevaqueros/2018-07-11-18-49.flv [13.4, 19.0] 3
../Data_samples/train data/ion-club-valdevaqueros/2018-07-11-18-55.flv [13.4, 18.4] 4
../Data_samples/train data/valdevaqueros-tangana/2018-07-11-18-34.flv [14.2, 18.2] 5
../Data_samples/train data/valdevaqueros-tangana/2018-07-11-18-39.flv [14.2, 17.8] 6
../Data_samples/train data/valdevaqueros-tangana/2018-07-11-18-44.flv [14.8, 19.0] 7
../Data_samples/train data/valdevaqueros-tangana/2018-07-11-18-49.flv [13.4, 19.0] 8
../Data_samples/train data/valdevaqueros-tangana/2018-07-11-18-55.flv [13.4, 18.4] 9
