#### This notebook does mostly the same as 'create_dataset_and_labels.ipynb', but instead of extracting frames from the videos it only creates a dataframe with the paths of the videos in the test dataset and their corresponding labels. The frames are extracted in real time during the validation process.

In [1]:
import pandas as pd
import os
import datetime
from random import shuffle,sample
import re
import timeit
import matplotlib.pyplot as plt
from matplotlib.cbook import flatten
%matplotlib inline

#### Define the directory with test data

In [None]:
# Root folder that is searched (recursively) for the test videos and the csv files.
working_directory = "../Data_samples/test data"

##### Extracting paths of video files and csv files from the directory

In [8]:
# Walk the directory tree once, then split the collected paths by file-name suffix.
all_paths = [root + '/' + name for root, _, names in os.walk(working_directory) for name in names]
video_names = [candidate for candidate in all_paths if candidate.endswith('flv')]
csv_names = [candidate for candidate in all_paths if candidate.endswith('csv')]

##### If the final csv file already exists, it should be removed from the list of csv files

In [27]:
# The aggregated output file is written into the same folder as the input csv
# files, so it must not be read back as an input on a later run.
try:
    csv_names.remove('../Data_samples/test data/test_data.csv')
except ValueError:
    # The output file has not been listed, so there is nothing to remove.
    pass

In [28]:
# Display the csv paths that are left in the list.
csv_names

['../Data_samples/test data/2018-05-26.csv',
 '../Data_samples/test data/2018-05-27.csv',
 '../Data_samples/test data/2018-05-28.csv',
 '../Data_samples/test data/2018-05-29.csv',
 '../Data_samples/test data/2018-05-30.csv',
 '../Data_samples/test data/2018-05-31.csv']

#### Creating one dataframe with numerical values collecting all the csv files together

In [29]:
# Read every csv file first and concatenate once at the end: growing a
# dataframe inside the loop re-copies all earlier rows on every iteration.
# `ignore_index=True` gives the same fresh 0..n-1 index as reset_index(drop=True).
daily_tables = [pd.read_csv(path, parse_dates=False) for path in csv_names]

# pd.concat raises on an empty list, so keep an empty dataframe in that case.
wind_data = pd.concat(daily_tables, ignore_index=True) if daily_tables else pd.DataFrame()

wind_data.tail()

Unnamed: 0,date,Avg. wind,wind gusts
708,2018-5-31-16-7,15,20
709,2018-5-31-16-12,16,19
710,2018-5-31-16-17,13,17
711,2018-5-31-16-22,12,17
712,2018-5-31-16-27,13,16


##### Function to get time stamp from the video name

In [30]:
def vn_to_time(vn):
    """Parse the timestamp that is encoded in a video file name.

    The last '/'-separated component of `vn` is taken, everything from its
    first '.' onwards is dropped, and the rest is parsed with the format
    '%Y-%m-%d-%H-%M'.

    Returns a `datetime.datetime`; `strptime` raises ValueError when the
    name does not match the format.
    """
    file_name = vn.rsplit('/', 1)[-1]
    stamp = file_name.partition('.')[0]
    return datetime.datetime.strptime(stamp, '%Y-%m-%d-%H-%M')

##### testing the function

In [31]:
# Quick check: parse the timestamp of the first video in the list.
vn_to_time(video_names[0])

datetime.datetime(2018, 5, 26, 11, 58)

##### Function that produces the numerical values for a given video. It finds the two data points closest in time and interpolates between them. It also takes care of some edge cases.

In [32]:
def wind_for_video(vn,wind_data=wind_data,max_gap=500,single_point_gap=1000):
    """Estimate the average wind and the wind gusts at the time of a video.

    The two measurements closest in time to the video are located and their
    values are linearly interpolated at the video's timestamp.

    Parameters
    ----------
    vn : str
        Path of the video; its timestamp is obtained with `vn_to_time`.
    wind_data : pandas.DataFrame
        Measurements with a 'date' column formatted as '%Y-%m-%d-%H-%M' and
        the value columns 'Avg. wind' and 'wind gusts'. It is not modified.
    max_gap : float
        If the closest measurement is more than this many seconds away from
        the video, no values are produced.
    single_point_gap : float
        If the second-closest measurement is more than this many seconds
        away, only the closest measurement is used.

    Returns
    -------
    list or None
        `[avg_wind, wind_gusts]` as floats, or None when there is no
        measurement within `max_gap` seconds (or no measurement at all).
    """
    if wind_data.empty:
        return None

    vid_time=vn_to_time(vn) # timestamp of the given video

    # Signed distance in seconds (measurement time minus video time). It is
    # kept in a local series so that the caller's dataframe gets no new columns.
    time_diff=wind_data['date'].apply(
        lambda x: (datetime.datetime.strptime(x,'%Y-%m-%d-%H-%M')-vid_time).total_seconds())

    # The (at most) two measurements closest to the video, closest one first.
    closest=(wind_data[['Avg. wind','wind gusts']]
             .assign(time_diff=time_diff,time_dif_abs=time_diff.abs())
             .sort_values(['time_dif_abs'])
             .head(2)
             .reset_index(drop=True))

    if closest['time_dif_abs'].iloc[0]>max_gap:
        # Edge case: no measurement is close enough to the video.
        return None

    if len(closest)==1 or closest['time_dif_abs'].iloc[1]>single_point_gap:
        # Edge case: only one measurement is close to the video. Return right
        # away so that the interpolation below cannot overwrite these values.
        return [float(closest['Avg. wind'].iloc[0]),float(closest['wind gusts'].iloc[0])]

    d0,d1=closest['time_diff']
    a0,a1=closest['Avg. wind']
    g0,g1=closest['wind gusts']

    dt=d1-d0
    if abs(dt)<1:
        # Edge case: the two measurements (almost) coincide in time, so the
        # interpolation formula would divide by ~0; use their mean instead.
        awind=(a0+a1)/2
        gwind=(g0+g1)/2
    else:
        # Regular case: value at time_diff == 0 of the straight line through
        # (d0, v0) and (d1, v1). When both measurements lie on the same side
        # of the video time this is an extrapolation.
        awind=(a0*d1-a1*d0)/dt
        gwind=(g0*d1-g1*d0)/dt
    return [float(awind),float(gwind)]

##### testing the function

In [33]:
# Quick check: compute the wind values for the first video in the list.
wind_for_video(video_names[0])

[8.0, 10.0]

##### Create a new dataframe with three columns: the video path and the two corresponding numerical values calculated with the function above.

In [34]:
# Collect one [video, avg wind, wind gusts] row per video and build the
# dataframe once at the end. `DataFrame.append` no longer exists in
# pandas >= 2.0, and appending inside a loop re-copies all rows each time.
label_rows=[]
for video in video_names:
    num_values=wind_for_video(video)
    if num_values is None: # no numerical values available for this video, skip it
        continue
    label_rows.append([video]+num_values)

# Passing the column names explicitly keeps them even when no video got values.
df_wind=pd.DataFrame(label_rows,columns=['video','av.wn.','wn.gs.'])

In [35]:
# Write the video/label table to disk; the dataframe index is not written.
df_wind.to_csv(path_or_buf="../Data_samples/test data/test_data.csv", index=False)

##### test

In [36]:
# Read the written file back in so that its content can be inspected.
test_df = pd.read_csv(filepath_or_buffer='../Data_samples/test data/test_data.csv')

In [26]:
# Display the dataframe that was read back from the csv file.
test_df

Unnamed: 0,video,av.wn.,wn.gs.
0,../Data_samples/test data/2018-05-26-11-58.flv,8.0,10.0
1,../Data_samples/test data/2018-05-26-12-14.flv,10.0,12.0
2,../Data_samples/test data/2018-05-26-12-20.flv,10.0,12.0
3,../Data_samples/test data/2018-05-26-12-4.flv,8.2,11.0
4,../Data_samples/test data/2018-05-26-12-9.flv,9.2,11.2
