In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from datetime import date

In [89]:
def batch_usvs(direc):
    
    ## unpacking provided directory, initializing output dataframe, and grabbing today's date
    
    files = os.listdir(direc)
    output_df = pd.DataFrame(index = ['total_call_time', 'average_call_length', 'total_call_number']).T
    today = str(date.today())
    
    ## grabbing list of text files for analysis
    
    txts = []
    for file in files:
        if '.txt' in file:
            txts = np.append(txts, file)
            
    print(txts)
            
    ## main loop through all data files in the directory
    
    for txt in txts:
        
        ## read in and clean data
        usv_data = pd.read_csv(direc + '\\' + txt, delimiter = '\t')
        usv_data.drop(len(usv_data) - 1, inplace = True)
        col = ['filename']
        usv_data.loc[:,col] = usv_data.loc[:,col].ffill()
        
        for i in range(len(usv_data['filename'])):
            usv_data['filename'][i] = usv_data['filename'][i].split('\\')[-1]
            
        ## we want to make sure the columns are the correct datatype so that the grouping/mean works correctly
        usv_data[['duration', '#']] = usv_data[['duration', '#']].astype('float')

        ## now we are calculating the summary statistics we are actually interested in!
        total_call_time = usv_data.groupby(['filename'])['duration'].sum()
        total_call_number = usv_data.groupby(['filename'])['#'].count()
        average_call_length = usv_data.groupby(['filename'])['duration'].mean()
        
        
        ## create the summary dataframe for each txt file individually
        calc_df = pd.DataFrame([total_call_time, average_call_length, total_call_number], index = ['total_call_time', 'average_call_length', 'total_call_number']).T
    
        ## add that summary dataframe to the master output dataframe
        output_df = pd.concat([output_df, calc_df])
        
        ## automatically write this dataframe to csv in the provided directory
        output_df.to_csv(direc + '\\usv_analysis_' + today + '.csv')
        
    return output_df
        



In [90]:
output_df = batch_usvs('E:\\DATA\\usvs')

['200211_batch.txt' '200227_batch.txt' '200301_batch.txt']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [78]:
output_df

Unnamed: 0,total_call_time,average_call_length,total_call_number
ch1_834_P8_2020-02-04_16-46-42_01.WAV,0.0844,0.007673,11.0
ch1_834_P8_2020-02-04_16-59-20_02.WAV,9.8075,0.046926,209.0
ch1_959_P8_2020-02-04_17-12-57_03.WAV,0.0983,0.01966,5.0
ch2_834_P8_2020-02-04_16-46-42_01.WAV,5.4437,0.038336,142.0
ch2_834_P8_2020-02-04_16-59-20_02.WAV,9.4309,0.041546,227.0
ch2_959_P8_2020-02-04_17-12-57_03.WAV,4.8361,0.030416,159.0
ch3_834_P8_2020-02-04_16-46-42_01.WAV,3.4233,0.046895,73.0
ch3_834_P8_2020-02-04_16-59-20_02.WAV,5.7229,0.039198,146.0
ch3_959_P8_2020-02-04_17-27-31_04.WAV,1.0154,0.029011,35.0
ch4_834_P8_2020-02-04_16-46-42_01.WAV,1.0348,0.020696,50.0
