In [None]:
"""
This script processes data from Zoom CSV files in which the duration of webinars is specified incorrectly.
The webinars started at 10am and ended around 11am, but the report recorded the host's login time, which
is not the correct way to track participants' attendance time. To obtain the actual duration of each
webinar, this script finds the time at which the last participant disconnected.

"""

In [None]:
import datetime
import pandas as pd
import openpyxl
import numpy as np
from openpyxl.utils.dataframe import dataframe_to_rows

# Load the per-attendee sheet of the report and round the session length to whole minutes.
attendee_details = pd.read_excel('data_files/report.xlsx', sheet_name='attendee_details')
attendee_details['Time in Session (minutes)'] = attendee_details['Time in Session (minutes)'].round(0)

# The last webinar was added automatically and (per the original note) stores its times
# in a different form, so its rows are set aside and skipped by the normalization below.
# The webinar's name is read from the final row of the sheet.
last_webinar = attendee_details['Webinar'].iloc[-1]

# .copy() makes both subsets independent frames, so the column assignments below
# modify real data instead of a filtered slice (avoids SettingWithCopyWarning).
int_attendee_details = attendee_details[attendee_details['Webinar'] != last_webinar].copy()
last_attendee_details = attendee_details[attendee_details['Webinar'] == last_webinar].copy()

# Reduce the other webinars' timestamps to time-of-day values; values that cannot
# be parsed are coerced to missing rather than raising.
for time_column in ('Join Time', 'Leave Time'):
    int_attendee_details[time_column] = pd.to_datetime(int_attendee_details[time_column], errors='coerce').dt.time

# Stitch the two parts back together; the last webinar's rows come after all others.
attendee_details = pd.concat([int_attendee_details, last_attendee_details])

# actual duration of webinars

# Every row gets the same hard-coded start time, first as text and then parsed below.
attendee_details['Webinar Start'] = '10:00:00'

# Parse the start, join and leave columns as datetimes using one shared H:M:S format,
# so they can be subtracted from each other.
for time_column in ('Webinar Start', 'Join Time', 'Leave Time'):
    attendee_details[time_column] = pd.to_datetime(attendee_details[time_column], format='%H:%M:%S')

leave_time = attendee_details['Leave Time']

# Minutes between each attendee's join and leave, rounded to whole minutes.
minutes_attended = (leave_time - attendee_details['Join Time']).dt.total_seconds() / 60
attendee_details['participation_time'] = minutes_attended.round(0)

# Minutes between the fixed webinar start and each attendee's leave, rounded to whole minutes.
minutes_since_start = (leave_time - attendee_details['Webinar Start']).dt.total_seconds() / 60
attendee_details['webinar_duration'] = minutes_since_start.round(0)

# Per-webinar maxima, broadcast back onto every attendee row of that webinar.
webinar_groups = attendee_details.groupby('Webinar')
max_duration = webinar_groups['webinar_duration'].transform('max')
max_participation = webinar_groups['participation_time'].transform('max')

# Store the group maxima as new columns: the latest leave offset from the webinar
# start, and the longest individual participation.
attendee_details['actual_duration'] = max_duration
attendee_details['max_participation'] = max_participation

In [51]:
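# Disabled draft: calculate the maximum non-extreme value for each webinar group.
# NOTE: the lines below reference a 'time_diff_minutes' column that the cells above do not create.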
#mean = np.mean(attendee_details['time_diff_minutes'])
#std = np.std(attendee_details['time_diff_minutes'])
#threshold = 2
#max_values[attendee_details.groupby('Webinar')['time_diff_minutes'].transform(lambda x: len([i for i in x if (i >= mean - threshold * std) and (i <= mean + threshold * std)]) == 0)] = np.nan
#max_values = attendee_details.groupby('Webinar')['time_diff_minutes'].transform(lambda x: max([i for i in x if (i >= mean - threshold * std) and (i <= mean + threshold * std)]))   

In [None]:
# Write the enriched attendee table to the 'temp' sheet of a scratch workbook.
# The context manager saves and closes the file on exit, even if to_excel raises;
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in pandas 2.0.
with pd.ExcelWriter('data_files/temp.xlsx') as writer:
    attendee_details.to_excel(writer, sheet_name='temp')

In [None]:
# TODO: duration assessment still needs to be implemented