In [None]:
"""
This notebook extracts data from exported CSV report files and builds a separate pandas DataFrame 
for each section that is introduced by one of the key words 'Host Details', 'Panelist Details', and 'Attendee Details'. 
Each CSV file is read row by row, and the rows that contain a key word are located by their row index. 
A section starts at its key-word row and runs until the row where the next key word is found 
(the last section runs to the end of the file). The row after the key word supplies the column names, and the 
remaining rows become the data of the DataFrame. The result is a list of DataFrames, one per section. This approach 
can be useful for data processing and analysis tasks that require separate DataFrames for different types of data, 
such as attendance reports or meeting logs.
"""

In [208]:
import os
import csv
import datetime
import pandas as pd

# Section headings that mark the start of each block inside a report CSV.
key_words = ['Host Details', 'Panelist Details', 'Attendee Details']
# Names derived from the headings (e.g. 'host_details'); used as dictionary
# keys and as DataFrame names further down.
df_names = [x.replace(' ', '_').lower() for x in key_words]

# Set the path to the folder containing the CSV files
folder_path = 'data_files/csv/'

# Get a list of all CSV files in the folder.
# os.listdir() returns entries in arbitrary order, so sort the names to make
# the processing order (and therefore the results) reproducible between runs.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

In [209]:
def section_data_list(file_path, key_words):
    """Split a sectioned CSV report into one list of rows per section.

    A section starts at the first row in which one of ``key_words`` appears as
    a whole cell and ends right before the row where the next section (in file
    order) starts; the last section runs to the end of the file.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    key_words : iterable of str
        Section headings to look for, e.g. ``['Host Details', ...]``.

    Returns
    -------
    dict
        Maps each heading, lower-cased and with spaces replaced by
        underscores, to the list of rows (lists of strings) of its section,
        heading row included. A heading that does not occur in the file maps
        to an empty list.
    """
    key_words = list(key_words)

    # The csv module asks for newline='' so that line breaks inside quoted
    # fields are parsed correctly.
    with open(file_path, 'r', newline='') as csvfile:
        all_rows = list(csv.reader(csvfile, delimiter=',', quotechar='"'))

    # Remember only the FIRST row in which each heading occurs, so a repeated
    # heading cannot shift the indexes of the other sections.
    first_seen = {}
    for index, row in enumerate(all_rows):
        for heading in key_words:
            if heading not in first_seen and heading in row:
                first_seen[heading] = index

    # Section boundaries in file order, independent of the order of key_words.
    starts = sorted(first_seen.values())

    sections = {}
    for heading in key_words:
        name = heading.replace(' ', '_').lower()
        if heading not in first_seen:
            sections[name] = []
            continue

        start = first_seen[heading]
        following = [s for s in starts if s > start]
        end = following[0] if following else None  # None -> until end of file
        sections[name] = all_rows[start:end]

    return sections

In [210]:
def extract_section_data(key, section_data_list):
    """Build a DataFrame from one section of a parsed report.

    The first row of the section (the heading row) is discarded, the next
    non-blank row supplies the column names, and the remaining non-blank rows
    become the data.

    Parameters
    ----------
    key : str
        Key of the section inside ``section_data_list``.
    section_data_list : dict
        Maps section names to lists of rows (each row a list of cells).

    Returns
    -------
    pandas.DataFrame
        The section as a DataFrame with its ``name`` attribute set to ``key``.
        An empty DataFrame is returned when the section has no rows after the
        heading row.
    """
    # Skip the heading row. csv.reader yields [] for blank lines; drop those
    # so they cannot drag the minimum row length down to zero.
    rows = [row for row in section_data_list[key][1:] if row]

    if not rows:
        df = pd.DataFrame()
        df.name = key
        return df

    # Identify the minimum value from the lengths of the rows
    min_value = min(len(row) for row in rows)

    # Rows longer than the shortest row lose their last cell, so the header
    # row and the data rows end up with the same number of cells.
    rows = [row[:-1] if len(row) > min_value else row for row in rows]

    # The first remaining row holds the column names, the rest is data.
    df = pd.DataFrame(rows[1:], columns=rows[0])
    df.name = key

    return df

In [211]:
# Full path of every report file found in the data folder.
list_of_file_paths = [os.path.join(folder_path, x) for x in csv_files]

# Section DataFrames of ALL files, in file order. The list is created once,
# outside the loop, so results of earlier files are not thrown away.
dfs = []
for file_path in list_of_file_paths:
    # Store the result under its own name: assigning it to `section_data_list`
    # would replace the function with a dict and make the next call (second
    # file, or a re-run of this cell) fail with a TypeError.
    file_sections = section_data_list(file_path, key_words)
    for key in df_names:
        dfs.append(extract_section_data(key, file_sections))

In [None]:
"""
The following code deals with data that contains information about the participants of a webinar. 
Some participants connected multiple times for various reasons, which results in multiple entries for the same person.
The goal is to collapse these duplicate entries into a single row per participant while preserving the actual time 
elapsed from the first login to the last logout of the webinar.

"""

# Keep only the DataFrames that were tagged as the attendee section.
attendees_df = [frame for frame in dfs if frame.name == 'attendee_details']

# Show how many attendee DataFrames were found.
len(attendees_df)

In [219]:
# Work on the first attendee-details DataFrame in attendees_df
# (raises IndexError if the list is empty).
df = attendees_df[0]

In [214]:
def right_format(df):
    """Return a copy of ``df`` with typed time columns and '--' placeholders removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Time in Session (minutes)', 'Join Time'
        and 'Leave Time'.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame in which 'Time in Session (minutes)' is numeric,
        'Join Time' / 'Leave Time' hold ``datetime.time`` values (the date
        part is discarded), unparseable entries are NaN/NaT, and every cell
        equal to '--' is replaced by a missing value. The input DataFrame is
        left unchanged.
    """
    # Work on a copy so the caller's DataFrame is not modified in place.
    formatted = df.copy()

    # change format of columns
    formatted['Time in Session (minutes)'] = pd.to_numeric(
        formatted['Time in Session (minutes)'], errors='coerce'
    )
    formatted['Join Time'] = pd.to_datetime(formatted['Join Time'], errors='coerce').dt.time
    formatted['Leave Time'] = pd.to_datetime(formatted['Leave Time'], errors='coerce').dt.time

    # replace '--' values with None.
    # The dict form is used on purpose: with replace('--', None) older pandas
    # versions ignored the explicit None and forward-filled the previous
    # row's value instead.
    return formatted.replace({'--': None})

# Rebind df to the formatted DataFrame returned by right_format.
df = right_format(df)

In [215]:
# Count the occurrences of every e-mail address and keep the addresses
# that occur more than once.
counts = df['Email'].value_counts()
repeated = counts.loc[counts.gt(1)]
emails = repeated.index.tolist()

# Boolean mask: True for rows whose e-mail address is one of the repeated ones.
has_repeated_email = df['Email'].isin(emails)

# Rows whose e-mail address is not among the repeated ones.
df_filtered = df[~has_repeated_email]

# Rows that belong to an e-mail address with several entries.
df_duplicated = df[has_repeated_email]

In [216]:
def keep_unique_email(df, value):
    """Collapse all rows of one e-mail address into a single row.

    The returned row is the first row of that address, with 'Leave Time'
    taken from its last row and 'Time in Session (minutes)' recomputed as the
    minutes between the first 'Join Time' and the last 'Leave Time'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Email', 'Join Time', 'Leave Time' and
        'Time in Session (minutes)'; the two time columns are expected to
        hold ``datetime.time`` values.
    value : str
        E-mail address to collapse.

    Returns
    -------
    pandas.DataFrame
        A one-row DataFrame (original index label kept), or an empty
        DataFrame when no row has this e-mail address. ``df`` itself is not
        modified.
    """
    key_df = df[df['Email'] == value]

    # Nothing to merge: avoid the IndexError that .iloc[0] would raise.
    if key_df.empty:
        return key_df

    # Rows are taken in their existing order: first row = first join,
    # last row = last leave.
    # NOTE(review): only the time of day is available here, so a session that
    # crosses midnight would yield a negative duration - confirm with the data.
    time1 = key_df['Join Time'].iloc[0]
    time2 = key_df['Leave Time'].iloc[-1]

    # get difference between time1 and time2 in minutes
    today = datetime.date.today()
    diff = (
        datetime.datetime.combine(today, time2)
        - datetime.datetime.combine(today, time1)
    ).total_seconds() / 60

    # Build the result from an explicit copy of the first row and assign whole
    # columns. Chained assignment (key_df[col].iloc[0] = ...) triggers
    # SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
    merged = key_df.iloc[[0]].copy()
    merged['Leave Time'] = [time2]
    merged['Time in Session (minutes)'] = [round(diff, 1)]

    return merged

In [None]:
# Collapse every repeated e-mail address into a single row, then append all of
# those rows with ONE concat; concatenating inside the loop copies the growing
# DataFrame on every iteration.
# NOTE: df_filtered is overwritten here, so re-running this cell appends the
# merged rows again - re-run the filtering cell first.
merged_rows = [keep_unique_email(df_duplicated, email) for email in emails]
df_filtered = pd.concat([df_filtered, *merged_rows])

In [None]:
# Write the de-duplicated attendees to an Excel file for manual checking.
# The context manager saves and closes the workbook on exit;
# ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter('data_files/for_check.xlsx') as writer:
    df_filtered.to_excel(writer, sheet_name='test')