In [1]:
"""
This Python application is designed to extract data from a CSV file and create separate pandas DataFrames 
for specific rows that contain key words such as 'Host Details', 'Panelist Details', and 'Attendee Details'. 
The program reads the CSV file row by row and detects the target rows containing the key words using a flag variable. 
When a target row is found, the program creates a new DataFrame and appends the subsequent rows to it until the next 
target row is found. Once all the target rows have been processed, the program returns a list of 
DataFrames containing the data from the CSV file that corresponds to each of the target rows. This application 
can be useful for data processing and analysis tasks that require separate DataFrames for different types of data, 
such as attendance reports or meeting logs.
"""

"\nThis Python application is designed to extract data from a CSV file and create separate pandas DataFrames \nfor specific rows that contain key words such as 'Host Details', 'Panelist Details', and 'Attendee Details'. \nThe program reads the CSV file row by row and detects the target rows containing the key words using a flag variable. \nWhen a target row is found, the program creates a new DataFrame and appends the subsequent rows to it until the next \ntarget row is found. Once all the target rows have been processed, the program returns a list of \nDataFrames containing the data from the CSV file that corresponds to each of the target rows. This application \ncan be useful for data processing and analysis tasks that require separate DataFrames for different types of data, \nsuch as attendance reports or meeting logs.\n"

In [2]:
import os
import csv
import datetime
import pandas as pd
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows

key_words = ['Report Generated:', 'Host Details', 'Panelist Details', 'Attendee Details']
df_names = [x.replace(' ', '_').lower() for x in key_words]

# Set the path to the folder containing the CSV files
folder_path = 'data_files/csv/'

# Get a list of all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

In [3]:
def section_data_list(file_path, key_words):
    
    all_rows = []
    key_words = key_words.copy()

    with open(file_path, 'r') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')

        # Read the CSV file row by row and process each row data
        for row in csvreader:
            all_rows.append(row)

    key_indexes = []
    for k in key_words:
        for i, row in enumerate(all_rows):
            if k in row:
                key_indexes.append(i)
                
    var_names = [x.replace(' ', '_').lower() for x in key_words]
    
    # Create an empty dictionary to store the variables
    section_data_list = {}
    
    for i in range(len(var_names)):
        # Get the start and end indices for the current variable
        start = key_indexes[i]
        end = key_indexes[i+1] if i < len(var_names)-1 else None

        section_data_list[var_names[i]] = all_rows[start:end]
    
    return section_data_list

In [4]:
def extract_section_data(key, section_data_list):
    
    data = section_data_list[key]
    
    # Create an empty list to store the rows after the target row
    length_of_row = []
    
    for row in data[1:]:
        length_of_row.append(len(row))

    # Identify the minimum value from the list of lenght of rows
    min_value = min(length_of_row)
    
    data_m = data[1:]

    for i, row in enumerate(data_m):
        if len(row) > min_value:
            data_m[i] = row[:-1]
        else:
            data_m[i] = row

    # Create a pandas DataFrame from the list of rows after the target row
    df = pd.DataFrame(data_m[1:], columns=data_m[0])
    df.name = key
    
    return df

In [5]:
# Create a list of file paths by concatenating the folder path with each CSV file name
list_of_file_paths = [folder_path + x for x in csv_files]

# Loop through each file path in the list
for file_path in list_of_file_paths:
    # Call a function to extract data from sections in the file that contain certain key words
    # The function returns a list of dictionaries containing the extracted data
    section_data_list = section_data_list(file_path, key_words)
    # Create an empty list to store the extracted dataframes
    dfs_ = []
    for key in df_names:
        # Call a function to extract a specific dataframe from the section data list
        # The function returns the extracted dataframe
        dfs_.append(extract_section_data(key, section_data_list))

In [None]:
"""
The following code deals with data that contains information about participants of a webinar. 
Some participants have connected multiple times for various reasons, resulting in multiple entries for the same person.
The goal is to remove all duplicate values while preserving the actual time elapsed from the first login to 
the last logout of the webinar.

"""

In [7]:
def right_format(df):
    # change format of columns
    df['Time in Session (minutes)'] = pd.to_numeric(df['Time in Session (minutes)'], errors='coerce')
    df['Join Time'] = pd.to_datetime(df['Join Time'], errors='coerce').dt.time
    df['Leave Time'] = pd.to_datetime(df['Leave Time'], errors='coerce').dt.time

    # replace '--' values with None
    df = df.replace('--', None)
    return df

In [8]:
def keep_unique_email(df, value):
    
    key_df = df[df['Email'] == value]
    
    time1 = key_df['Join Time'].iloc[0]
    time2 = key_df['Leave Time'].iloc[-1]
    key_df['Leave Time'].iloc[0] = time2
        
    # get difference between time1 and time2 in minutes
    diff = (datetime.datetime.combine(datetime.date.today(), time2) - datetime.datetime.combine(datetime.date.today(), time1)).total_seconds() / 60   
    key_df['Time in Session (minutes)'].iloc[0] = diff
    key_df = key_df.drop_duplicates(subset='Email', keep='first')
    
    return key_df

"""
The following function is designed to substitute verified contact information from a table into a webinar participant's 
details if they have attended previous webinars. If the participant has not attended before, the function saves 
their details to a separate file along with the information they entered.

"""
def old_and_new_contacts(df):
    column_list = list(df.columns)
    contacts_df = pd.read_excel('data_files/data_base.xlsx', sheet_name='webinar_invitees', header=3)
    df['Email'] = df['Email'].str.strip().str.lower()  
    new_contacts_df = df[~df['Email'].isin(contacts_df['Email'])]
    #& (~df['Last Name'].isin(contacts_df['last_name']))
    old_contacts_df = df[df['Email'].isin(contacts_df['Email'])]
    
    print('check point', len(df), '=',  len(new_contacts_df), '+', len(old_contacts_df))
    
    old_contacts_df = old_contacts_df.drop(['User Name (Original Name)', 'First Name', 'Last Name', 'Organization'], axis=1)
    
    merged_df = pd.merge(old_contacts_df, contacts_df, on='Email', how='left')
    
    merged_df = merged_df.rename(columns={'company_name': 'Organization', 'person': 'User Name (Original Name)', 'first_name': 'First Name',
       'last_name': 'Last Name'})

    old_contacts_df = merged_df.loc[:, column_list]
    
    old_and_new_contacts = pd.concat([old_contacts_df, new_contacts_df])
    
    new_contacts_df = new_contacts_df.loc[:,['User Name (Original Name)', 'First Name', 'Last Name', 'Email', 'Organization']]
    new_contacts_df = new_contacts_df.rename(columns={'Organization': 'company_name', 'User Name (Original Name)': 'person', 'First Name': 'first_name',
       'Last Name': 'last_name'})
    new_contacts_df['company_name'] = new_contacts_df['company_name'].str.upper().str.strip()
    new_contacts_df['person'] = new_contacts_df['person'].apply(lambda x: x.title())
    new_contacts_df['first_name'] = new_contacts_df['first_name'].str.strip().str.title()
    new_contacts_df['last_name'] = new_contacts_df['last_name'].str.strip().str.title()
    
    new_contacts_df = new_contacts_df.reindex(columns=['Email', 'company_name', 'person', 'first_name', 'last_name'])
    new_contacts_df.reset_index(inplace=True, drop=True )
    
    writer = pd.ExcelWriter('data_files/new_contacts_df.xlsx')
    new_contacts_df.to_excel(writer, sheet_name='new')
    writer.save()
    
    return old_and_new_contacts

In [9]:
dfs = dfs_.copy()

In [10]:
"""
This code snippet prepares data to be added to a report file in the format and columns that match the information 
in the original report file.

"""

hosts  = []
panelists = []

for df in dfs:
    if df.name == 'host_details':
        hosts = hosts + list(df['Email'].unique())
    elif df.name == 'panelist_details':
        hosts = hosts + list(df['Email'].unique())
        panelists = panelists + list(df['User Name (Original Name)'].unique())
    elif df.name == 'report_generated:':
        df['Topic'] = df['Topic'].str.replace('"', '').str.strip()
        webinar_name = df['Topic'].iloc[0]
        webinar_duration = df['Actual Duration (minutes)'].iloc[0]
        df['Actual Start Time'] = pd.to_datetime(df['Actual Start Time'])
        df['Date'] = df['Actual Start Time'].dt.strftime('%d-%m-%Y')
        webinar_date = df['Date'].iloc[0]
        
        
for i, df in enumerate(dfs):  
    if df.name == 'report_generated:':
        #df['Topic'] = df['Topic'].str.replace('"', '').str.strip()
        #webinar_name = df['Topic'].iloc[0]
        df['Name in Teams'] = webinar_name
        df['Panelists'] = ', '.join(panelists)
        df['Actual Start Time'] = pd.to_datetime(df['Actual Start Time'])
        df['Date'] = df['Actual Start Time'].dt.strftime('%d-%m-%Y')
        df = df.reindex(columns=['Name in Teams', 'Panelists', 'Date', 'Topic', 'Actual Duration (minutes)',
       '# Registered', '# Cancelled', 'Unique Viewers', 'Total Users',
       'Max Concurrent Views', 'Actual Start Time', 'Webinar ID'])
        dfs[i] = df
        dfs[i].name = 'webinar_list'
    elif df.name == 'panelist_details':
        dfs[i-1]['Host/Panelist'] = 'H'
        dfs[i]['Host/Panelist'] = 'P'
        dfs[i] = pd.concat([df, dfs[i-1]])
        dfs[i] = dfs[i].reindex(columns = ['User Name (Original Name)', 'Email', 'Host/Panelist', 'Time in Session (minutes)','Join Time','Leave Time'])
        dfs[i].name = 'panelist_details'
    elif df.name == 'attendee_details':
        dfs[i] = right_format(df)
        # get the emails that appear more than once
        counts = df['Email'].value_counts()
        emails = counts[counts > 1].index.tolist()
        # filter out the rows that contain the exclude emails
        df_filtered = df[~df['Email'].isin(emails)]
        df_duplicated = df[df['Email'].isin(emails)]
        for email in emails:
            prep_df = keep_unique_email(df_duplicated, email)
            df_filtered = pd.concat([df_filtered, prep_df])
        df_filtered['Webinar'] = webinar_name
        df_filtered['Date'] = webinar_date    
        df_filtered['Duration'] = webinar_duration
        df_filtered['Time in Session (minutes)'] = df_filtered['Time in Session (minutes)'].astype(float)
        df_filtered['Duration'] = df_filtered['Duration'].astype(float)
        df_filtered['Rate'] = (df_filtered['Time in Session (minutes)'] / df_filtered['Duration'])
        df_filtered['Rate'] = df_filtered['Rate'].round(2)
        df_filtered = df_filtered.reindex(columns=['Webinar', 'Date', 'Attended', 'User Name (Original Name)', 'First Name', 'Last Name',
       'Email', 'Organization', 'Registration Time', 'Approval Status',
       'Join Time', 'Leave Time', 'Time in Session (minutes)',
       'Country/Region Name', 'Duration', 'Rate'])
        # delete hosts from participants if they were in for any reasons
        df_filtered = df_filtered[~df_filtered['Email'].isin(hosts)]
        df_filtered = old_and_new_contacts(df_filtered)
        dfs[i] = df_filtered
        dfs[i].name = 'attendee_details'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  key_df['Leave Time'].iloc[0] = time2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  key_df['Time in Session (minutes)'].iloc[0] = diff
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  key_df['Leave Time'].iloc[0] = time2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  key_df['Time in Session (minutes)'].iloc[0]

check point 65 = 4 + 61


  writer.save()


In [11]:
"""
saving all the new data to the report file

"""

# Prepare data frame for Excel workbook sheet
def df_from_sheet(wkb, sht_name):
    # Select the sheet you want to modify
    sheet = wkb[sht_name]

    # Save data frame from sheet data
    data_days = sheet.values
    cols = next(data_days)[0:]
    df = pd.DataFrame(data_days, columns=cols)
    return df

# Write the updated DataFrame to the worksheet with the same format
def same_format_sheet(wkb, sht_name, df, file_name):
    sht = wkb[sht_name]
    # Create a new worksheet with the same formatting as the existing worksheet
    new_worksheet = wkb.create_sheet('Temp')
    new_worksheet.sheet_format = sht.sheet_format
    new_worksheet.sheet_properties = sht.sheet_properties
    new_worksheet.page_setup = sht.page_setup

    # Write the updated DataFrame to the worksheet
    for r in dataframe_to_rows(df, index=False, header=True):
        new_worksheet.append(r)

    # Delete all existing rows in the worksheet
    sht.delete_rows(1, sht.max_row)
    for row in new_worksheet.iter_rows():
        sht.append([cell.value for cell in row])

    # Delete the temporary worksheet
    wkb.remove(new_worksheet)

    # Save the changes to the Excel file
    wkb.save(file_name)

    # Close the workbook
    wkb.close()
    
# Create Data Frame from Excel file
def report_data_df(report_file, df):
    
    key = df.name
    new_df = df.copy()

    # Load the Reprt xlsx workbook
    workbook = openpyxl.load_workbook(report_file)
    df_sheets = df_from_sheet(workbook, key)
    try:
        df_sheets['Date'] = pd.to_datetime(df_sheets['Date'], format='%d/%m/%Y')
        df_sheets = df_sheets.dropna(how='all')
    except:
        df_sheets = df_sheets.dropna(how='all')

    # Save columns to a list
    cols = list(df_sheets.columns)

    # Loop over each column and convert it to the desired data type
    for col in new_df.columns:
        # Check if the data type of the column is a string ('object')
        col_dtype = df_sheets[col].dtype
        # Convert the column to dtype of original dataframe
        if col_dtype == 'datetime64[ns]':
            new_df[col] = pd.to_datetime(new_df[col], format='%d-%m-%Y')
        elif col_dtype == 'float':
            # Apply the transformation to the entire column
            #new_df[col] = new_df[col].apply(lambda x: float(x.replace(',', '.')))
            new_df[col] = new_df[col].astype(col_dtype)
        else:
            new_df[col] = new_df[col].astype(col_dtype)
    df_to_write = pd.concat([df_sheets, new_df], ignore_index=True)
    same_format_sheet(workbook, key, df_to_write, report_file)
    print(key, 'done')
    
report_file = 'data_files/report.xlsx'

for df in dfs:
    try:
        report_data_df(report_file, df)
    except:
        pass

webinar_list done
panelist_details done
attendee_details done
