## Clean and graph each CSV instead of combining

#### setup

In [2]:
import os, glob
import pandas as pd
import seaborn as sns
import numpy as np

# Working directory that holds the attendance CSV exports.
# Exactly one location is active; the others are kept for switching machines.

# personal
data_dir = r"C:\Users\Zack\Desktop\work\T-Mobile\data\CSAM data\attendance\LnLs"
os.chdir(data_dir)
# work
# os.chdir(r"C:\Users\Zjaffen1\Desktop\CSAM data\attendance\LnLs")
# os.chdir(r"C:\Users\Zjaffen1\Desktop\CSAM data\attendance\webex")

#### get and check file names

In [3]:
# Build the list of CSV file names in the working directory.
# glob returns matches in arbitrary (filesystem) order, so sort them to keep
# the event order -- and everything derived from it -- reproducible.
extension = 'csv'
all_filenames = sorted(glob.glob('*.{}'.format(extension)))

# print list to check
# DISABLE OR LIMIT RANGE if there are a lot of files
for filename in all_filenames:
    print(filename)

Consumer Privacy.csv
Cyberwarfare.csv
Forgotten Passwords.csv
Hacker Stories.csv
Improv Show.csv
Managing Credentials.csv
PAS and AIM Demo.csv
Privacy Matters.csv
Secure Applications.csv


#### function for converting entry/leave times into int minute values

In [4]:
def time_to_minutes(str_time):
    ''' takes time string in format : '12:02 pm'
        returns int of minutes      :  722

        The value is minutes since midnight on a 12-hour clock, so
        '12:05 am' -> 5 and '12:05 pm' -> 725. The am/pm marker is
        case-insensitive and surrounding/extra whitespace is ignored.

        Raises ValueError if the string is not '<hour>:<minute> <am|pm>'
        shaped (wrong number of parts or non-integer hour/minute).'''

    # split() with no argument tolerates repeated or leading/trailing spaces
    time, period = str_time.split()
    hour, minute = map(int, time.split(':'))
    period = period.lower()
    if period == 'pm' and hour < 12:
        hour += 12
    elif period == 'am' and hour == 12:
        # 12:xx am is the first hour of the day, not the thirteenth
        hour = 0
    return hour*60 + minute

#### create data frames for each CSV and clean

In [5]:
# print(time_to_minutes("10:01 PM"))
# TODO: get start time for meetings, start graph then
# TODO: determine 'stop' time

In [7]:

# One dataframe per attendance export. The files are read as tab-separated
# UTF-16 text with the first two lines skipped.
file_df_list = [pd.read_csv(f, sep="\t", skiprows=[0,1], encoding="utf-16-le") for f in all_filenames]

# Trackers filled by the cleaning loop (one entry per file unless noted)
cols = []           # event names: file names without their extension
minute_lists = []   # per event: number of attendees present at each minute
num_attended = {}   # email -> number of distinct events that person attended
avg_durations = []  # per event: total minutes stayed / number of unique emails

for filename, event_df in zip(all_filenames, file_df_list):
    # replace the event date with the event name taken from the file name
    event_name = os.path.splitext(filename)[0]
    event_df.rename(columns={"Date": "Event"}, inplace=True)
    event_df['Event'] = event_name
    cols.append(event_name)

    # An export without attendee rows has nothing to clean or count; keep the
    # trackers aligned with cols and move on instead of crashing below.
    if event_df.empty:
        minute_lists.append([])
        avg_durations.append(np.nan)
        continue

    # 'Duration' is a string whose first space-separated token is the number
    # of minutes (presumably something like "52 mins" -- verify against export)
    event_df['Duration'] = event_df['Duration'].apply(lambda x: int(x.split(' ')[0]))

    # join/leave clock times -> minutes since midnight; emails -> one case
    event_df['Start time'] = event_df['Start time'].apply(time_to_minutes)
    event_df['End time'] = event_df['End time'].apply(time_to_minutes)
    event_df['Email'] = event_df['Email'].str.lower()

    # the meeting is taken to start when the first attendee joins
    start = event_df['Start time'].min()

    # join/leave times as minutes relative to the meeting start
    event_df['Min Joined'] = event_df['Start time'] - start
    event_df['Min Left'] = event_df['End time'] - start

    # +1 at the minute someone joins, -1 at the minute they leave; the running
    # sum of these deltas is the head count at every minute
    diffs = [0] * (int(event_df['Min Left'].max()) + 1)
    for joined, left in zip(event_df['Min Joined'], event_df['Min Left']):
        diffs[joined] += 1
        diffs[left] -= 1

    # Count each person once per event. Counting every row would credit a
    # person who dropped and re-joined with several events.
    attendees = event_df['Email'].unique()
    for email in attendees:
        num_attended[email] = num_attended.get(email, 0) + 1

    # average stay: every row's duration counts, divided by unique attendees
    duration_total = int(event_df['Duration'].sum())
    avg_durations.append(round(duration_total / len(attendees), 2))

    # current attendees in meeting at each minute
    mins = np.cumsum(diffs).tolist()

    # null-out last minute to prevent "0" on graph
    mins[-1] = np.nan

    minute_lists.append(mins)

print(avg_durations)

[51.8, 53.87, 50.76, 50.7, 75.48, 45.74, 40.9, 45.96, 46.08]


#### for the single skype session

In [None]:
# os.chdir(r"C:\Users\Zjaffen1\Desktop\CSAM data\attendance")
os.chdir(r"C:\Users\Zack\Desktop\work\T-Mobile\data\CSAM data\attendance")
skype_data = pd.read_csv(r'Social Media Safety.csv', sep="\t")

# add to existing trackers
# NOTE(review): nothing is appended to minute_lists for this session, so after
# this cell cols is one entry longer than minute_lists -- confirm intended.
cols.append("Social Media Safety")

# lower all emails for consistency (.str.lower leaves missing values alone
# instead of raising on them)
skype_data['Email'] = skype_data['Email'].str.lower()

# distinct, non-missing emails seen in this session
emails = skype_data['Email'].dropna().unique()

# u_emails is not defined in the visible part of this notebook; reuse it if an
# earlier cell created it, otherwise start a fresh list so the cell can run.
try:
    u_emails
except NameError:
    u_emails = []

# add emails not tracked yet, skipping anything that looks like a test account
already_tracked = set(u_emails)
for email in emails:
    if email not in already_tracked and 'test' not in email:
        u_emails.append(email)
        already_tracked.add(email)

# total minutes spent in the session across all rows
minutes = skype_data['Time in Session (minutes)'].sum()
total_minutes = minutes

print(total_minutes)

#### counting how many attended multiple sessions

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy

def absolute_value(val):
    """Turn a pie-wedge percentage back into a whole-number count.

    val is a percentage (0-100) of the total of the module-level ``slices``
    list, which is read when the function is called. Returns the matching
    share of that total, rounded to the nearest integer, as an int.
    """
    share_of_total = val/100.*sum(slices)
    return int(numpy.round(share_of_total, 0))

# Pie chart of how many people attended 1, 2, 3, ... events.
plt.figure(figsize=(10,10))

# res maps "number of events attended" -> "how many people attended that many"
res = Counter(num_attended.values())
print(res)

# wedge labels are the attendance counts, wedge sizes the number of people;
# absolute_value reads slices to print head counts instead of percentages
labels2 = list(res.keys())
slices = list(res.values())

plt.pie(slices, labels=labels2, autopct=absolute_value)
plt.title('Number of Events Attended')
plt.savefig('Number of Events Attended.png')

In [None]:
# Length of the longest per-minute attendance list (0 when there are none).
longest = max((len(minutes) for minutes in minute_lists), default=0)

# Pad every list with NaN up to that length so they can share one data frame.
# The lists are extended in place, as before.
for minutes in minute_lists:
    minutes.extend([np.nan] * (longest - len(minutes)))

# CREATE DATA FRAME!!!
# One column per event, one row per minute. Names and lists are paired with
# zip, so a name in cols that has no attendance list is left out instead of
# raising an IndexError.
df = pd.DataFrame(dict(zip(cols, minute_lists)))

# one line trace per event: x = minute (row index), y = attendee count
fig = go.Figure([{
    'x': df.index,
    'y': df[col],
    'name': col
}  for col in df.columns])

fig.update_xaxes(title_text='Time (minutes)')
fig.update_yaxes(title_text='Attendees')
fig.write_image("event attendance.png")
fig.show()

### section for combining into one csv

In [None]:
# combine and export
# combined_csv = pd.concat(file_df_list)
# combined_csv.to_csv( "combined_attendance.csv", index=False, encoding='utf-8-sig')
# print("success!!")

In [None]:
import plotly.graph_objects as go

# Scratch example: pad lists of different lengths with NaN and plot them.
lst0 = ['lst1', 'lst2', 'lst3']
lst1 = [1,2,3,4,5,6,7,8,9,10,9,8,7,6,5,4]
lst2 = [1,3,6,9,14,16,17,16,17,14,13,19,17,16,15,14,10,8,4,0]
lst3 = [1,2,3,4,5,6,7,8,6,5,3]
lst_lst = [lst1, lst2, lst3]

# max() on a list of lists compares them element by element, not by length,
# so the longest list has to be selected with key=len. Computed once, before
# any list is extended.
target_len = len(max(lst_lst, key=len))
for values in lst_lst:
    values.extend([np.nan] * (target_len - len(values)))

# one column per name in lst0, in the same order as lst_lst
df = pd.DataFrame(dict(zip(lst0, lst_lst)))

print(df.head())

fig = go.Figure([{
    'x': df.index,
    'y': df[col],
    'name': col
}  for col in df.columns])

fig.show()


In [None]:
# Scratch check: length of the longest of the sample lists.
lst0 = ['lst1', 'lst2', 'lst3']
lst1 = [1,2,3,4,5,6,7,8,9,10,9,8,7,6,5,4]
lst2 = [1,3,6,9,14,16,17,16,17,14,13,19,17,16,15,14,10,8,4,0]
lst3 = [1,2,3,4,5,6,7,8,6,5,3]
lst_lst = [lst1, lst2, lst3]

# max() without a key compares the lists element by element and would return
# the "largest" list, not the longest one; key=len selects by length.
longest_len = len(max(lst_lst, key=len))
print(longest_len)
