## Metrics per day
- this is code for reading multiple frontpage files and calculating metrics per timestamp (here-daily)
- code is based on the example from "basic_file_read" by Bernhard

In [21]:
import glob
import pandas as pd
import numpy as np
from collections import Counter

In [22]:
#below choose which directory to work with (all files or test files)
path = 'homepage_csvs/test/'
#path = 'homepage_csvs/chaturbate/'

# get all filenames from the directory
csv_files = glob.glob(path + '*.csv')

In [23]:

daily_dict = {} #create an empty dictionary to store all metrics for daily calculations
daily_tags = {} #create an empty dictionary to store a list of all hashtags per day
counter = 0

# iterate over all files to create an array of lists with all metrics for each day
for filename in sorted(csv_files):
    
    # extract date from filename
    fileparts = filename.split('_')
    date = fileparts[2]

    if date not in daily_dict: #add the date to a dictionary with keys for each date
        daily_dict[date] = {"perf_counts":[], "view_sums":[], "show_length_sums":[], 
                            "female_counts":[], "male_counts":[], "trans_counts":[], "couple_counts":[]} 
                            #create a dictionary within each date for metrics and list for each value
        daily_tags[date] = []

    # read CSV file and get metrics
    df = pd.read_csv(filename)
    perf_count = len(df) #number of shows in each csv file
    view_sum = sum(df["viewers"]) #sum of viewers in each csv file
    show_len_sum = sum(df["time"]) #sum of show time in each csv file
    female_count = df.female.sum() #number of female performers in each csv file
    male_count = df.male.sum() #number of male performers
    trans_count = df.trans.sum() #number of trans performers
    couple_count = df.couple.sum() #number of couples performing
    hashtags_list = df["tags"].tolist() #makes a list of all the hashtags used that day

    #below, takes the list of hashtags collected and flattens it 
    hashtags = []
    for object in hashtags_list:
        #print(object)
        if type(object) is str:
            #print(type(object))
            object = object.split(",")
            for word in object:
                hashtags.append(word)
        else:
            #print(type(object))
            hashtags.append(object)

    #print(hashtags)

    # add all the new collected values in the nested dictionary
    daily_dict[date]["perf_counts"].append(perf_count)
    daily_dict[date]["view_sums"].append(view_sum)
    daily_dict[date]["show_length_sums"].append(show_len_sum)
    daily_dict[date]["female_counts"].append(female_count)
    daily_dict[date]["male_counts"].append(male_count)
    daily_dict[date]["trans_counts"].append(trans_count)
    daily_dict[date]["couple_counts"].append(couple_count)

    #add the collected hashtags in their own dictionary
    daily_tags[date]+=hashtags



    print(str(counter) + ' ' + date)
    counter += 1
    

#print(daily_dict)
#print(daily_tags)

0 2021-11-09
1 2021-11-09
2 2021-11-09
3 2021-11-09
4 2021-11-09
5 2021-11-09
6 2021-11-09
7 2021-11-09
8 2021-11-09
9 2021-11-09
10 2021-11-09
11 2021-11-09
12 2021-11-09
13 2021-11-09
14 2021-11-09
15 2021-11-09
16 2021-11-09
17 2021-11-09
18 2021-11-09
19 2021-11-09
20 2021-11-09
21 2021-11-09
22 2021-11-09
23 2021-11-09
24 2021-11-09
25 2021-11-09
26 2021-11-09
27 2021-11-09
28 2021-11-09
29 2021-11-09
30 2021-11-09
31 2021-11-09
32 2021-11-09
33 2021-11-09
34 2021-11-09
35 2021-11-09
36 2021-11-09
37 2021-11-09
38 2021-11-09
39 2021-11-09
40 2021-11-09
41 2021-11-09
42 2021-11-09
43 2021-11-09
44 2021-11-09
45 2021-11-09
46 2021-11-09
47 2021-11-11
48 2021-11-11
49 2021-11-11
50 2021-11-11
51 2021-11-11
52 2021-11-12
53 2021-11-12
54 2021-11-12


In [24]:
daily_avgs = []

# next step is to iterate over array to calculate daily averages
for date in daily_dict:
    #for each date calculate
    daily_shows_avg = np.mean(daily_dict[date]["perf_counts"]) #average length of page (number of performances)
    daily_views_avg = sum(daily_dict[date]["view_sums"]) / sum(daily_dict[date]["perf_counts"]) #average number of viewers per day
    daily_length_avg = sum(daily_dict[date]["show_length_sums"]) / sum(daily_dict[date]["perf_counts"]) #average length of show per day
    daily_female_avg = np.mean(daily_dict[date]["female_counts"])
    daily_female_percent = sum(daily_dict[date]["female_counts"]) / sum(daily_dict[date]["perf_counts"]) * 100
    daily_male_avg = np.mean(daily_dict[date]["male_counts"])
    daily_male_percent = sum(daily_dict[date]["male_counts"]) / sum(daily_dict[date]["perf_counts"]) * 100
    daily_trans_avg = np.mean(daily_dict[date]["trans_counts"])
    daily_trans_percent = sum(daily_dict[date]["trans_counts"]) / sum(daily_dict[date]["perf_counts"]) * 100
    daily_couple_avg = np.mean(daily_dict[date]["couple_counts"])
    daily_couple_percent = sum(daily_dict[date]["couple_counts"]) / sum(daily_dict[date]["perf_counts"]) * 100


    #then add everything to a nice dictionary
    daily_avgs.append({'date':date,'shows_average':round(daily_shows_avg, 2), 'viewers_average':round(daily_views_avg, 2), 'show_length_average':round(daily_length_avg, 2),
                        'average_females':round(daily_female_avg,2), 'percentage_females':round(daily_female_percent,2), 'average_males':round(daily_male_avg,2), 'percentage_males':round(daily_male_percent,2),
                        'average_trans':round(daily_trans_avg,2), 'percentage_trans':round(daily_trans_percent,2), 'average_couples':round(daily_couple_avg), 'percentage_couples':round(daily_couple_percent,2)})
    #print(daily_avgs)

#and make a dataframe from the dictionary
df = pd.DataFrame.from_dict(daily_avgs)
df.to_csv('daily_averages.csv',index=False) #and save it as a csv file
df

Unnamed: 0,date,shows_average,viewers_average,show_length_average,average_females,percentage_females,average_males,percentage_males,average_trans,percentage_trans,average_couples,percentage_couples
0,2021-11-09,6857.77,48.56,150.35,4798.19,69.97,1224.04,17.85,507.04,7.39,328,4.79
1,2021-11-11,5704.6,50.23,155.0,4011.6,70.32,985.4,17.27,432.6,7.58,275,4.82
2,2021-11-12,6509.0,45.25,172.28,4594.0,70.58,1082.33,16.63,546.33,8.39,286,4.4


In [25]:
# here i make a file with the counted hashtags over time
# FUTURE: not include all hashtags, but only those that are counted above a threshold?

daily_tags_counted = []

for date in daily_tags:  #iterating though every date
    daily_counts = Counter(daily_tags[date]) #instead of having a list of all hashtags, this makes a dictionary with counts of each hashtag
    #print(daily_counts)

    for tag in daily_counts: 
        daily_tags_counted.append({"date": date, "hashtag": tag , "count": daily_counts[tag]})
        #daily_tags_counted[date][tag] = daily_counts[tag]

df = pd.DataFrame.from_dict(daily_tags_counted)
df.to_csv('daily_hashtags.csv',index=False)
df
    

Unnamed: 0,date,hashtag,count
0,2021-11-09,,50177
1,2021-11-09,#lovense,41098
2,2021-11-09,#tattoo,3633
3,2021-11-09,#ink,28
4,2021-11-09,#latina,47531
...,...,...,...
5416,2021-11-12,#fountainsquirt,1
5417,2021-11-12,#bwc,1
5418,2021-11-12,#teasing,1
5419,2021-11-12,#africa,1


## Metrics per hour in day
- next, we use a similar approach to look at metrics per hour and analyze performer work "rhythms" during different times of day

In [26]:
#make the master dictionary for collecting hourly data
hourly_dict = {}
#IF HASHTAGS ARE INTERESTING PER TIME IN DAY, THEN HERE THAT SHOULD BE ADDED THE SAME WAY AS IN METRICS PER DAY
counter = 0

for filename in sorted(csv_files):

    # extract hour from filename
    fileparts = filename.split('_')
    time = fileparts[3]
    timeparts = time.split('-')
    hour = timeparts[0]

    if hour not in hourly_dict:
       hourly_dict[hour] = {"perf_counts":[], "view_sums":[], "show_length_sums":[],
                            "female_counts":[], "male_counts":[], "trans_counts":[], "couple_counts":[]}
    
    # read CSV file and collect needed metrics
    df = pd.read_csv(filename)
    perf_count = len(df) #number of shows in each csv file
    view_sum = sum(df["viewers"]) #sum of viewers in each csv file
    show_len_sum = sum(df["time"]) #sum of show length in each csv file
    female_count = df.female.sum() #number of female performers in each csv file
    male_count = df.male.sum() #number of male performers
    trans_count = df.trans.sum() #number of trans performers
    couple_count = df.couple.sum() #number of couples performing


    hourly_dict[hour]["perf_counts"].append(perf_count)
    hourly_dict[hour]["view_sums"].append(view_sum)
    hourly_dict[hour]["show_length_sums"].append(show_len_sum)
    hourly_dict[hour]["female_counts"].append(female_count)
    hourly_dict[hour]["male_counts"].append(male_count)
    hourly_dict[hour]["trans_counts"].append(trans_count)
    hourly_dict[hour]["couple_counts"].append(couple_count)


    print(str(counter) + ' ' + hour)
    counter += 1
    

#print(hourly_dict)


0 00
1 00
2 01
3 01
4 02
5 02
6 03
7 03
8 04
9 04
10 05
11 05
12 06
13 06
14 07
15 07
16 08
17 08
18 09
19 09
20 10
21 10
22 11
23 11
24 12
25 12
26 13
27 13
28 14
29 14
30 15
31 15
32 16
33 16
34 17
35 17
36 18
37 18
38 19
39 19
40 20
41 20
42 21
43 22
44 22
45 23
46 23
47 10
48 11
49 12
50 13
51 13
52 08
53 09
54 10


In [27]:
hourly_avgs = []

# next step is to iterate over array to calculate hourly metrics
for hour in hourly_dict:
    
    hourly_shows_avg = np.mean(hourly_dict[hour]["perf_counts"])
    hourly_views_avg = sum(hourly_dict[hour]["view_sums"]) / sum(hourly_dict[hour]["perf_counts"])
    hourly_length_avg = sum(hourly_dict[hour]["show_length_sums"]) / sum(hourly_dict[hour]["perf_counts"])
    hourly_female_avg = np.mean(hourly_dict[hour]["female_counts"])
    hourly_female_percent = sum(hourly_dict[hour]["female_counts"]) / sum(hourly_dict[hour]["perf_counts"]) * 100
    hourly_male_avg = np.mean(hourly_dict[hour]["male_counts"])
    hourly_male_percent = sum(hourly_dict[hour]["male_counts"]) / sum(hourly_dict[hour]["perf_counts"]) * 100
    hourly_trans_avg = np.mean(hourly_dict[hour]["trans_counts"])
    hourly_trans_percent = sum(hourly_dict[hour]["trans_counts"]) / sum(hourly_dict[hour]["perf_counts"]) * 100
    hourly_couple_avg = np.mean(hourly_dict[hour]["couple_counts"])
    hourly_couple_percent = sum(hourly_dict[hour]["couple_counts"]) / sum(hourly_dict[hour]["perf_counts"]) * 100
    
    hourly_avgs.append({'shows_average':round(hourly_shows_avg, 2), 'viewers_average':round(hourly_views_avg, 2), 'show_length_average':round(hourly_length_avg, 2),
                        'average_females':round(hourly_female_avg,2), 'percentage_females':round(hourly_female_percent,2), 'average_males':round(hourly_male_avg,2), 'percentage_males':round(hourly_male_percent,2),
                        'average_trans':round(hourly_trans_avg,2), 'percentage_trans':round(hourly_trans_percent,2), 'average_couples':round(hourly_couple_avg), 'percentage_couples':round(hourly_couple_percent,2)})


df = pd.DataFrame.from_dict(hourly_avgs)
df.to_csv('hourly_averages.csv',index=False)
df


Unnamed: 0,shows_average,viewers_average,show_length_average,average_females,percentage_females,average_males,percentage_males,average_trans,percentage_trans,average_couples,percentage_couples
0,7822.0,50.98,158.89,5384.0,68.83,1449.5,18.53,581.5,7.43,407,5.2
1,7422.5,47.87,177.94,5126.0,69.06,1366.5,18.41,551.0,7.42,379,5.11
2,6908.0,46.87,177.33,4779.5,69.19,1259.0,18.23,545.0,7.89,324,4.7
3,6358.5,47.26,156.66,4261.0,67.01,1255.5,19.75,514.5,8.09,328,5.15
4,6391.0,46.83,133.68,4242.0,66.37,1305.5,20.43,493.0,7.71,350,5.48
5,6661.0,46.11,125.54,4471.0,67.12,1284.5,19.28,548.0,8.23,358,5.37
6,6400.0,47.53,132.04,4273.0,66.77,1279.0,19.98,525.5,8.21,322,5.04
7,6020.5,49.02,145.31,4081.0,67.79,1125.5,18.69,527.0,8.75,287,4.77
8,6208.33,49.18,158.11,4318.67,69.56,1071.33,17.26,527.33,8.49,291,4.69
9,6024.33,46.58,169.86,4227.0,70.17,1029.0,17.08,490.0,8.13,278,4.62
