## Metrics per day
- this is code for reading multiple frontpage files and calculating metrics per timestamp (here-daily)
- code is based on the example from "basic_file_read" by Bernhard

In [28]:
import glob
import pandas as pd
import numpy as np
from collections import Counter

In [29]:
#below choose which directory to work with (all files or test files)
#path = 'homepage_csvs/test/'
path = 'homepage_csvs/chaturbate/'

# get all filenames from the directory
csv_files = glob.glob(path + '*.csv')

In [30]:

daily_dict = {} #create an empty dictionary to store all metrics for daily calculations
daily_tags = {} #create an empty dictionary to store a list of all hashtags per day
counter = 0

# iterate over all files to create an array of lists with all metrics for each day
for filename in sorted(csv_files):
    
    # extract date from filename
    fileparts = filename.split('_')
    date = fileparts[2]

    if date not in daily_dict: #add the date to a dictionary with keys for each date
        daily_dict[date] = {"perf_counts":[], "view_sums":[], "show_length_sums":[], 
                            "female_counts":[], "male_counts":[], "trans_counts":[], "couple_counts":[]} 
                            #create a dictionary within each date for metrics and list for each value
        daily_tags[date] = []

    # read CSV file and get metrics
    df = pd.read_csv(filename)
    perf_count = len(df) #number of shows in each csv file
    view_sum = sum(df["viewers"]) #sum of viewers in each csv file
    show_len_sum = sum(df["time"]) #sum of show time in each csv file
    female_count = df.female.sum() #number of female performers in each csv file
    male_count = df.male.sum() #number of male performers
    trans_count = df.trans.sum() #number of trans performers
    couple_count = df.couple.sum() #number of couples performing
    hashtags_list = df["tags"].tolist() #makes a list of all the hashtags used that day

    #below, takes the list of hashtags collected and flattens it 
    hashtags = []
    for object in hashtags_list:
        #print(object)
        if type(object) is str:
            #print(type(object))
            object = object.split(",")
            for word in object:
                hashtags.append(word)
        else:
            #print(type(object))
            hashtags.append(object)

    #print(hashtags)

    # add all the new collected values in the nested dictionary
    daily_dict[date]["perf_counts"].append(perf_count)
    daily_dict[date]["view_sums"].append(view_sum)
    daily_dict[date]["show_length_sums"].append(show_len_sum)
    daily_dict[date]["female_counts"].append(female_count)
    daily_dict[date]["male_counts"].append(male_count)
    daily_dict[date]["trans_counts"].append(trans_count)
    daily_dict[date]["couple_counts"].append(couple_count)

    #add the collected hashtags in their own dictionary
    daily_tags[date]+=hashtags



    print(str(counter) + ' ' + date)
    counter += 1
    

#print(daily_dict)
#print(daily_tags)

0 2021-11-03
1 2021-11-03
2 2021-11-03
3 2021-11-03
4 2021-11-03
5 2021-11-03
6 2021-11-03
7 2021-11-03
8 2021-11-04
9 2021-11-04
10 2021-11-04
11 2021-11-04
12 2021-11-04
13 2021-11-04
14 2021-11-04
15 2021-11-04
16 2021-11-04
17 2021-11-04
18 2021-11-04
19 2021-11-04
20 2021-11-04
21 2021-11-04
22 2021-11-04
23 2021-11-04
24 2021-11-04
25 2021-11-04
26 2021-11-04
27 2021-11-04
28 2021-11-04
29 2021-11-04
30 2021-11-04
31 2021-11-04
32 2021-11-04
33 2021-11-04
34 2021-11-04
35 2021-11-04
36 2021-11-04
37 2021-11-04
38 2021-11-04
39 2021-11-04
40 2021-11-04
41 2021-11-04
42 2021-11-04
43 2021-11-04
44 2021-11-04
45 2021-11-04
46 2021-11-04
47 2021-11-04
48 2021-11-04
49 2021-11-04
50 2021-11-04
51 2021-11-04
52 2021-11-04
53 2021-11-04
54 2021-11-04
55 2021-11-04
56 2021-11-05
57 2021-11-05
58 2021-11-05
59 2021-11-05
60 2021-11-05
61 2021-11-05
62 2021-11-05
63 2021-11-05
64 2021-11-05
65 2021-11-05
66 2021-11-05
67 2021-11-05
68 2021-11-05
69 2021-11-05
70 2021-11-05
71 2021-11-05
72

In [31]:
daily_avgs = []

# next step is to iterate over array to calculate daily averages
for date in daily_dict:
    #for each date calculate
    daily_shows_avg = np.mean(daily_dict[date]["perf_counts"]) #average length of page (number of performances)
    daily_views_avg = sum(daily_dict[date]["view_sums"]) / sum(daily_dict[date]["perf_counts"]) #average number of viewers per day
    daily_length_avg = sum(daily_dict[date]["show_length_sums"]) / sum(daily_dict[date]["perf_counts"]) #average length of show per day
    daily_female_avg = np.mean(daily_dict[date]["female_counts"])
    daily_female_percent = sum(daily_dict[date]["female_counts"]) / sum(daily_dict[date]["perf_counts"]) * 100
    daily_male_avg = np.mean(daily_dict[date]["male_counts"])
    daily_male_percent = sum(daily_dict[date]["male_counts"]) / sum(daily_dict[date]["perf_counts"]) * 100
    daily_trans_avg = np.mean(daily_dict[date]["trans_counts"])
    daily_trans_percent = sum(daily_dict[date]["trans_counts"]) / sum(daily_dict[date]["perf_counts"]) * 100
    daily_couple_avg = np.mean(daily_dict[date]["couple_counts"])
    daily_couple_percent = sum(daily_dict[date]["couple_counts"]) / sum(daily_dict[date]["perf_counts"]) * 100


    #then add everything to a nice dictionary
    daily_avgs.append({'date':date,'shows_average':round(daily_shows_avg, 2), 'viewers_average':round(daily_views_avg, 2), 'show_length_average':round(daily_length_avg, 2),
                        'average_females':round(daily_female_avg,2), 'percentage_females':round(daily_female_percent,2), 'average_males':round(daily_male_avg,2), 'percentage_males':round(daily_male_percent,2),
                        'average_trans':round(daily_trans_avg,2), 'percentage_trans':round(daily_trans_percent,2), 'average_couples':round(daily_couple_avg), 'percentage_couples':round(daily_couple_percent,2)})
    #print(daily_avgs)

#and make a dataframe from the dictionary
df = pd.DataFrame.from_dict(daily_avgs)
df.to_csv('daily_averages.csv',index=False) #and save it as a csv file
df

Unnamed: 0,date,shows_average,viewers_average,show_length_average,average_females,percentage_females,average_males,percentage_males,average_trans,percentage_trans,average_couples,percentage_couples
0,2021-11-03,7825.75,51.51,132.12,5491.75,70.18,1407.62,17.99,535.00,6.84,391,5.00
1,2021-11-04,6996.94,50.36,147.90,4904.04,70.09,1233.25,17.63,518.23,7.41,341,4.88
2,2021-11-05,7161.96,48.21,149.61,5014.52,70.02,1251.23,17.47,544.56,7.60,352,4.91
3,2021-11-06,6913.75,51.71,155.66,4783.81,69.19,1253.10,18.12,523.73,7.58,353,5.11
4,2021-11-07,5508.87,65.37,154.55,3631.02,65.91,1123.19,20.39,458.17,8.32,296,5.38
...,...,...,...,...,...,...,...,...,...,...,...,...
62,2022-01-12,6544.91,56.00,141.26,4524.25,69.13,1218.18,18.61,487.48,7.45,315,4.81
63,2022-01-13,6546.20,57.68,146.77,4536.71,69.30,1199.76,18.33,491.38,7.51,318,4.86
64,2022-01-14,6979.38,52.31,149.10,4881.08,69.94,1263.32,18.10,498.60,7.14,336,4.82
65,2022-01-15,6679.51,58.65,149.39,4581.81,68.59,1262.13,18.90,502.94,7.53,333,4.98


In [32]:
# here i make a file with the counted hashtags over time
# FUTURE: not include all hashtags, but only those that are counted above a threshold?

daily_tags_counted = []

for date in daily_tags:  #iterating though every date
    daily_counts = Counter(daily_tags[date]) #instead of having a list of all hashtags, this makes a dictionary with counts of each hashtag
    #print(daily_counts)

    for tag in daily_counts: 
        daily_tags_counted.append({"date": date, "hashtag": tag , "count": daily_counts[tag]})
        #daily_tags_counted[date][tag] = daily_counts[tag]

df = pd.DataFrame.from_dict(daily_tags_counted)
df.to_csv('daily_hashtags.csv',index=False)
df
    

Unnamed: 0,date,hashtag,count
0,2021-11-03,,9881
1,2021-11-03,#daddy,471
2,2021-11-03,#18,6385
3,2021-11-03,#young,4830
4,2021-11-03,#teen,6615
...,...,...,...
158970,2022-01-16,#ice,1
158971,2022-01-16,#colorhair,1
158972,2022-01-16,#jocks,1
158973,2022-01-16,#spoilme,1


## Metrics per hour in day
- next, we use a similar approach to look at metrics per hour and analyze performer work "rhythms" during different times of day

In [33]:
#make the master dictionary for collecting hourly data
hourly_dict = {}
#IF HASHTAGS ARE INTERESTING PER TIME IN DAY, THEN HERE THAT SHOULD BE ADDED THE SAME WAY AS IN METRICS PER DAY
counter = 0

for filename in sorted(csv_files):

    # extract hour from filename
    fileparts = filename.split('_')
    time = fileparts[3]
    timeparts = time.split('-')
    hour = timeparts[0]

    if hour not in hourly_dict:
       hourly_dict[hour] = {"perf_counts":[], "view_sums":[], "show_length_sums":[],
                            "female_counts":[], "male_counts":[], "trans_counts":[], "couple_counts":[]}
    
    # read CSV file and collect needed metrics
    df = pd.read_csv(filename)
    perf_count = len(df) #number of shows in each csv file
    view_sum = sum(df["viewers"]) #sum of viewers in each csv file
    show_len_sum = sum(df["time"]) #sum of show length in each csv file
    female_count = df.female.sum() #number of female performers in each csv file
    male_count = df.male.sum() #number of male performers
    trans_count = df.trans.sum() #number of trans performers
    couple_count = df.couple.sum() #number of couples performing


    hourly_dict[hour]["perf_counts"].append(perf_count)
    hourly_dict[hour]["view_sums"].append(view_sum)
    hourly_dict[hour]["show_length_sums"].append(show_len_sum)
    hourly_dict[hour]["female_counts"].append(female_count)
    hourly_dict[hour]["male_counts"].append(male_count)
    hourly_dict[hour]["trans_counts"].append(trans_count)
    hourly_dict[hour]["couple_counts"].append(couple_count)


    print(str(counter) + ' ' + hour)
    counter += 1
    

#print(hourly_dict)


0 20
1 20
2 21
3 21
4 22
5 22
6 23
7 23
8 00
9 00
10 01
11 01
12 02
13 02
14 03
15 03
16 04
17 04
18 05
19 05
20 06
21 06
22 07
23 07
24 08
25 08
26 09
27 09
28 10
29 10
30 11
31 11
32 12
33 12
34 13
35 13
36 14
37 14
38 15
39 15
40 16
41 16
42 17
43 17
44 18
45 18
46 19
47 19
48 20
49 20
50 21
51 21
52 22
53 22
54 23
55 23
56 00
57 00
58 01
59 01
60 02
61 02
62 03
63 03
64 04
65 04
66 05
67 05
68 06
69 06
70 07
71 07
72 08
73 08
74 09
75 09
76 10
77 10
78 11
79 11
80 12
81 12
82 13
83 13
84 14
85 14
86 15
87 15
88 16
89 16
90 17
91 17
92 18
93 18
94 19
95 19
96 20
97 20
98 21
99 21
100 22
101 22
102 23
103 23
104 00
105 00
106 01
107 01
108 02
109 02
110 03
111 03
112 04
113 04
114 05
115 05
116 06
117 06
118 07
119 07
120 08
121 08
122 09
123 09
124 10
125 10
126 11
127 11
128 12
129 12
130 13
131 13
132 14
133 14
134 15
135 15
136 16
137 16
138 17
139 17
140 18
141 18
142 19
143 19
144 20
145 20
146 21
147 21
148 22
149 22
150 23
151 23
152 00
153 00
154 01
155 01
156 02
157 02
158 

In [36]:
hourly_avgs = []

# next step is to iterate over array to calculate hourly metrics
for hour in hourly_dict:
    
    hourly_shows_avg = np.mean(hourly_dict[hour]["perf_counts"])
    hourly_views_avg = sum(hourly_dict[hour]["view_sums"]) / sum(hourly_dict[hour]["perf_counts"])
    hourly_length_avg = sum(hourly_dict[hour]["show_length_sums"]) / sum(hourly_dict[hour]["perf_counts"])
    hourly_female_avg = np.mean(hourly_dict[hour]["female_counts"])
    hourly_female_percent = sum(hourly_dict[hour]["female_counts"]) / sum(hourly_dict[hour]["perf_counts"]) * 100
    hourly_male_avg = np.mean(hourly_dict[hour]["male_counts"])
    hourly_male_percent = sum(hourly_dict[hour]["male_counts"]) / sum(hourly_dict[hour]["perf_counts"]) * 100
    hourly_trans_avg = np.mean(hourly_dict[hour]["trans_counts"])
    hourly_trans_percent = sum(hourly_dict[hour]["trans_counts"]) / sum(hourly_dict[hour]["perf_counts"]) * 100
    hourly_couple_avg = np.mean(hourly_dict[hour]["couple_counts"])
    hourly_couple_percent = sum(hourly_dict[hour]["couple_counts"]) / sum(hourly_dict[hour]["perf_counts"]) * 100
    
    hourly_avgs.append({'shows_average':round(hourly_shows_avg, 2), 'viewers_average':round(hourly_views_avg, 2), 'show_length_average':round(hourly_length_avg, 2),
                        'average_females':round(hourly_female_avg,2), 'percentage_females':round(hourly_female_percent,2), 'average_males':round(hourly_male_avg,2), 'percentage_males':round(hourly_male_percent,2),
                        'average_trans':round(hourly_trans_avg,2), 'percentage_trans':round(hourly_trans_percent,2), 'average_couples':round(hourly_couple_avg), 'percentage_couples':round(hourly_couple_percent,2)})


df = pd.DataFrame.from_dict(hourly_avgs)
df["daily_hour"] = df.index
df.to_csv('hourly_averages.csv',index=False)
df


Unnamed: 0,shows_average,viewers_average,show_length_average,average_females,percentage_females,average_males,percentage_males,average_trans,percentage_trans,average_couples,percentage_couples,daily_hour
0,5747.87,63.37,150.48,3902.7,67.9,1150.61,20.02,400.26,6.96,294,5.12,0
1,6378.5,56.61,124.37,4376.2,68.61,1228.77,19.26,448.5,7.03,325,5.1,1
2,7017.5,53.63,125.53,4829.4,68.82,1340.37,19.1,496.35,7.07,351,5.01,2
3,7110.18,56.72,138.16,4856.65,68.31,1380.22,19.41,514.3,7.23,359,5.05,3
4,6929.14,59.47,154.32,4704.27,67.89,1360.18,19.63,514.54,7.43,350,5.05,4
5,6531.15,59.98,169.59,4415.11,67.6,1289.15,19.74,507.26,7.77,320,4.89,5
6,6006.59,59.1,172.2,4016.26,66.86,1212.03,20.18,487.5,8.12,291,4.84,6
7,5417.63,56.56,157.37,3550.75,65.54,1129.09,20.84,464.36,8.57,273,5.05,7
8,5500.57,51.95,139.67,3600.53,65.46,1146.13,20.84,464.82,8.45,289,5.26,8
9,5903.19,50.2,134.9,3874.01,65.63,1236.81,20.95,482.77,8.18,310,5.24,9
