In [1]:
import os
import matplotlib
import numpy as np
import pandas as pd
import csv
import datetime
from collections import Counter

# Let wide / long frames and long cell contents render without truncation.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None
# Route DataFrame.plot()/.hist() through plotly instead of matplotlib.
pd.set_option('plotting.backend', 'plotly')

In [2]:
# Folder (relative to the notebook) that holds the server and carbon log files read below.
directory = "API-logs/"

### Server logs

In [9]:
# Load the raw server log into memory, one list entry per line.
# Undecodable bytes are substituted instead of raising (errors='replace').
server_log_path = directory + 'server-19days.log'
with open(server_log_path, mode='r', errors='replace') as f:
    lines = list(f)
len(lines)

2722543

In [10]:
# Empty frame that will receive one row per logged request:
# 'date' holds the log timestamp, 'requests' holds the HTTP status code.
requestdf = pd.DataFrame(columns = ['date', 'requests'])

In [13]:
import re

# Extract one (timestamp, status code) record per "POST /generate" log line.
# Records are accumulated in a list and turned into a DataFrame once at the
# end: assigning rows one at a time with `.loc` copies the frame on every
# insert and becomes extremely slow for hundreds of thousands of rows.
GENERATE_REQUEST = "POST /generate HTTP/1.1"

records = []
for line in lines:
    if GENERATE_REQUEST in line:
        # Timestamp = text between the first '[' and the next ']'.
        time = line.split('[')[1].split(']')[0]
        # Status code = digits found in whatever follows the request string.
        code = line.split(GENERATE_REQUEST)[1]
        code = re.sub("[^0-9]", "", code)
        # NOTE(review): some lines apparently yield '0400'; it is normalised to
        # '400' here as in the original code -- confirm where the extra 0 comes from.
        code = code.replace('0400', '400')
        records.append((time, code))

requestdf = pd.DataFrame(records, columns=['date', 'requests'])
# Kept for backward compatibility: number of rows written.
count = len(records)

In [14]:
# Convert the timestamp strings (day/abbreviated month/year hour:minute:second)
# into pandas datetimes so the frame can be grouped by time interval.
requestdf['date'] = pd.to_datetime(requestdf['date'] , format= "%d/%b/%Y %H:%M:%S")

In [None]:
2022-09-22 	15:12:12

In [95]:
# Preview the first parsed requests (timestamp + status code).
requestdf.head()

Unnamed: 0,date,requests
0,2022-09-22 15:15:50,200
1,2022-09-22 15:15:51,200
2,2022-09-22 15:15:55,200
3,2022-09-22 15:16:03,200
4,2022-09-22 15:16:08,200


### Getting total number of days (+dates)

In [15]:
# Count requests in consecutive 1440-minute (24 h) bins; one row per bin.
daily_bins = pd.Grouper(key='date', freq='1440min')
requestdaydf = requestdf.groupby(daily_bins).count()
print(f"there are {len(requestdaydf)} days of logs")

there are 18 days of logs


In [55]:
# Show the request count of each 24-hour bin (up to 18 rows).
requestdaydf.head(18)

Unnamed: 0_level_0,requests
date,Unnamed: 1_level_1
2022-09-22,9979
2022-09-23,13908
2022-09-24,9491
2022-09-25,12997
2022-09-26,16420
2022-09-27,17521
2022-09-28,14446
2022-09-29,12187
2022-09-30,8524
2022-10-01,9845


In [17]:
# Status codes were extracted as strings; convert them to numbers.
requestdf['requests'] = pd.to_numeric(requestdf['requests'])

In [18]:
# Number of requests per HTTP status code.
requestdf.requests.value_counts()

200    224889
400      5523
503       343
500        13
Name: requests, dtype: int64

200 = OK, the request was answered

400 = bad request sent by the user

503 = refused because there was no more room in the queue

500 = internal server error

### Average number of requests per hour

In [96]:
# Count requests per 60-minute bin, then summarise the distribution of those counts.
hourly_bins = pd.Grouper(key='date', freq='60min')
requesthourdf = requestdf.groupby(hourly_bins).count()
requesthourdf['requests'].describe()

count     413.000000
mean      558.760291
std       350.727293
min        93.000000
25%       334.000000
50%       488.000000
75%       676.000000
max      3630.000000
Name: requests, dtype: float64

### Number of requests per 10 minutes

In [322]:
# Count requests per 10-minute bin.
# NOTE: despite its name, `requesthourdf` holds 10-minute bins from here on.
requesthourdf = requestdf.groupby(pd.Grouper(key = 'date', freq='10min')).count()

# Keep only the analysis window. The window is selected by timestamp instead
# of by position (previously `.iloc[7:-2]`) so that the selection stays correct
# if the log gains or loses lines at either end. On the current log both forms
# select the same rows: first bin 16:20 on 2022-09-22, last bin 18:40 on 2022-10-09.
request_window_start = pd.Timestamp('2022-09-22 16:20:00')
request_window_end = pd.Timestamp('2022-10-09 18:40:00')
requesthourdf = requesthourdf.loc[request_window_start:request_window_end]
requesthourdf.tail()

Unnamed: 0_level_0,requests
date,Unnamed: 1_level_1
2022-10-09 18:00:00,163
2022-10-09 18:10:00,46
2022-10-09 18:20:00,133
2022-10-09 18:30:00,174
2022-10-09 18:40:00,160


In [336]:
# First 10-minute bins kept after trimming.
requesthourdf.head()

Unnamed: 0_level_0,requests
date,Unnamed: 1_level_1
2022-09-22 16:20:00,237
2022-09-22 16:30:00,243
2022-09-22 16:40:00,213
2022-09-22 16:50:00,227
2022-09-22 17:00:00,208


In [324]:
# (number of 10-minute bins kept, number of columns)
requesthourdf.shape

(2463, 1)

### Code Carbon logs

In [56]:
# Empty frame for the parsed Code Carbon log: one row per measurement,
# with the energy / power figures extracted from the log text.
carbondf = pd.DataFrame(columns = ['date', 'timestamp', 'RAM energy', 'GPU energy', 'GPU power', 'CPU energy', 'Energy consumed'])

In [None]:
## 72 hours

In [85]:
# Load the Code Carbon log and remember the position of every line that
# contains 'Sep'; those positions are used below to cut the log into days.
# NOTE: this rebinds `lines`, which previously held the server log.
with open(directory+'carbon.log' , 'r') as f:
    lines = f.readlines() # readlines creates a list of the lines

# enumerate() yields each line's own position. The previous
# `lines.index(line)` rescanned the list for every match and always returned
# the FIRST line with that text, which is wrong if a line is repeated.
ins = [position for position, line in enumerate(lines) if 'Sep' in line]
len(lines)

694544

In [5]:
# Cut the carbon log into one chunk per day, using the positions stored in
# `ins` (lines containing 'Sep') as chunk boundaries; the last chunk runs to
# the end of the file.
sep22 = lines[ins[0]:ins[1]]
sep23 = lines[ins[1]:ins[2]]
sep24 = lines[ins[2]:ins[3]]
sep25 = lines[ins[3]:ins[4]]
sep26 = lines[ins[4]:]

alldates = [sep22, sep23, sep24, sep25, sep26]
for day_lines in alldates:
    print(len(day_lines))

13408
36577
36573
36577
20003


In [89]:
# Parse the carbon log in groups of 4 lines (starting at line 1) into one
# record per group, without date information.
#
# Fixes over the previous version:
#   * each record is mapped onto the named columns, so the 'date' column is
#     simply left empty (the old 6-value row did not fit the 7-column frame);
#   * records are collected in a list and the frame is built once, instead of
#     growing `carbondf` row by row (very slow) at an index taken from a
#     `count` variable left over from an earlier cell;
#   * only the expected IndexError (line without an '@ <time>]' stamp) is
#     skipped, rather than swallowing every exception.
# As before, a field missing from a group keeps the value of the previous group.
CARBON_COLUMNS = ['date', 'timestamp', 'RAM energy', 'GPU energy', 'GPU power', 'CPU energy', 'Energy consumed']

records = []
time = ram_nrj = gpu_nrj = gpu_power = cpu_nrj = total_nrj = None
for i in range(1, len(lines), 4):
    for line in lines[i:i + 4]:
        try:
            # Time stamp = text between '@' and the following ']'.
            time = line.split('@')[1].split(']')[0].strip()
        except IndexError:
            # No '@' in this line: not a measurement line.
            continue
        if 'RAM :' in line:
            ram_nrj = line.split('Energy consumed for RAM :')[1].split('RAM')[0].replace('kWh.','')
        elif 'GPUs' in line:
            gpu_nrj = line.split('Energy consumed for all GPUs :')[1].split('All GPUs')[0].replace('kWh.','')
            gpu_power = line.split('All GPUs Power : ')[1].replace('W','').strip()
        elif 'CPUs' in line:
            cpu_nrj = line.split('Energy consumed for all CPUs :')[1].split('All CPUs')[0].replace('kWh.','')
        elif ' electricity used since the begining.' in line:
            total_nrj = line.split(' electricity used since the begining.')[0].split(']')[1].replace(' of','').replace('kWh','')
    records.append({
        'timestamp': time,
        'RAM energy': ram_nrj,
        'GPU energy': gpu_nrj,
        'GPU power': gpu_power,
        'CPU energy': cpu_nrj,
        'Energy consumed': total_nrj,
    })

carbondf = pd.DataFrame(records, columns=CARBON_COLUMNS)

In [279]:
# Load the carbon measurements from CSV instead of re-parsing the raw log.
# NOTE(review): presumably this file is a saved export of the parsed
# carbon log for the full 19 days -- confirm how it was produced.
carbondf = pd.read_csv(directory+'19days_carbon.csv')
#carbondf= carbondf.drop('Unnamed: 0', axis=1)

In [280]:
# (number of measurements, number of columns) as loaded from the CSV.
carbondf.shape

(173636, 7)

In [281]:
# Preview the first carbon measurements.
carbondf.head()

Unnamed: 0,date,timestamp,RAM energy,GPU energy,GPU power,CPU energy,Energy consumed
0,2022-09-22,15:12:12,0.002708,0.006703,2412.392,0.000125,0.008222
1,2022-09-22,15:12:22,0.004022,0.013103,2446.52,0.000243,0.016054
2,2022-09-22,15:12:32,0.005333,0.02132,3138.044,0.000361,0.025703
3,2022-09-22,15:12:42,0.006658,0.024947,1389.091,0.000478,0.030758
4,2022-09-22,15:12:52,0.007968,0.028631,1395.522,0.000597,0.035886


In [76]:
## with date : for shorter logs (too much time for 19 days)

In [None]:
# Parse the per-day chunks in `alldates` into one record per 4-line group.
# The first line of each chunk is used as the date of all its records.
#
# Records are collected in a list and the frame is built once at the end:
# assigning rows one by one with `.loc` is what made this cell too slow for
# long logs. Only the expected IndexError (line without an '@ <time>]' stamp)
# is skipped, rather than swallowing every exception.
# As before, a field missing from a group keeps the value of the previous group.
CARBON_COLUMNS = ['date', 'timestamp', 'RAM energy', 'GPU energy', 'GPU power', 'CPU energy', 'Energy consumed']

records = []
time = ram_nrj = gpu_nrj = gpu_power = cpu_nrj = total_nrj = None
for day in alldates:
    date = day[0].replace('\n','')
    print(date)
    print(len(day))
    for i in range(1, len(day), 4):
        for line in day[i:i + 4]:
            try:
                # Time stamp = text between '@' and the following ']'.
                time = line.split('@')[1].split(']')[0].strip()
            except IndexError:
                # No '@' in this line: not a measurement line.
                continue
            if 'RAM :' in line:
                ram_nrj = line.split('Energy consumed for RAM :')[1].split('RAM')[0].replace('kWh.','')
            elif 'GPUs' in line:
                gpu_nrj = line.split('Energy consumed for all GPUs :')[1].split('All GPUs')[0].replace('kWh.','')
                gpu_power = line.split('All GPUs Power : ')[1].replace('W','').strip()
            elif 'CPUs' in line:
                cpu_nrj = line.split('Energy consumed for all CPUs :')[1].split('All CPUs')[0].replace('kWh.','')
            elif ' electricity used since the begining.' in line:
                total_nrj = line.split(' electricity used since the begining.')[0].split(']')[1].replace(' of','').replace('kWh','')
        records.append([date, time, ram_nrj, gpu_nrj, gpu_power, cpu_nrj, total_nrj])

carbondf = pd.DataFrame(records, columns=CARBON_COLUMNS)
# Kept for backward compatibility: number of rows written.
count = len(records)

In [282]:
# Make sure the two columns used in the analysis below are numeric.
for numeric_column in ("Energy consumed", "GPU power"):
    carbondf[numeric_column] = pd.to_numeric(carbondf[numeric_column])

In [283]:
# Shape check after the numeric conversion.
carbondf.shape

(173636, 7)

In [337]:
# First measurements whose GPU power reading exceeds 5000.
carbondf[carbondf["GPU power"] > 5000].head()

Unnamed: 0,date,timestamp,RAM energy,GPU energy,GPU power,CPU energy,Energy consumed,time
137,2022-09-22,15:33:52,0.174261,0.769212,5066.0,0.015473,0.957637,2022-09-22 15:33:52
160,2022-09-22,15:37:22,0.20187,0.92797,5658.65,0.017952,1.131657,2022-09-22 15:37:22
177,2022-09-22,15:40:12,0.224245,1.050833,5686.439,0.019958,1.278797,2022-09-22 15:40:12
190,2022-09-22,15:42:02,0.238719,1.130143,5014.558,0.021376,1.375667,2022-09-22 15:42:02
239,2022-09-22,15:49:52,0.300566,1.453894,5607.181,0.026924,1.781384,2022-09-22 15:49:52


In [32]:
#carbondf.date.value_counts()

In [11]:
#carbondf = carbondf.sort_values('Energy consumed')
#carbondf = carbondf.reset_index(drop=True)

In [338]:
# Quick look at the GPU power of the first 10000 measurements.
carbondf["GPU power"].head(10000).plot()

In [339]:
# Join the 'date' and 'timestamp' columns into a single "<date> <time>" string.
carbondf['time'] = carbondf['date'] + ' ' + carbondf['timestamp'].astype(str)

In [340]:
# Parse the combined string into a datetime so measurements can be grouped by time.
carbondf['time'] = pd.to_datetime(carbondf['time'] , format= "%Y-%m-%d %H:%M:%S")

In [341]:
# Check the new 'time' column.
carbondf.head()

Unnamed: 0,date,timestamp,RAM energy,GPU energy,GPU power,CPU energy,Energy consumed,time
0,2022-09-22,15:12:12,0.002708,0.006703,2412.392,0.000125,0.008222,2022-09-22 15:12:12
1,2022-09-22,15:12:22,0.004022,0.013103,2446.52,0.000243,0.016054,2022-09-22 15:12:22
2,2022-09-22,15:12:32,0.005333,0.02132,3138.044,0.000361,0.025703,2022-09-22 15:12:32
3,2022-09-22,15:12:42,0.006658,0.024947,1389.091,0.000478,0.030758,2022-09-22 15:12:42
4,2022-09-22,15:12:52,0.007968,0.028631,1395.522,0.000597,0.035886,2022-09-22 15:12:52


In [71]:
#carbondf['date']= carbondf.index

## Energy consumption by 10 min

In [342]:
# For every 10-minute bin keep the LAST measurement, i.e. the cumulative
# energy reading at the end of the bin; the bin start is also stored in 'time'.
# NOTE: despite its name, `carbonhourdf` holds 10-minute bins.
ten_minute_bins = pd.Grouper(key='time', freq='10min')
carbonhourdf = carbondf.groupby(ten_minute_bins).last()
carbonhourdf['time'] = carbonhourdf.index
carbonhourdf.head()

Unnamed: 0_level_0,date,timestamp,RAM energy,GPU energy,GPU power,CPU energy,Energy consumed,time
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-09-22 15:10:00,2022-09-22,15:19:52,0.063238,0.271815,2527.826,0.005555,0.332647,2022-09-22 15:10:00
2022-09-22 15:20:00,2022-09-22,15:29:52,0.142635,0.615689,2395.379,0.012757,0.771081,2022-09-22 15:20:00
2022-09-22 15:30:00,2022-09-22,15:39:52,0.221611,1.029444,2541.23,0.019722,1.26278,2022-09-22 15:30:00
2022-09-22 15:40:00,2022-09-22,15:49:52,0.300566,1.453894,5607.181,0.026924,1.781384,2022-09-22 15:40:00
2022-09-22 15:50:00,2022-09-22,15:59:52,0.379544,1.842744,2494.936,0.03389,2.248372,2022-09-22 15:50:00


In [343]:
# 'Energy consumed' is the running total since the start of the log, so the
# difference between consecutive bins is the energy used within each bin.
carbonhourdf['Energy by interval'] = carbonhourdf["Energy consumed"].diff()

In [344]:
#2022-09-22 16:20:00 	237
#2022-10-09 18:40:00 	

In [345]:
# Drop the bins before 2022-09-22 16:20, the first bin kept in the request data.
# A label-based slice is used instead of `.iloc[7:]` so that re-running this
# cell does not trim another 7 rows each time.
carbonhourdf = carbonhourdf.loc[pd.Timestamp('2022-09-22 16:20:00'):]

In [346]:
# Shape after trimming the start of the carbon series.
carbonhourdf.shape

(2728, 9)

In [347]:
# The series should now start at the same bin as the request counts.
carbonhourdf.head()

Unnamed: 0_level_0,date,timestamp,RAM energy,GPU energy,GPU power,CPU energy,Energy consumed,time,Energy by interval
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-09-22 16:20:00,2022-09-22,16:29:52,0.616152,2.986218,1763.207,0.055257,3.657627,2022-09-22 16:20:00,0.46672
2022-09-22 16:30:00,2022-09-22,16:39:52,0.695054,3.373999,2494.777,0.062223,4.123394,2022-09-22 16:30:00,0.465767
2022-09-22 16:40:00,2022-09-22,16:49:53,0.773883,3.757831,2599.632,0.069425,4.60114,2022-09-22 16:40:00,0.477746
2022-09-22 16:50:00,2022-09-22,16:59:52,0.852737,4.140414,2541.436,0.07639,5.061579,2022-09-22 16:50:00,0.460439
2022-09-22 17:00:00,2022-09-22,17:09:53,0.931576,4.517707,2447.751,0.083592,5.532874,2022-09-22 17:00:00,0.471295


In [348]:
# Drop the bins after 2022-10-10 18:40.
# A label-based slice is used instead of `.iloc[:-121]` so that re-running
# this cell does not trim another 121 rows each time.
# NOTE(review): the request counts end at 2022-10-09 18:40, one day earlier,
# so the last 144 bins kept here have no matching request count -- confirm
# that 2022-10-10 is the intended end of the window.
carbonhourdf = carbonhourdf.loc[:pd.Timestamp('2022-10-10 18:40:00')]

In [349]:
# Check the last bins kept after trimming the end.
carbonhourdf.tail()

Unnamed: 0_level_0,date,timestamp,RAM energy,GPU energy,GPU power,CPU energy,Energy consumed,time,Energy by interval
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-10-10 18:00:00,2022-10-10,18:09:53,206.865759,687.488517,1302.686,18.485593,912.839868,2022-10-10 18:00:00,0.334254
2022-10-10 18:10:00,2022-10-10,18:19:53,206.945001,687.702846,1317.169,18.49256,913.135654,2022-10-10 18:10:00,0.295786
2022-10-10 18:20:00,2022-10-10,18:29:53,207.024209,687.926633,1309.552,18.49976,913.450603,2022-10-10 18:20:00,0.314949
2022-10-10 18:30:00,2022-10-10,18:39:53,207.10336,688.162174,1302.061,18.506726,913.767511,2022-10-10 18:30:00,0.316908
2022-10-10 18:40:00,2022-10-10,18:49:53,207.182557,688.384563,1299.409,18.513927,914.081048,2022-10-10 18:40:00,0.313537


In [350]:
# Final number of 10-minute bins in the carbon series.
carbonhourdf.shape

(2607, 9)

### Merging the carbon and request DFs

In [351]:
# Put the carbon and request series side by side, aligned on their
# 10-minute bin index. Bins present in only one of the two frames get NaN
# in the other frame's columns.
alllogdf = pd.concat([carbonhourdf,requesthourdf], axis=1)
# Overwrite 'date' with the bin timestamp so it can be used as a plot axis.
alllogdf['date']= alllogdf.index
alllogdf.shape

(2607, 10)

Total time for logs:

In [352]:
# Time elapsed between the first and the last bin of the merged frame.
alllogdf_diff = alllogdf.iloc[[0, -1]]
first_bin, last_bin = alllogdf_diff['date']
last_bin - first_bin

Timedelta('18 days 02:20:00')

In [353]:
# Inspect the end of the merged frame (note the 'requests' column).
alllogdf.tail()

Unnamed: 0,date,timestamp,RAM energy,GPU energy,GPU power,CPU energy,Energy consumed,time,Energy by interval,requests
2022-10-10 18:00:00,2022-10-10 18:00:00,18:09:53,206.865759,687.488517,1302.686,18.485593,912.839868,2022-10-10 18:00:00,0.334254,
2022-10-10 18:10:00,2022-10-10 18:10:00,18:19:53,206.945001,687.702846,1317.169,18.49256,913.135654,2022-10-10 18:10:00,0.295786,
2022-10-10 18:20:00,2022-10-10 18:20:00,18:29:53,207.024209,687.926633,1309.552,18.49976,913.450603,2022-10-10 18:20:00,0.314949,
2022-10-10 18:30:00,2022-10-10 18:30:00,18:39:53,207.10336,688.162174,1302.061,18.506726,913.767511,2022-10-10 18:30:00,0.316908,
2022-10-10 18:40:00,2022-10-10 18:40:00,18:49:53,207.182557,688.384563,1299.409,18.513927,914.081048,2022-10-10 18:40:00,0.313537,


In [354]:
# Ensure the merged request counts are numeric.
alllogdf['requests'] = pd.to_numeric(alllogdf['requests'])

### Tons of carbon

In total

In [391]:
# Emissions for the whole period, from the last cumulative energy reading.
# NOTE(review): 394.0 is presumably the grid carbon intensity in gCO2eq/kWh -- confirm.
# Under that assumption, dividing by 1e6 gives tonnes and the final *1000
# converts to kg, so the displayed figure would be in kg rather than tonnes.
((alllogdf.iloc[-1]['Energy consumed'] * 394.0) /1e6)*1000

360.147932912

Per day

In [390]:
# Same computation as the total, divided by 18.
# NOTE(review): 18 is presumably the number of days covered (the merged frame
# spans 18 days 02:20) -- confirm; same unit caveat as for the total.
((alllogdf.iloc[-1]['Energy consumed'] * 394.0) /1e6)/18*1000

20.00821849511111

### Plotting raw values

In [356]:
# Distribution of the GPU power reading taken at the end of each 10-minute bin.
alllogdf["GPU power"].describe()

count    2607.000000
mean     1663.759421
std       543.560086
min      1294.404000
25%      1310.151500
50%      1326.509000
75%      2387.707000
max      4589.912000
Name: GPU power, dtype: float64

In [357]:
# GPU power over time, using the last reading of every 10-minute bin.
fig1 = alllogdf.plot(kind="line", x="date", y=["GPU power"])
fig1_axis_titles = {"xaxis_title": "Date", "yaxis_title": "GPU Power Used (W)"}
fig1.update_layout(**fig1_axis_titles)

In [358]:
# GPU power over time, using the maximum of every 10-minute bin.
max_bins = pd.Grouper(key='time', freq='10min')
max_grouping = carbondf.groupby(max_bins).max()
max_grouping['date'] = max_grouping.index
fig2 = max_grouping.plot(kind="line", x="date", y=["GPU power"])
fig2_axis_titles = {"xaxis_title": "Date", "yaxis_title": "GPU Power Used (W)"}
fig2.update_layout(**fig2_axis_titles)

# Power usage per GPU

In [368]:
# `mean_grouping` is only created in a later cell, so on a fresh kernel
# (Restart & Run All) this cell used to fail with a NameError. Build the
# per-10-minute mean here so the cell runs top to bottom.
mean_grouping = carbondf.groupby(pd.Grouper(key = 'time', freq='10min')).mean(numeric_only=True)
mean_grouping['GPU power'].describe()

count    2735.000000
mean     1664.541940
std       287.324016
min      1252.252438
25%      1468.927675
50%      1589.101406
75%      1757.977633
max      2690.133016
Name: GPU power, dtype: float64

### Final power consumption

In [375]:
# Duration covered by the merged frame: last bin minus first bin.
bin_starts = alllogdf['date']
bin_starts.iloc[-1] - bin_starts.iloc[0]

Timedelta('18 days 02:20:00')

This is approximately 434 hours (18 × 24 h + 2 h 20 min ≈ 434.3 h).

In [371]:
# Last rows of the merged frame; the final 'Energy consumed' value is the total used above.
alllogdf.tail()

Unnamed: 0,date,timestamp,RAM energy,GPU energy,GPU power,CPU energy,Energy consumed,time,Energy by interval,requests
2022-10-10 18:00:00,2022-10-10 18:00:00,18:09:53,206.865759,687.488517,1302.686,18.485593,912.839868,2022-10-10 18:00:00,0.334254,
2022-10-10 18:10:00,2022-10-10 18:10:00,18:19:53,206.945001,687.702846,1317.169,18.49256,913.135654,2022-10-10 18:10:00,0.295786,
2022-10-10 18:20:00,2022-10-10 18:20:00,18:29:53,207.024209,687.926633,1309.552,18.49976,913.450603,2022-10-10 18:20:00,0.314949,
2022-10-10 18:30:00,2022-10-10 18:30:00,18:39:53,207.10336,688.162174,1302.061,18.506726,913.767511,2022-10-10 18:30:00,0.316908,
2022-10-10 18:40:00,2022-10-10 18:40:00,18:49:53,207.182557,688.384563,1299.409,18.513927,914.081048,2022-10-10 18:40:00,0.313537,


In [360]:
# GPU power over time, using the mean of every 10-minute bin.
# numeric_only=True restricts the mean to numeric columns: `carbondf` also
# holds the string columns 'date' and 'timestamp', which cannot be averaged.
# Relying on the default triggered a deprecation warning and becomes an error
# in newer pandas versions.
mean_grouping = carbondf.groupby(pd.Grouper(key = 'time', freq='10min')).mean(numeric_only=True)
mean_grouping['date']= mean_grouping.index
fig3= mean_grouping.plot(x="date", y=["GPU power"], kind="line")
fig3.update_layout(
    xaxis_title= "Date",
    yaxis_title="GPU Power Used (W)",
)
# Dashed red line: average of the per-bin means over the whole period.
fig3.add_hline(y=mean_grouping['GPU power'].mean(), line_width=3, line_dash="dash", line_color="red",name='avg power')


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [361]:
# Distribution of the per-10-minute mean GPU power.
mean_grouping["GPU power"].describe()

count    2735.000000
mean     1664.541940
std       287.324016
min      1252.252438
25%      1468.927675
50%      1589.101406
75%      1757.977633
max      2690.133016
Name: GPU power, dtype: float64

In [226]:
# GPU power over time, using the minimum of every 10-minute bin.
min_bins = pd.Grouper(key='time', freq='10min')
min_grouping = carbondf.groupby(min_bins).min()
min_grouping['date'] = min_grouping.index
fig4 = min_grouping.plot(kind="line", x="date", y=["GPU power"])
fig4_axis_titles = {"xaxis_title": "Date", "yaxis_title": "GPU Power Used (W)"}
fig4.update_layout(**fig4_axis_titles)

In [386]:
# Number of requests over time, drawn as a fixed-size histogram.
hist = alllogdf.hist(x='date', y='requests')
hist_layout = {
    "autosize": False,
    "width": 400,
    "height": 500,
    "xaxis_title": "Date",
    "yaxis_title": "Number of requests",
}
hist.update_layout(**hist_layout)

In [384]:
# One point per 10-minute bin: requests received vs. energy used in that bin.
fig5 = alllogdf.plot(x="requests", y="Energy by interval", kind="scatter")
fig5_layout = {
    "autosize": False,
    "width": 1000,
    "height": 500,
    "xaxis_title": "Number of requests received in a 10-minute interval",
    "yaxis_title": "Energy consumption (kWh)",
}
fig5.update_layout(**fig5_layout)

### Trying to fit quadratic curve

In [364]:
# Inputs of the curve fit: requests per 10-minute bin (x) and the energy
# used in the same bin (y). Both may contain NaN for bins missing from one
# of the two logs.
x= alllogdf["requests"]
y= alllogdf["Energy by interval"]

In [None]:
import numpy as np
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt 

# np.polyfit cannot work with missing values: a single NaN makes the fit
# fail or return NaN coefficients. Bins that lack a request count or an
# energy value are therefore left out of the fit and of the plot.
valid_points = np.isfinite(x) & np.isfinite(y)
x_valid = x[valid_points]
y_valid = y[valid_points]

# Least-squares fit of y = a*x**2 + b*x + c.
a, b, c = np.polyfit(x_valid, y_valid, 2)

fit_equation = lambda x: a * x ** 2 + b * x + c

def plot_fit(X, Y, f):
    """Scatter the data points and overlay the fitted curve.

    X, Y -- sequences of data point coordinates (without missing values).
    f    -- callable evaluated on 1000 evenly spaced values between
            min(X) and max(X) to draw the curve.
    """
    X_fit = np.linspace(min(X), max(X), 1000)
    Y_fit = f(X_fit)

    fig, ax1 = plt.subplots()
    ax1.plot(X_fit, Y_fit, color='r', alpha=0.5, label='Polynomial fit')
    ax1.scatter(X, Y, s=4, color='b', label='Data points')
    ax1.set_title('Polynomial fit to data')
    ax1.legend()
    plt.show()
    
plot_fit(x_valid, y_valid, fit_equation)


#### Calculating intercept:

In [45]:
# Value of the fitted polynomial at zero requests (its constant term c).
fit_equation(0)

0.28842679888794365