In [1]:
# Imports
import pandas as pd
import numpy as np

### Aggregate World Data by Country, Quarter and Category

In [120]:
# Read the 30 pre-processed world files and aggregate them to one row per
# (country, quarter, category), using test-count-weighted averages.
req_cols = ['avg_d_kbps', 'avg_u_kbps', 'avg_lat_ms', 'tests', 'devices', 'quarter', 'category', 'iso3', 'name']

group_keys = ['country', 'quarter', 'category']
metric_cols = ['avg_d_kbps', 'avg_u_kbps', 'avg_lat_ms']

# Per-file partial sums are collected here and concatenated once after the
# loop; growing a DataFrame with pd.concat on every iteration re-copies all
# previously accumulated rows each time.
partial_sums = []

for i in range(0,30):
    df = pd.read_csv(f'./data/preprocessed_files/whole_world/whole_world_{i}.csv', sep=';', usecols=req_cols)

    # rename country col
    df = df.rename(columns={'name': 'country'})

    # Weight each metric by the number of tests behind it, so that summing
    # gives the numerator of the weighted average.
    for col in metric_cols:
        df[col] = df[col] * df['tests']

    # Keep only the sums per group for now; the division happens after all
    # files have been combined.
    df_agg = df.groupby(group_keys)[['tests'] + metric_cols].sum()
    partial_sums.append(df_agg)

# Combine the partial sums of all files. A group that occurs in more than one
# file is merged into a single row here; sort=False keeps groups in order of
# first appearance (file by file).
df_world = pd.concat(partial_sums).groupby(level=group_keys, sort=False).sum()

# weighted average = sum(value * tests) / sum(tests)
for col in metric_cols:
    df_world[col] = df_world[col] / df_world['tests']

# add cols for mbps
df_world['avg_d_mbps'] = df_world['avg_d_kbps'] / 1000
df_world['avg_u_mbps'] = df_world['avg_u_kbps'] / 1000


In [138]:
# Preview the first rows of the aggregated frame (indexed by country, quarter, category).
df_world.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tests,avg_d_kbps,avg_u_kbps,avg_lat_ms,avg_d_mbps,avg_u_mbps
country,quarter,category,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Afghanistan,2019-01-01,fixed,13839,4403.485656,4433.100441,80.137004,4.403486,4.4331
Albania,2019-01-01,fixed,73138,15327.166972,7317.424827,27.843324,15.327167,7.317425
Algeria,2019-01-01,fixed,42838,4280.955157,2715.10458,73.243265,4.280955,2.715105
American Samoa,2019-01-01,fixed,618,18732.224919,7906.815534,126.351133,18.732225,7.906816
Andorra,2019-01-01,fixed,4574,73915.473765,68640.664189,10.838872,73.915474,68.640664


In [118]:
# save to csv
# The index is written too (pandas default), so the country/quarter/category
# index levels end up as the leading columns of the file.
df_world.to_csv('./data/final_data/world_aggregated.csv', sep=',')

### Data Exploration

In [32]:
# Germany: load the semicolon-separated performance file into its own frame.
df_germany = pd.read_csv('./data/aws_data/performance/germany_final.csv', sep=';')

In [33]:
# Cleaning and adjustments
# Remove the column whose header is a literal ','.
# errors='ignore' makes the cell safe to re-run: without it a second run
# raises KeyError because the column has already been dropped in place.
df_germany.drop(columns=',', inplace=True, errors='ignore')

In [35]:
# rename country column ('name' -> 'country'), the same naming used for the world data
df_germany.rename(columns={'name':'country'}, inplace=True)

Unnamed: 0,avg_d_kbps,avg_u_kbps,avg_lat_ms,tests,devices,quarter,category,long,lat,geometry,iso3,country,continent,region,iso_3166_1_
0,7864,1383,43,4,2,2019-01-01,fixed,8.415527,55.024873,POINT (8.41552734375 55.0248734409448),DEU,Germany,Europe,Western Europe,DE
1,32663,8941,34,1,1,2019-01-01,fixed,8.421021,55.024873,POINT (8.4210205078125 55.0248734409448),DEU,Germany,Europe,Western Europe,DE
2,23600,10557,29,1,1,2019-01-01,fixed,8.426514,55.024873,POINT (8.426513671875 55.0248734409448),DEU,Germany,Europe,Western Europe,DE
3,34739,7250,40,2,1,2019-01-01,fixed,8.432007,55.024873,POINT (8.4320068359375 55.0248734409448),DEU,Germany,Europe,Western Europe,DE
4,7083,1644,33,4,3,2019-01-01,fixed,8.421021,55.021725,POINT (8.4210205078125 55.0217245215306),DEU,Germany,Europe,Western Europe,DE


In [36]:
# Convert to mbps: add download/upload columns as the kbps values divided by 1000
df_germany['avg_d_mbps'] = df_germany['avg_d_kbps'] / 1000
df_germany['avg_u_mbps'] = df_germany['avg_u_kbps'] / 1000

Unnamed: 0,avg_d_kbps,avg_u_kbps,avg_lat_ms,tests,devices,quarter,category,long,lat,geometry,iso3,country,continent,region,iso_3166_1_,avg_d_mbps,avg_u_mbps
0,7864,1383,43,4,2,2019-01-01,fixed,8.415527,55.024873,POINT (8.41552734375 55.0248734409448),DEU,Germany,Europe,Western Europe,DE,7.864,1.383
1,32663,8941,34,1,1,2019-01-01,fixed,8.421021,55.024873,POINT (8.4210205078125 55.0248734409448),DEU,Germany,Europe,Western Europe,DE,32.663,8.941
2,23600,10557,29,1,1,2019-01-01,fixed,8.426514,55.024873,POINT (8.426513671875 55.0248734409448),DEU,Germany,Europe,Western Europe,DE,23.6,10.557
3,34739,7250,40,2,1,2019-01-01,fixed,8.432007,55.024873,POINT (8.4320068359375 55.0248734409448),DEU,Germany,Europe,Western Europe,DE,34.739,7.25
4,7083,1644,33,4,3,2019-01-01,fixed,8.421021,55.021725,POINT (8.4210205078125 55.0217245215306),DEU,Germany,Europe,Western Europe,DE,7.083,1.644


In [119]:
# save as comma separated csv
# NOTE(review): `i` is not used by the to_csv call below; this assignment looks
# like a debugging leftover — confirm no later cell reads it, then remove.
i = 'test'
# The row index is written as well (pandas default for to_csv).
df_germany.to_csv('./data/final_data/germany_final_comma.csv', sep=',')