In [1]:
import pandas as pd

from datetime import datetime
from _lib.data_preparation import remove_substandard_trips, df_calc_basic, df_join_generic_with_gps, read_gpx, calc_context
from _lib.data_preparation import get_df_detail_final, get_df_generic_final
from _lib.helper import val2year, val2zip, val2utf8, get_filepaths
from _lib.settings import DATA_AFTER_PREPARATION_DIR

# FR Amiens

In [2]:
from _lib.settings import DATA_ORIGIN_AMIENS_DIR


# City code for Amiens: prefixed to every trip id and used as the output file stem below.
SHORT_NAME = 'ami'

### 2016

In [3]:
# Load the 2016 Amiens detail file.
df_ami16 = pd.read_csv(f'{DATA_ORIGIN_AMIENS_DIR}/detail_2016.csv', encoding='windows-1250')

print('Shape before: ', df_ami16.shape)

# Column names normalization: drop spaces, lower-case.
df_ami16.columns = [cname.replace(' ', '').lower() for cname in df_ami16.columns]

# Column data normalization.
# `.str.replace` removes spaces inside each id; `Series.replace(' ', '')` would only
# match cells that are exactly ' ' and leave embedded spaces in place.
df_ami16['tripid'] = SHORT_NAME + df_ami16['tripid'].astype(str).str.replace(' ', '', regex=False)

# Each value is passed through a local datetime and back, then rounded to whole seconds.
df_ami16['timestamp'] = round(df_ami16['timestamp'].apply(lambda x: datetime.fromtimestamp(float(x)).timestamp()))

df_ami16 = df_ami16.astype({'latitude': 'float', 'longitude': 'float'})

df_ami16 = remove_substandard_trips(df_ami16)
df_ami16 = df_calc_basic(df_ami16)

print('Shape after: ', df_ami16.shape)

  exec(code_obj, self.user_global_ns, self.user_ns)


Shape before:  (424910, 8)
Removed 0 substandard trips.


distance: 100%|██████████| 424910/424910 [00:01<00:00, 261893.42it/s]
duration: 100%|██████████| 424910/424910 [00:01<00:00, 350475.25it/s]
start: 100%|██████████| 302537/302537 [00:00<00:00, 404808.37it/s]
end: 100%|██████████| 302537/302537 [00:00<00:00, 406753.87it/s]
stop: 100%|██████████| 302537/302537 [00:00<00:00, 362978.80it/s]

Shape after:  (302537, 12)





In [4]:
# Load the 2016 Amiens generic (per-trip) file.
df_ami16_generic = pd.read_csv(f'{DATA_ORIGIN_AMIENS_DIR}/generic_2016.csv', encoding='windows-1250')

# Column names normalization: drop spaces, lower-case.
df_ami16_generic.columns = [cname.replace(' ', '').lower() for cname in df_ami16_generic.columns]

# Column data normalization. Converters run on non-null values only; the assignment
# re-aligns on the index, so rows with a missing source value end up as NaN.
df_ami16_generic['tripid'] = SHORT_NAME + df_ami16_generic['tripid'].apply(lambda x: x.replace(' ', ''))
df_ami16_generic['distance'] = df_ami16_generic['distance'].astype(float)
df_ami16_generic['valid'] = df_ami16_generic['ecc'].dropna().apply(lambda x: False if x == 0 else True)
df_ami16_generic['avgspeed'] = df_ami16_generic['avgspeed'].astype(float)
df_ami16_generic['tracktype'] = df_ami16_generic['tracktype'].dropna().apply(val2utf8)
df_ami16_generic['male'] = df_ami16_generic['sex'].dropna().apply(lambda x: True if str(x).lower() == 'm' else (False if str(x).lower() == 'f' else float('nan')))
df_ami16_generic['yearofbirth'] = df_ami16_generic['year'].apply(val2year)
df_ami16_generic['profession'] = df_ami16_generic['profession'].dropna().apply(val2utf8)
# 'no' / 'non' mean "not a frequent user"; every other answer is True
# (the else-branch used to return False as well, making the column constant).
df_ami16_generic['frequentuser'] = df_ami16_generic['frequentuser'].dropna().apply(lambda x: False if x.lower() in ['no', 'non'] else True)
df_ami16_generic['zip'] = df_ami16_generic['zip'].dropna().apply(val2zip)
df_ami16_generic['source'] = df_ami16_generic['source'].dropna().apply(val2utf8)
df_ami16_generic['typeofbike'] = df_ami16_generic['typeofbike'].dropna().apply(val2utf8)
# The source column is spelled 'tipeoftrip'; it is re-published under the corrected name.
df_ami16_generic['typeoftrip'] = df_ami16_generic['tipeoftrip'].dropna().apply(val2utf8)

# Drop the raw columns that were replaced above or are not carried forward.
df_ami16_generic = df_ami16_generic.drop(columns=['timestamp', 'startdt', 'ecc', 'sex', 'year', 'tipeoftrip', 'distance', 'avgspeed'])

# Joining generic data with gps data.
print('Shape before: ', df_ami16_generic.shape)

df_ami16_generic = df_join_generic_with_gps(df_ami16_generic, df_ami16)

print('Shape after: ', df_ami16_generic.shape)

Shape before:  (2107, 11)
Shape after:  (1749, 18)


### 2017

In [5]:
# Load the 2017 Amiens detail file (';'-separated, decimal commas).
df_ami17 = pd.read_csv(f'{DATA_ORIGIN_AMIENS_DIR}/detail_2017.csv', encoding='windows-1250', sep=';')

print('Shape before: ', df_ami17.shape)

# Column names normalization: drop spaces, lower-case.
df_ami17.columns = [cname.replace(' ', '').lower() for cname in df_ami17.columns]

# Column data normalization.
# `.str.replace` removes spaces inside each id; `Series.replace(' ', '')` would only
# match cells that are exactly ' ' and leave embedded spaces in place.
df_ami17['tripid'] = SHORT_NAME + df_ami17['tripid'].astype(str).str.replace(' ', '', regex=False)

# Each value is passed through a local datetime and back, then rounded to whole seconds.
df_ami17['timestamp'] = df_ami17['timestamp'].apply(lambda x: round(datetime.fromtimestamp(float(x)).timestamp()))
# Coordinates use ',' as the decimal mark; swap it for '.' before casting.
df_ami17['latitude'] = df_ami17['latitude'].str.replace(',', '.').astype(float)
df_ami17['longitude'] = df_ami17['longitude'].str.replace(',', '.').astype(float)
df_ami17['altitude'] = df_ami17['altitude'].astype(float)

df_ami17 = remove_substandard_trips(df_ami17)
df_ami17 = df_calc_basic(df_ami17)

print('Shape after: ', df_ami17.shape)

Shape before:  (904274, 8)
Removed 54 substandard trips.


distance: 100%|██████████| 860377/860377 [00:03<00:00, 272769.09it/s]
duration: 100%|██████████| 860377/860377 [00:02<00:00, 374970.19it/s]
start: 100%|██████████| 857008/857008 [00:01<00:00, 433511.45it/s]
end: 100%|██████████| 857008/857008 [00:02<00:00, 424642.40it/s]
stop: 100%|██████████| 857008/857008 [00:02<00:00, 360217.43it/s]

Shape after:  (857008, 12)





In [6]:
# Load the 2017 Amiens generic (per-trip) file (';'-separated, decimal commas).
df_ami17_generic = pd.read_csv(f'{DATA_ORIGIN_AMIENS_DIR}/generic_2017.csv', encoding='windows-1250', sep=';')

# Column names normalization: drop spaces, lower-case.
df_ami17_generic.columns = [cname.replace(' ', '').lower() for cname in df_ami17_generic.columns]

# Column data normalization.
# `.str.replace` removes spaces inside each id (matching how the detail ids are built);
# `Series.replace(' ', '')` would only match cells that are exactly ' '.
df_ami17_generic['tripid'] = SHORT_NAME + df_ami17_generic['tripid'].astype(str).str.replace(' ', '', regex=False)
df_ami17_generic['avgspeed'] = df_ami17_generic['avgspeed'].str.replace(',', '.').astype(float)
df_ami17_generic['distance'] = df_ami17_generic['totallength'].str.replace(',', '.').astype(float)
# Only an explicit 'no' marks a trip invalid; missing values count as valid.
df_ami17_generic['valid'] = df_ami17_generic['valid'].apply(lambda x: False if str(x).lower() == 'no' else True)
# Converters below run on non-null values only; other rows become NaN on assignment.
df_ami17_generic['male'] = df_ami17_generic['sex'].dropna().apply(lambda x: True if str(x).lower() == 'male' else (False if str(x).lower() == 'female' else float('nan')))
df_ami17_generic['yearofbirth'] = df_ami17_generic['yearofbirth'].apply(val2year)
df_ami17_generic['typeofbike'] = df_ami17_generic['typeofbike'].dropna().apply(val2utf8)
df_ami17_generic['typeoftrip'] = df_ami17_generic['typeoftrip'].dropna().apply(val2utf8)

# Drop the raw columns that were replaced above or are not carried forward.
df_ami17_generic = df_ami17_generic.drop(columns=['uploaded', 'sex', 'timestamp', 'startdate', 'starttime', 'duration', 'maxspeed', 'totallength', 'lengthvalid', 'avgspeed', 'distance'])

# Joining generic data with gps data.
print('Shape before: ', df_ami17_generic.shape)

df_ami17_generic = df_join_generic_with_gps(df_ami17_generic, df_ami17)

print('Shape after: ', df_ami17_generic.shape)

Shape before:  (2350, 6)
Shape after:  (1622, 13)


### Removing overall columns & records

In [7]:
''' DETAIL '''

print('Shape before. 2016:', df_ami16.shape, '2017:', df_ami17.shape)

# Finalise each year's detail frame against its own joined generic frame.
df_ami16, df_ami17 = (
    get_df_detail_final(df_ami16, df_ami16_generic),
    get_df_detail_final(df_ami17, df_ami17_generic),
)

print('Shape after. 2016:', df_ami16.shape, '2017:', df_ami17.shape)

Shape before. 2016: (302537, 12) 2017: (857008, 12)
Shape after. 2016: (270679, 7) 2017: (760805, 7)


In [8]:
# GENERIC

# Labels name the years actually held by the frames (2016 / 2017).
print('Shape before. 2016:', df_ami16_generic.shape, '2017:', df_ami17_generic.shape)

# The list names the survey columns to carry forward for each year.
df_ami16_generic = get_df_generic_final(df_ami16_generic, ['tracktype', 'source', 'profession', 'male', 'frequentuser', 'zip', 'yearofbirth', 'valid'])
df_ami17_generic = get_df_generic_final(df_ami17_generic, ['typeofbike', 'typeoftrip', 'male', 'yearofbirth', 'valid'])

print('Shape after. 2016:', df_ami16_generic.shape, '2017:', df_ami17_generic.shape)

Shape before. 2015: (1749, 18) 2016: (1622, 13)
Shape before. 2015: (1749, 16) 2016: (1622, 13)


### Datasets concatenation

In [9]:
# Stack both Amiens detail frames and renumber the rows from zero.
ami_detail_by_year = [df_ami16, df_ami17]
df_ami = pd.concat(ami_detail_by_year, ignore_index=True)
df_ami.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031484 entries, 0 to 1031483
Data columns (total 7 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   tripid     1031484 non-null  object 
 1   latitude   1031484 non-null  float64
 2   longitude  1031484 non-null  float64
 3   timestamp  1031484 non-null  float64
 4   stop       1031484 non-null  float64
 5   distance   1031484 non-null  float64
 6   duration   1031484 non-null  float64
dtypes: float64(6), object(1)
memory usage: 55.1+ MB


In [10]:
# Stack both Amiens generic frames; columns missing in one year are filled with NaN.
ami_generic_by_year = [df_ami16_generic, df_ami17_generic]
df_ami_generic = pd.concat(ami_generic_by_year, ignore_index=True)
df_ami_generic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3371 entries, 0 to 3370
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tripid         3371 non-null   object 
 1   speedmin       3371 non-null   float64
 2   speedmax       3371 non-null   float64
 3   speedmean      3371 non-null   float64
 4   speedavg_real  3371 non-null   float64
 5   distance       3371 non-null   float64
 6   startts        3371 non-null   float64
 7   endts          3371 non-null   float64
 8   tracktype      1749 non-null   object 
 9   source         1749 non-null   object 
 10  profession     1292 non-null   object 
 11  male           1850 non-null   object 
 12  frequentuser   1749 non-null   object 
 13  zip            845 non-null    object 
 14  yearofbirth    1947 non-null   object 
 15  valid          3371 non-null   bool   
 16  typeofbike     1131 non-null   object 
 17  typeoftrip     1142 non-null   object 
dtypes: bool(

### Saving operations

In [11]:
# Write both Amiens frames as ';'-separated CSV without the index column.
ami_csv_options = dict(index=False, sep=';')
df_ami.to_csv(f'{DATA_AFTER_PREPARATION_DIR}/{SHORT_NAME}.csv', **ami_csv_options)
df_ami_generic.to_csv(f'{DATA_AFTER_PREPARATION_DIR}/{SHORT_NAME}_generic.csv', **ami_csv_options)

# PL Wroclaw

In [12]:
from tqdm import tqdm
from _lib.settings import DATA_ORIGIN_WROCLAW_DIR


# City code for Wroclaw: prefixed to every trip id and used as the output file stem below.
SHORT_NAME = 'wro'

### 2015

In [13]:
# Load the 2015 Wroclaw detail file.
# NOTE(review): skiprows drops one specific line — presumably a malformed record; confirm.
df_wro15 = pd.read_csv(f'{DATA_ORIGIN_WROCLAW_DIR}/detail_2015.csv', encoding='windows-1250', skiprows=[9627453])

print('Shape before: ', df_wro15.shape)

# Column names normalization: drop spaces, lower-case.
df_wro15.columns = [cname.replace(' ', '').lower() for cname in df_wro15.columns]

# Column data normalization.
# `.str.replace` removes spaces inside each id; `Series.replace(' ', '')` would only
# match cells that are exactly ' ' and leave embedded spaces in place.
df_wro15['tripid'] = SHORT_NAME + df_wro15['tripid'].astype(str).str.replace(' ', '', regex=False)

df_wro15 = df_wro15.astype({'latitude': 'float', 'longitude': 'float'})
df_wro15 = remove_substandard_trips(df_wro15)

# 'false' / 'nan' markers become NaN; everything else is rounded to whole seconds.
tqdm.pandas(desc='timestamp')
df_wro15['timestamp'] = df_wro15['timestamp'].progress_apply(lambda x: float('nan') if str(x).lower() in ['false', 'nan'] else round(datetime.fromtimestamp(float(x)).timestamp()))

# Second pass: runs again now that unparsable timestamps have been turned into NaN.
df_wro15 = remove_substandard_trips(df_wro15)
df_wro15 = df_calc_basic(df_wro15)

print('Shape after: ', df_wro15.shape)

  exec(code_obj, self.user_global_ns, self.user_ns)


Shape before:  (20931869, 9)
Removed 21 substandard trips.


timestamp: 100%|██████████| 20931208/20931208 [00:38<00:00, 545502.85it/s]


Removed 1 substandard trips.


distance: 100%|██████████| 20930880/20930880 [01:14<00:00, 282470.99it/s]
duration: 100%|██████████| 20930880/20930880 [00:53<00:00, 389124.88it/s]
start: 100%|██████████| 16257363/16257363 [00:36<00:00, 449012.16it/s]
end: 100%|██████████| 16257363/16257363 [00:36<00:00, 441131.85it/s]
stop: 100%|██████████| 16257363/16257363 [00:41<00:00, 390253.95it/s]


Shape after:  (16257363, 13)


In [14]:
df_wro15_generic = pd.read_csv(f'{DATA_ORIGIN_WROCLAW_DIR}/generic_2015.csv')

''' Column names normalization '''
df_wro15_generic.columns = [header.replace(' ', '').lower() for header in df_wro15_generic.columns]

''' Column data normalization '''
# Converters are applied to the non-null cells only; assignment re-aligns on the
# index, which leaves NaN wherever the source value was missing.
df_wro15_generic['tripid'] = SHORT_NAME + df_wro15_generic['tripid'].apply(lambda raw_id: raw_id.replace(' ', ''))
df_wro15_generic['distance'] = df_wro15_generic['distance'].astype(float)
df_wro15_generic['valid'] = df_wro15_generic.loc[df_wro15_generic['ecc'].notna(), 'ecc'].apply(lambda ecc: not (ecc == 0))
df_wro15_generic['avgspeed'] = df_wro15_generic['avgspeed'].astype(float)
df_wro15_generic['tracktype'] = df_wro15_generic.loc[df_wro15_generic['tracktype'].notna(), 'tracktype'].apply(val2utf8)
df_wro15_generic['male'] = df_wro15_generic.loc[df_wro15_generic['sex'].notna(), 'sex'].apply(
    lambda sex: {'m': True, 'f': False}.get(str(sex).lower(), float('nan'))
)
df_wro15_generic['yearofbirth'] = df_wro15_generic['year'].apply(val2year)
df_wro15_generic['profession'] = df_wro15_generic.loc[df_wro15_generic['profession'].notna(), 'profession'].apply(val2utf8)
df_wro15_generic['frequentuser'] = df_wro15_generic.loc[df_wro15_generic['frequentuser'].notna(), 'frequentuser'].apply(
    lambda answer: answer.lower() != 'no'
)
df_wro15_generic['zip'] = df_wro15_generic.loc[df_wro15_generic['zip'].notna(), 'zip'].apply(val2zip)
df_wro15_generic['source'] = df_wro15_generic.loc[df_wro15_generic['source'].notna(), 'source'].apply(val2utf8)

# Remove the raw columns that were replaced above or are not carried forward.
df_wro15_generic.drop(columns=['timestamp', 'startdt', 'ecc', 'sex', 'year', 'distance', 'avgspeed'], inplace=True)

''' Joinig generic data with gps data '''
print('Shape before: ', df_wro15_generic.shape)

df_wro15_generic = df_join_generic_with_gps(df_wro15_generic, df_wro15)

print('Shape after: ', df_wro15_generic.shape)

Shape before:  (49941, 9)
Shape after:  (49167, 16)


### 2016

In [15]:
# Load the 2016 Wroclaw detail file.
# NOTE(review): skiprows drops one specific line — presumably a malformed record; confirm.
df_wro16 = pd.read_csv(f'{DATA_ORIGIN_WROCLAW_DIR}/detail_2016.csv', encoding='windows-1250', skiprows=[11184484])

print('Shape before: ', df_wro16.shape)

# Column names normalization: drop spaces, lower-case.
df_wro16.columns = [cname.replace(' ', '').lower() for cname in df_wro16.columns]

# Column data normalization.
# `.str.replace` removes spaces inside each id; `Series.replace(' ', '')` would only
# match cells that are exactly ' ' and leave embedded spaces in place.
df_wro16['tripid'] = SHORT_NAME + df_wro16['tripid'].astype(str).str.replace(' ', '', regex=False)

df_wro16 = df_wro16.astype({'latitude': 'float', 'longitude': 'float'})
df_wro16 = remove_substandard_trips(df_wro16)

# 'false' / 'nan' markers become NaN; everything else is rounded to whole seconds.
tqdm.pandas(desc='timestamp')
df_wro16['timestamp'] = df_wro16['timestamp'].progress_apply(lambda x: float('nan') if str(x).lower() in ['false', 'nan'] else round(datetime.fromtimestamp(float(x)).timestamp()))

# Second pass: runs again now that unparsable timestamps have been turned into NaN.
df_wro16 = remove_substandard_trips(df_wro16)
df_wro16 = df_calc_basic(df_wro16)

print('Shape after: ', df_wro16.shape)

  exec(code_obj, self.user_global_ns, self.user_ns)


Shape before:  (22455984, 8)
Removed 2 substandard trips.


timestamp: 100%|██████████| 22453198/22453198 [00:41<00:00, 540933.95it/s]


Removed 0 substandard trips.


distance: 100%|██████████| 22453198/22453198 [01:20<00:00, 277667.38it/s]
duration: 100%|██████████| 22453198/22453198 [00:57<00:00, 387965.16it/s]
start: 100%|██████████| 18056401/18056401 [00:39<00:00, 453814.90it/s]
end: 100%|██████████| 18056401/18056401 [00:40<00:00, 447532.20it/s]
stop: 100%|██████████| 18056401/18056401 [00:47<00:00, 383406.75it/s]


Shape after:  (18056401, 12)


In [16]:
# Load the 2016 Wroclaw generic (per-trip) file.
df_wro16_generic = pd.read_csv(f'{DATA_ORIGIN_WROCLAW_DIR}/generic_2016.csv')

# Column names normalization: drop spaces, lower-case.
df_wro16_generic.columns = [cname.replace(' ', '').lower() for cname in df_wro16_generic.columns]

# Column data normalization. Converters run on non-null values only; the assignment
# re-aligns on the index, so rows with a missing source value end up as NaN.
df_wro16_generic['tripid'] = SHORT_NAME + df_wro16_generic['tripid'].apply(lambda x: x.replace(' ', ''))
df_wro16_generic['distance'] = df_wro16_generic['distance'].astype(float)
df_wro16_generic['valid'] = df_wro16_generic['ecc'].dropna().apply(lambda x: False if x == 0 else True)
df_wro16_generic['avgspeed'] = df_wro16_generic['avgspeed'].astype(float)
df_wro16_generic['tracktype'] = df_wro16_generic['tracktype'].dropna().apply(val2utf8)
df_wro16_generic['male'] = df_wro16_generic['sex'].dropna().apply(lambda x: True if str(x).lower() == 'm' else (False if str(x).lower() == 'f' else float('nan')))
df_wro16_generic['yearofbirth'] = df_wro16_generic['year'].apply(val2year)
df_wro16_generic['profession'] = df_wro16_generic['profession'].dropna().apply(val2utf8)
# 'no' / 'nie' mean "not a frequent user"; every other answer is True
# (the else-branch used to return False as well, making the column constant).
df_wro16_generic['frequentuser'] = df_wro16_generic['frequentuser'].dropna().apply(lambda x: False if x.lower() in ['no', 'nie'] else True)
df_wro16_generic['zip'] = df_wro16_generic['zip'].dropna().apply(val2zip)
df_wro16_generic['source'] = df_wro16_generic['source'].dropna().apply(val2utf8)
df_wro16_generic['typeofbike'] = df_wro16_generic['typeofbike'].dropna().apply(val2utf8)
# The source column is spelled 'tipeoftrip'; it is re-published under the corrected name.
df_wro16_generic['typeoftrip'] = df_wro16_generic['tipeoftrip'].dropna().apply(val2utf8)

# Drop the raw columns that were replaced above or are not carried forward.
df_wro16_generic = df_wro16_generic.drop(columns=['timestamp', 'startdt', 'ecc', 'sex', 'year', 'distance', 'avgspeed'])

# Joining generic data with gps data.
print('Shape before: ', df_wro16_generic.shape)

df_wro16_generic = df_join_generic_with_gps(df_wro16_generic, df_wro16)

print('Shape after: ', df_wro16_generic.shape)

Shape before:  (42384, 12)
Shape after:  (41288, 19)


### Removing overall columns & records

In [17]:
''' DETAIL '''

wro15_shape_in, wro16_shape_in = df_wro15.shape, df_wro16.shape
print('Shape before. 2015:', wro15_shape_in, '2016:', wro16_shape_in)

# Finalise one year at a time so each old frame can be released before the next call.
df_wro15 = get_df_detail_final(df_wro15, df_wro15_generic)
df_wro16 = get_df_detail_final(df_wro16, df_wro16_generic)

wro15_shape_out, wro16_shape_out = df_wro15.shape, df_wro16.shape
print('Shape after. 2015:', wro15_shape_out, '2016:', wro16_shape_out)

Shape before. 2015: (16257363, 13) 2016: (18056401, 12)
Shape after. 2015: (16070202, 7) 2016: (17822688, 7)


In [18]:
# GENERIC

print('Shape before. 2015:', df_wro15_generic.shape, '2016:', df_wro16_generic.shape)

# The list names the survey columns to carry forward for each year.
df_wro15_generic = get_df_generic_final(df_wro15_generic, ['tracktype', 'source', 'profession', 'male', 'frequentuser', 'zip', 'yearofbirth', 'valid'])
df_wro16_generic = get_df_generic_final(df_wro16_generic, ['tracktype', 'typeofbike', 'typeoftrip', 'source', 'profession', 'male', 'frequentuser', 'zip', 'yearofbirth', 'valid'])

# This line reports the result, so it is labelled "after".
print('Shape after. 2015:', df_wro15_generic.shape, '2016:', df_wro16_generic.shape)

Shape before. 2015: (49167, 16) 2016: (41288, 19)
Shape before. 2015: (49167, 16) 2016: (41288, 18)


### Datasets concatenation

In [19]:
# Stack both Wroclaw detail frames and renumber the rows from zero.
wro_detail_by_year = [df_wro15, df_wro16]
df_wro = pd.concat(wro_detail_by_year, ignore_index=True)
df_wro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33892890 entries, 0 to 33892889
Data columns (total 7 columns):
 #   Column     Dtype  
---  ------     -----  
 0   tripid     object 
 1   latitude   float64
 2   longitude  float64
 3   timestamp  float64
 4   stop       float64
 5   distance   float64
 6   duration   float64
dtypes: float64(6), object(1)
memory usage: 1.8+ GB


In [20]:
# Stack both Wroclaw generic frames; columns missing in one year are filled with NaN.
wro_generic_by_year = [df_wro15_generic, df_wro16_generic]
df_wro_generic = pd.concat(wro_generic_by_year, ignore_index=True)
df_wro_generic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90455 entries, 0 to 90454
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tripid         90455 non-null  object 
 1   speedmin       90455 non-null  float64
 2   speedmax       90455 non-null  float64
 3   speedmean      90455 non-null  float64
 4   speedavg_real  90455 non-null  float64
 5   distance       90455 non-null  float64
 6   startts        90455 non-null  float64
 7   endts          90455 non-null  float64
 8   tracktype      90455 non-null  object 
 9   source         90455 non-null  object 
 10  profession     67426 non-null  object 
 11  male           79247 non-null  object 
 12  frequentuser   90434 non-null  object 
 13  zip            54733 non-null  object 
 14  yearofbirth    74260 non-null  object 
 15  valid          90455 non-null  bool   
 16  typeofbike     20551 non-null  object 
 17  typeoftrip     20532 non-null  object 
dtypes: boo

### Saving operations

In [21]:
# Write both Wroclaw frames as ';'-separated CSV without the index column.
wro_csv_options = dict(index=False, sep=';')
df_wro.to_csv(f'{DATA_AFTER_PREPARATION_DIR}/{SHORT_NAME}.csv', **wro_csv_options)
df_wro_generic.to_csv(f'{DATA_AFTER_PREPARATION_DIR}/{SHORT_NAME}_generic.csv', **wro_csv_options)

# SE Orebro

In [22]:
from _lib.settings import DATA_ORIGIN_OREBRO_DIR


# City code for Orebro: passed to read_gpx and used as the output file stem below.
SHORT_NAME = 'ore'

### 2015

In [23]:
# read_gpx yields two frames for the 2015 folder: detail first, generic second.
ore15_frames = read_gpx(f'{DATA_ORIGIN_OREBRO_DIR}/2015', SHORT_NAME)
df_ore15, df_ore15_generic = ore15_frames

(df_ore15.shape, df_ore15_generic.shape)

100%|██████████| 2156/2156 [00:43<00:00, 49.50it/s]


((648689, 5), (2156, 3))

In [24]:
print('Shape before: ', df_ore15.shape)

# Filter out substandard trips first, then compute the basic per-point columns.
df_ore15 = df_calc_basic(remove_substandard_trips(df_ore15))

print('Shape after: ', df_ore15.shape)

Shape before:  (648689, 5)
Removed 9 substandard trips.


distance: 100%|██████████| 648398/648398 [00:02<00:00, 264953.65it/s]
duration: 100%|██████████| 648398/648398 [00:01<00:00, 362511.45it/s]
start: 100%|██████████| 508976/508976 [00:01<00:00, 419804.70it/s]
end: 100%|██████████| 508976/508976 [00:01<00:00, 402279.90it/s]
stop: 100%|██████████| 508976/508976 [00:01<00:00, 352895.86it/s]

Shape after:  (508976, 11)





In [25]:
''' Joinig generic data with gps data '''
ore15_generic_shape_in = df_ore15_generic.shape
print('Shape before: ', ore15_generic_shape_in)

# Combine the generic frame with the processed detail frame.
df_ore15_generic = df_join_generic_with_gps(df_ore15_generic, df_ore15)

ore15_generic_shape_out = df_ore15_generic.shape
print('Shape after: ', ore15_generic_shape_out)

Shape before:  (2156, 3)
Shape after:  (2110, 10)


### 2016

In [26]:
# df_ore16, df_ore16_generic = read_gpx(f'{DATA_ORIGIN_OREBRO_DIR}/2016', SHORT_NAME)

# df_ore16 = remove_substandard_trips(df_ore16)

# df_ore16.shape, df_ore16_generic.shape

In [27]:
# print('Shape before: ', df_ore16.shape)

# df_ore16 = remove_substandard_trips(df_ore16)
# df_ore16 = df_calc_basic(df_ore16)

# ''' Removing points with 0 distance passed '''
# df_ore16 = df_ore16[(df_ore16['distance'] != 0) | (df_ore16['end']) | (df_ore16['start'])]

# print('Shape after: ', df_ore16.shape)

In [28]:
# ''' Joinig generic data with gps data '''
# print('Shape before: ', df_ore16_generic.shape)

# df_ore16_generic = df_join_generic_with_gps(df_ore16_generic, df_ore16)

# print('Shape after: ', df_ore16_generic.shape)

### Removing overall columns & records

In [29]:
''' DETAIL '''

ore15_shape_in = df_ore15.shape
print('Shape before:', ore15_shape_in)

# Finalise the detail frame against the joined generic frame.
df_ore15 = get_df_detail_final(df_ore15, df_ore15_generic)

ore15_shape_out = df_ore15.shape
print('Shape after:', ore15_shape_out)

Shape before: (508976, 11)
Shape after: (506185, 7)


In [30]:
''' GENERIC '''

ore15_generic_shape_in = df_ore15_generic.shape
print('Shape before:', ore15_generic_shape_in)

# 'email' is the only extra column requested for Orebro.
ore_extra_columns = ['email']
df_ore15_generic = get_df_generic_final(df_ore15_generic, ore_extra_columns)

ore15_generic_shape_out = df_ore15_generic.shape
print('Shape after:', ore15_generic_shape_out)

Shape before: (2110, 10)
Shape after: (2110, 9)


In [31]:
# Summarise the final Orebro detail frame (index range, columns, dtypes, memory).
df_ore15.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 506185 entries, 0 to 508975
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   tripid     506185 non-null  object 
 1   latitude   506185 non-null  float64
 2   longitude  506185 non-null  float64
 3   timestamp  506185 non-null  float64
 4   stop       506185 non-null  float64
 5   distance   506185 non-null  float64
 6   duration   506185 non-null  float64
dtypes: float64(6), object(1)
memory usage: 30.9+ MB


In [32]:
# Summarise the final Orebro generic frame (index range, columns, dtypes, memory).
df_ore15_generic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2110 entries, 0 to 2109
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tripid         2110 non-null   object 
 1   speedmin       2110 non-null   float64
 2   speedmax       2110 non-null   float64
 3   speedmean      2110 non-null   float64
 4   speedavg_real  2110 non-null   float64
 5   distance       2110 non-null   float64
 6   startts        2110 non-null   float64
 7   endts          2110 non-null   float64
 8   email          2110 non-null   object 
dtypes: float64(7), object(2)
memory usage: 148.5+ KB


### Saving operations

In [33]:
# Write both Orebro frames as ';'-separated CSV without the index column.
ore_csv_options = dict(index=False, sep=';')
df_ore15.to_csv(f'{DATA_AFTER_PREPARATION_DIR}/{SHORT_NAME}.csv', **ore_csv_options)
df_ore15_generic.to_csv(f'{DATA_AFTER_PREPARATION_DIR}/{SHORT_NAME}_generic.csv', **ore_csv_options)

# DE Oldenburg

In [34]:
import os
import numpy as np

from tqdm import tqdm
from _lib.settings import DATA_ORIGIN_OLDENBURG_DIR


# City code for Oldenburg: prefixed to every trip id and used as the output file stem below.
SHORT_NAME = 'old'

### Reading CSV files 2020

In [35]:
fpaths = get_filepaths(f'{DATA_ORIGIN_OLDENBURG_DIR}/2020', '.csv')

# Per-point accumulators, one entry per GPS row across all trip files.
# (Named so they do not shadow the `id` builtin.)
trip_ids, lats, lons, tss = [], [], [], []

for fpath in tqdm(fpaths):
    # Trip id = city code + the part of the file name after the last '-' (extension cut off).
    tripid = SHORT_NAME + fpath[:-4].split('-')[-1]
    df_trip = pd.read_csv(fpath, sep=';')
    df_trip['timestamp'] = pd.to_datetime(df_trip['measured_date'])
    df_trip['timestamp'] = df_trip['timestamp'].apply(lambda x: round(datetime.timestamp(x)))

    # extend() appends in place; `lst = lst + ...` re-copies the whole list on every file.
    trip_ids.extend([tripid] * df_trip.shape[0])
    lats.extend(df_trip['latitude'].tolist())
    lons.extend(df_trip['longitude'].tolist())
    tss.extend(df_trip['timestamp'].tolist())

# Build the frame column-wise instead of via a transposed all-string ndarray.
df_old = pd.DataFrame({'tripid': trip_ids, 'latitude': lats, 'longitude': lons, 'timestamp': tss})
df_old = df_old.astype({'latitude': 'float', 'longitude': 'float', 'timestamp': 'float'})

df_old.shape

100%|██████████| 1181/1181 [01:01<00:00, 19.08it/s]


(908080, 4)

### Processing

In [36]:
print('Shape before: ', df_old.shape)

# Filter out substandard trips first, then compute the basic per-point columns.
df_old = df_calc_basic(remove_substandard_trips(df_old))

print('Shape after: ', df_old.shape)

Shape before:  (908080, 4)
Removed 146 substandard trips.


distance: 100%|██████████| 732216/732216 [00:02<00:00, 273283.75it/s]
duration: 100%|██████████| 732216/732216 [00:01<00:00, 373182.65it/s]
start: 100%|██████████| 730625/730625 [00:01<00:00, 431094.75it/s]
end: 100%|██████████| 730625/730625 [00:01<00:00, 427345.20it/s]
stop: 100%|██████████| 730625/730625 [00:01<00:00, 373584.88it/s]

Shape after:  (730625, 10)





In [37]:
df_old_generic = calc_context(df_old)

print('Shape before: ', df_old_generic.shape)

# Rows count as duplicates when they agree on every column except the start/end timestamps.
old_dedup_columns = list(set(df_old_generic.columns.tolist()).difference(['startts', 'endts']))
df_old_generic.drop_duplicates(subset=old_dedup_columns, keep='first', inplace=True)

# Move the index back into a regular column.
df_old_generic = df_old_generic.reset_index()

print('Shape after: ', df_old_generic.shape)

Shape before:  (1034, 7)
Shape after:  (1034, 8)


### Removing overall columns & records

In [38]:
''' DETAIL '''

old_shape_in = df_old.shape
print('Shape before:', old_shape_in)

# Finalise the detail frame against the generic frame.
df_old = get_df_detail_final(df_old, df_old_generic)

old_shape_out = df_old.shape
print('Shape after:', old_shape_out)

Shape before: (730625, 10)
Shape after: (730625, 7)


In [39]:
''' GENERIC '''

old_generic_shape_in = df_old_generic.shape
print('Shape before:', old_generic_shape_in)

# No extra survey columns are requested for Oldenburg.
old_extra_columns = []
df_old_generic = get_df_generic_final(df_old_generic, old_extra_columns)

old_generic_shape_out = df_old_generic.shape
print('Shape after:', old_generic_shape_out)

Shape before: (1034, 8)
Shape after: (1034, 8)


In [40]:
# Summarise the final Oldenburg detail frame (index range, columns, dtypes, memory).
df_old.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 730625 entries, 0 to 730624
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   tripid     730625 non-null  object 
 1   latitude   730625 non-null  float64
 2   longitude  730625 non-null  float64
 3   timestamp  730625 non-null  float64
 4   stop       730625 non-null  float64
 5   distance   730625 non-null  float64
 6   duration   730625 non-null  float64
dtypes: float64(6), object(1)
memory usage: 44.6+ MB


In [41]:
# Summarise the final Oldenburg generic frame (index range, columns, dtypes, memory).
df_old_generic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1034 entries, 0 to 1033
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tripid         1034 non-null   object 
 1   speedmin       1034 non-null   float64
 2   speedmax       1034 non-null   float64
 3   speedmean      1034 non-null   float64
 4   speedavg_real  1034 non-null   float64
 5   distance       1034 non-null   float64
 6   startts        1034 non-null   float64
 7   endts          1034 non-null   float64
dtypes: float64(7), object(1)
memory usage: 64.8+ KB


### Saving operations

In [42]:
# Write both Oldenburg frames as ';'-separated CSV without the index column.
old_csv_options = dict(index=False, sep=';')
df_old.to_csv(f'{DATA_AFTER_PREPARATION_DIR}/{SHORT_NAME}.csv', **old_csv_options)
df_old_generic.to_csv(f'{DATA_AFTER_PREPARATION_DIR}/{SHORT_NAME}_generic.csv', **old_csv_options)

# DE Berlin

In [43]:
import os
import numpy as np

from tqdm import tqdm
from _lib.settings import DATA_ORIGIN_BERLIN_DIR


# City code for Berlin: prefixed to every trip id and used as the output file stem below.
SHORT_NAME = 'ber'

### Reading files 2020 - 2021

In [44]:
fpaths = get_filepaths(f'{DATA_ORIGIN_BERLIN_DIR}/2020_2021', '')

# Per-point accumulators, one entry per GPS row across all ride files.
# (Named so they do not shadow the `id` builtin.)
trip_ids, lats, lons, tss = [], [], [], []

for fpath in tqdm(fpaths):
    # Trip id = city code + the part of the file name after the last '-'.
    tripid = SHORT_NAME + fpath.split('/')[-1].split('-')[-1]
    with open(fpath) as fr:
        in_gps_section = False
        # Stream the file line by line instead of loading it whole with readlines().
        for line in fr:
            if not in_gps_section:
                # Everything before the GPS header row is skipped; data starts on the next line.
                in_gps_section = 'lat,lon,X,Y,Z,timeStamp' in line
                continue
            fields = line.split(',')
            # Rows with an empty latitude field carry no GPS fix and are skipped.
            if fields[0] != '':
                trip_ids.append(tripid)
                lats.append(fields[0])
                lons.append(fields[1])
                # Drops the last three characters of timeStamp —
                # presumably milliseconds -> seconds; TODO confirm.
                tss.append(fields[5][:-3])

# Build the frame column-wise instead of via a transposed all-string ndarray.
df_ber = pd.DataFrame({'tripid': trip_ids, 'latitude': lats, 'longitude': lons, 'timestamp': tss})
df_ber = df_ber.astype({'latitude': 'float', 'longitude': 'float', 'timestamp': 'float'})

df_ber.shape

100%|██████████| 22931/22931 [03:37<00:00, 105.55it/s]


(12977703, 4)

In [45]:
print('Shape before: ', df_ber.shape)

# Filter out substandard trips first, then compute the basic per-point columns.
df_ber = df_calc_basic(remove_substandard_trips(df_ber))

print('Shape after: ', df_ber.shape)

Shape before:  (12977703, 4)
Removed 0 substandard trips.


distance: 100%|██████████| 12977703/12977703 [00:45<00:00, 283482.17it/s]
duration: 100%|██████████| 12977703/12977703 [00:32<00:00, 401044.78it/s]
start: 100%|██████████| 10893237/10893237 [00:24<00:00, 450675.76it/s]
end: 100%|██████████| 10893237/10893237 [00:24<00:00, 450759.78it/s]
stop: 100%|██████████| 10893237/10893237 [00:27<00:00, 392302.51it/s]

Shape after:  (10893237, 10)





In [46]:
df_ber_generic = calc_context(df_ber)

print('Shape before: ', df_ber_generic.shape)

# Rows count as duplicates when they agree on every column except the start/end timestamps.
ber_dedup_columns = list(set(df_ber_generic.columns.tolist()).difference(['startts', 'endts']))
df_ber_generic.drop_duplicates(subset=ber_dedup_columns, keep='first', inplace=True)

# Move the index back into a regular column.
df_ber_generic = df_ber_generic.reset_index()

print('Shape after: ', df_ber_generic.shape)

Shape before:  (22886, 7)
Shape after:  (22544, 8)


### Removing overall columns & records

In [47]:
''' DETAIL '''

ber_shape_in = df_ber.shape
print('Shape before:', ber_shape_in)

# Finalise the detail frame against the de-duplicated generic frame.
df_ber = get_df_detail_final(df_ber, df_ber_generic)

ber_shape_out = df_ber.shape
print('Shape after:', ber_shape_out)

Shape before: (10893237, 10)
Shape after: (10704635, 7)


In [48]:
''' GENERIC '''

ber_generic_shape_in = df_ber_generic.shape
print('Shape before:', ber_generic_shape_in)

# No extra survey columns are requested for Berlin.
ber_extra_columns = []
df_ber_generic = get_df_generic_final(df_ber_generic, ber_extra_columns)

ber_generic_shape_out = df_ber_generic.shape
print('Shape after:', ber_generic_shape_out)

Shape before: (22544, 8)
Shape after: (22544, 8)


In [49]:
# Summarise the final Berlin detail frame (index range, columns, dtypes, memory).
df_ber.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10704635 entries, 0 to 10893236
Data columns (total 7 columns):
 #   Column     Dtype  
---  ------     -----  
 0   tripid     object 
 1   latitude   float64
 2   longitude  float64
 3   timestamp  float64
 4   stop       float64
 5   distance   float64
 6   duration   float64
dtypes: float64(6), object(1)
memory usage: 653.4+ MB


In [50]:
# Summarise the final Berlin generic frame (index range, columns, dtypes, memory).
df_ber_generic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22544 entries, 0 to 22543
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tripid         22544 non-null  object 
 1   speedmin       22544 non-null  float64
 2   speedmax       22544 non-null  float64
 3   speedmean      22544 non-null  float64
 4   speedavg_real  22544 non-null  float64
 5   distance       22544 non-null  float64
 6   startts        22544 non-null  float64
 7   endts          22544 non-null  float64
dtypes: float64(7), object(1)
memory usage: 1.4+ MB


### Saving operations

In [51]:
# Write both Berlin frames as ';'-separated CSV without the index column.
ber_csv_options = dict(index=False, sep=';')
df_ber.to_csv(f'{DATA_AFTER_PREPARATION_DIR}/{SHORT_NAME}.csv', **ber_csv_options)
df_ber_generic.to_csv(f'{DATA_AFTER_PREPARATION_DIR}/{SHORT_NAME}_generic.csv', **ber_csv_options)

# PL Gdansk

In [52]:
from tqdm import tqdm
from _lib.settings import DATA_ORIGIN_GDANSK_DIR


# Dataset prefix: prepended to every tripid and used in the output file names below.
SHORT_NAME = 'gda'

### 2015

In [53]:
# Load the 2015 Gdansk detail CSV and normalise it for the shared pipeline.
df_gda15 = pd.read_csv(f'{DATA_ORIGIN_GDANSK_DIR}/detail_2015.csv', encoding='windows-1250')

print('Shape before: ', df_gda15.shape)

''' Column names normalization '''
df_gda15.columns = [cname.replace(' ', '').lower() for cname in df_gda15.columns]

''' Column data normalization '''
# Series.replace(' ', '') only swaps cells whose whole value is ' '; the .str
# accessor is required to strip spaces *inside* each id, which is what the
# generic frame does, so both frames end up with matching join keys.
df_gda15['tripid'] = SHORT_NAME + df_gda15['tripid'].astype(str).str.replace(' ', '', regex=False)

df_gda15 = df_gda15.astype({'latitude': 'float', 'longitude': 'float'})
df_gda15 = remove_substandard_trips(df_gda15)

# Raw timestamps may contain 'false'/'nan' markers: those become NaN, everything
# else is parsed as an epoch value and rounded to whole seconds.
tqdm.pandas(desc='timestamp')
df_gda15['timestamp'] = df_gda15['timestamp'].progress_apply(lambda x: float('nan') if str(x).lower() in ['false', 'nan'] else round(datetime.fromtimestamp(float(x)).timestamp()))

# Second pass after the timestamp clean-up (the recorded run removes further trips here).
df_gda15 = remove_substandard_trips(df_gda15)
df_gda15 = df_calc_basic(df_gda15)

print('Shape after: ', df_gda15.shape)

  exec(code_obj, self.user_global_ns, self.user_ns)


Shape before:  (26762382, 9)
Removed 21 substandard trips.


timestamp: 100%|██████████| 26759422/26759422 [00:49<00:00, 543998.33it/s]


Removed 2 substandard trips.


distance: 100%|██████████| 26754447/26754447 [01:35<00:00, 281090.42it/s]
duration: 100%|██████████| 26754447/26754447 [01:07<00:00, 395754.65it/s]
start: 100%|██████████| 22414159/22414159 [00:49<00:00, 454088.30it/s]
end: 100%|██████████| 22414159/22414159 [00:49<00:00, 456351.56it/s]
stop: 100%|██████████| 22414159/22414159 [00:57<00:00, 391996.17it/s]


Shape after:  (22414159, 13)


In [54]:
# Load the 2015 Gdansk generic (per-trip) CSV.
df_gda15_generic = pd.read_csv(f'{DATA_ORIGIN_GDANSK_DIR}/generic_2015.csv')

# Column names normalization: strip spaces and lower-case every header.
df_gda15_generic = df_gda15_generic.rename(columns=lambda cname: cname.replace(' ', '').lower())

# Column data normalization. Each derived column is computed on the non-null
# values only; assignment aligns on the index, so missing inputs stay missing.
df_gda15_generic['tripid'] = SHORT_NAME + df_gda15_generic['tripid'].apply(lambda trip_id: trip_id.replace(' ', ''))
df_gda15_generic['distance'] = df_gda15_generic['distance'].astype(float)
df_gda15_generic['valid'] = df_gda15_generic['ecc'].dropna().apply(lambda ecc: not (ecc == 0))
df_gda15_generic['avgspeed'] = df_gda15_generic['avgspeed'].astype(float)
df_gda15_generic['tracktype'] = df_gda15_generic['tracktype'].dropna().apply(val2utf8)
df_gda15_generic['male'] = df_gda15_generic['sex'].dropna().apply(
    lambda sex: {'m': True, 'f': False}.get(str(sex).lower(), float('nan'))
)
df_gda15_generic['yearofbirth'] = df_gda15_generic['year'].apply(val2year)
df_gda15_generic['profession'] = df_gda15_generic['profession'].dropna().apply(val2utf8)
df_gda15_generic['frequentuser'] = df_gda15_generic['frequentuser'].dropna().apply(lambda answer: answer.lower() != 'no')
df_gda15_generic['zip'] = df_gda15_generic['zip'].dropna().apply(val2zip)
df_gda15_generic['source'] = df_gda15_generic['source'].dropna().apply(val2utf8)

# Raw columns that have been replaced by normalised ones or are not carried forward.
df_gda15_generic = df_gda15_generic.drop(
    columns=['timestamp', 'startdt', 'ecc', 'sex', 'year', 'distance', 'avgspeed']
)

# Joining generic data with gps data
print('Shape before: ', df_gda15_generic.shape)

df_gda15_generic = df_join_generic_with_gps(df_gda15_generic, df_gda15)

print('Shape after: ', df_gda15_generic.shape)

Shape before:  (55279, 9)
Shape after:  (54267, 16)


### 2016

In [55]:
# Load the 2016 Gdansk detail CSV and normalise it for the shared pipeline.
# skiprows drops one specific line of the file.
# NOTE(review): presumably a malformed row — confirm against the raw file.
df_gda16 = pd.read_csv(f'{DATA_ORIGIN_GDANSK_DIR}/detail_2016.csv', encoding='windows-1250', skiprows=[11184484])

print('Shape before: ', df_gda16.shape)

''' Column names normalization '''
df_gda16.columns = [cname.replace(' ', '').lower() for cname in df_gda16.columns]

''' Column data normalization '''
# Series.replace(' ', '') only swaps cells whose whole value is ' '; the .str
# accessor is required to strip spaces *inside* each id, which is what the
# generic frame does, so both frames end up with matching join keys.
df_gda16['tripid'] = SHORT_NAME + df_gda16['tripid'].astype(str).str.replace(' ', '', regex=False)

df_gda16 = df_gda16.astype({'latitude': 'float', 'longitude': 'float'})
df_gda16 = remove_substandard_trips(df_gda16)

# Raw timestamps may contain 'false'/'nan' markers: those become NaN, everything
# else is parsed as an epoch value and rounded to whole seconds.
tqdm.pandas(desc='timestamp')
df_gda16['timestamp'] = df_gda16['timestamp'].progress_apply(lambda x: float('nan') if str(x).lower() in ['false', 'nan'] else round(datetime.fromtimestamp(float(x)).timestamp()))

# Second pass after the timestamp clean-up (the recorded run removes a further trip here).
df_gda16 = remove_substandard_trips(df_gda16)
df_gda16 = df_calc_basic(df_gda16)

print('Shape after: ', df_gda16.shape)

  exec(code_obj, self.user_global_ns, self.user_ns)


Shape before:  (48208935, 8)
Removed 1 substandard trips.


timestamp: 100%|██████████| 48207531/48207531 [01:28<00:00, 547134.84it/s]


Removed 1 substandard trips.


distance: 100%|██████████| 48204011/48204011 [03:17<00:00, 243628.14it/s]
duration: 100%|██████████| 48204011/48204011 [02:00<00:00, 400234.88it/s]
start: 100%|██████████| 40033593/40033593 [01:29<00:00, 447778.74it/s]
end: 100%|██████████| 40033593/40033593 [01:29<00:00, 449444.42it/s]
stop: 100%|██████████| 40033593/40033593 [01:42<00:00, 392094.01it/s]


Shape after:  (40033593, 12)


In [56]:
# Load the 2016 Gdansk generic (per-trip) CSV.
df_gda16_generic = pd.read_csv(f'{DATA_ORIGIN_GDANSK_DIR}/generic_2016.csv')

''' Column names normalization '''
df_gda16_generic.columns = [cname.replace(' ', '').lower() for cname in df_gda16_generic.columns]

''' Column data normalization '''
# Each derived column is computed on the non-null values only; assignment aligns
# on the index, so rows with a missing input stay missing.
df_gda16_generic['tripid'] = SHORT_NAME + df_gda16_generic['tripid'].apply(lambda x: x.replace(' ', ''))
df_gda16_generic['distance'] = df_gda16_generic['distance'].astype(float)
df_gda16_generic['valid'] = df_gda16_generic[df_gda16_generic['ecc'].notna()]['ecc'].apply(lambda x: False if x == 0 else True)
df_gda16_generic['avgspeed'] = df_gda16_generic['avgspeed'].astype(float)
df_gda16_generic['tracktype'] = df_gda16_generic[df_gda16_generic['tracktype'].notna()]['tracktype'].apply(val2utf8)
df_gda16_generic['male'] = df_gda16_generic[df_gda16_generic['sex'].notna()]['sex'].apply(lambda x: True if str(x).lower() == 'm' else (False if str(x).lower() == 'f' else float('nan')))
df_gda16_generic['yearofbirth'] = df_gda16_generic['year'].apply(val2year)
df_gda16_generic['profession'] = df_gda16_generic[df_gda16_generic['profession'].notna()]['profession'].apply(val2utf8)
# 'no' / 'nie' mean "not a frequent user"; any other answer counts as True.
# (Previously both branches returned False, so the flag was never True.)
df_gda16_generic['frequentuser'] = df_gda16_generic[df_gda16_generic['frequentuser'].notna()]['frequentuser'].apply(lambda x: False if x.lower() in ['no', 'nie'] else True)
df_gda16_generic['zip'] = df_gda16_generic[df_gda16_generic['zip'].notna()]['zip'].apply(val2zip)
df_gda16_generic['source'] = df_gda16_generic[df_gda16_generic['source'].notna()]['source'].apply(val2utf8)
df_gda16_generic['typeofbike'] = df_gda16_generic[df_gda16_generic['typeofbike'].notna()]['typeofbike'].apply(val2utf8)
# The source file spells this column 'tipeoftrip'; the normalised copy is stored
# under the corrected name.
df_gda16_generic['typeoftrip'] = df_gda16_generic[df_gda16_generic['tipeoftrip'].notna()]['tipeoftrip'].apply(val2utf8)

# Raw columns that have been replaced by normalised ones or are not carried forward.
df_gda16_generic.drop(['timestamp', 'startdt', 'ecc', 'sex', 'year', 'distance', 'avgspeed'], axis=1, inplace=True)

''' Joining generic data with gps data '''
print('Shape before: ', df_gda16_generic.shape)

df_gda16_generic = df_join_generic_with_gps(df_gda16_generic, df_gda16)

print('Shape after: ', df_gda16_generic.shape)

Shape before:  (88992, 12)
Shape after:  (85898, 19)


### Removing overall columns & records

In [57]:
''' DETAIL '''

# Shapes of both yearly detail frames before the final reduction.
print('Shape before. 2015:', df_gda15.shape, '2016:', df_gda16.shape)

# Reduce each year's detail frame with the help of that year's generic frame.
# The recorded output shows fewer rows and 7 columns for both years afterwards.
# NOTE(review): presumably keeps only rows whose trip survives in the generic
# frame — confirm in _lib.data_preparation.
df_gda15 = get_df_detail_final(df_gda15, df_gda15_generic)
df_gda16 = get_df_detail_final(df_gda16, df_gda16_generic)

print('Shape after. 2015:', df_gda15.shape, '2016:', df_gda16.shape)

Shape before. 2015: (22414159, 13) 2016: (40033593, 12)
Shape after. 2015: (22251102, 7) 2016: (39231475, 7)


In [58]:
''' GENERIC '''

print('Shape before. 2015:', df_gda15_generic.shape, '2016:', df_gda16_generic.shape)

# Finalise each year's generic frame. The list names the survey columns passed
# along for that year; 2016 additionally has 'typeofbike' and 'typeoftrip'.
# NOTE(review): presumably these are extra columns to keep — confirm in _lib.data_preparation.
df_gda15_generic = get_df_generic_final(df_gda15_generic, ['tracktype', 'source', 'profession', 'male', 'frequentuser', 'zip', 'yearofbirth', 'valid'])
df_gda16_generic = get_df_generic_final(df_gda16_generic, ['tracktype', 'typeofbike', 'typeoftrip', 'source', 'profession', 'male', 'frequentuser', 'zip', 'yearofbirth', 'valid'])

# Label fixed: this line reports the shapes *after* the reduction.
print('Shape after. 2015:', df_gda15_generic.shape, '2016:', df_gda16_generic.shape)

Shape before. 2015: (54267, 16) 2016: (85898, 19)
Shape before. 2015: (54267, 16) 2016: (85898, 18)


### Datasets concatenation

In [59]:
# Stack the 2015 and 2016 detail frames into one frame with a fresh 0..n-1 index,
# then print its column/dtype summary.
df_gda = pd.concat([df_gda15, df_gda16], ignore_index=True)
df_gda.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61482577 entries, 0 to 61482576
Data columns (total 7 columns):
 #   Column     Dtype  
---  ------     -----  
 0   tripid     object 
 1   latitude   float64
 2   longitude  float64
 3   timestamp  float64
 4   stop       float64
 5   distance   float64
 6   duration   float64
dtypes: float64(6), object(1)
memory usage: 3.2+ GB


In [60]:
# Stack the 2015 and 2016 generic frames with a fresh 0..n-1 index.
# Only the 2016 frame carries 'typeofbike' and 'typeoftrip', so those columns
# are left empty for the 2015 rows.
df_gda_generic = pd.concat([df_gda15_generic, df_gda16_generic], ignore_index=True)
df_gda_generic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140165 entries, 0 to 140164
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tripid         140165 non-null  object 
 1   speedmin       140165 non-null  float64
 2   speedmax       140165 non-null  float64
 3   speedmean      140165 non-null  float64
 4   speedavg_real  140165 non-null  float64
 5   distance       140165 non-null  float64
 6   startts        140165 non-null  float64
 7   endts          140165 non-null  float64
 8   tracktype      140165 non-null  object 
 9   source         140163 non-null  object 
 10  profession     104672 non-null  object 
 11  male           120235 non-null  object 
 12  frequentuser   140165 non-null  bool   
 13  zip            81481 non-null   object 
 14  yearofbirth    118106 non-null  object 
 15  valid          140165 non-null  bool   
 16  typeofbike     42057 non-null   object 
 17  typeoftrip     42022 non-null

### Saving operations

In [61]:
# Write both prepared frames as ';'-separated CSV files without the index column.
# File names are derived from SHORT_NAME, so this cell must run before the next
# dataset section reassigns SHORT_NAME.
df_gda.to_csv(f'{DATA_AFTER_PREPARATION_DIR}/{SHORT_NAME}.csv', index=False, sep=';')
df_gda_generic.to_csv(f'{DATA_AFTER_PREPARATION_DIR}/{SHORT_NAME}_generic.csv', index=False, sep=';')

# SE Södertälje

In [62]:
from tqdm import tqdm
from _lib.settings import DATA_ORIGIN_SODERTALIE_DIR


# Dataset prefix: prepended to every tripid and used in the output file names below.
SHORT_NAME = 'sod'

In [63]:
# Load the Sodertalje detail CSV and normalise it for the shared pipeline.
df_sod = pd.read_csv(f'{DATA_ORIGIN_SODERTALIE_DIR}/sodertalje_detail.csv')

print('Shape before: ', df_sod.shape)

''' Column names normalization '''
df_sod.columns = [cname.replace(' ', '').lower() for cname in df_sod.columns]

''' Column data normalization '''
# Series.replace(' ', '') only swaps cells whose whole value is ' '; the .str
# accessor is required to strip spaces *inside* each id.
df_sod['tripid'] = SHORT_NAME + df_sod['tripid'].astype(str).str.replace(' ', '', regex=False)

# Parse each timestamp as an epoch value and round it to whole seconds.
# progress_apply is what tqdm.pandas() registers; a plain apply would leave the
# progress bar set up on the previous line unused.
tqdm.pandas(desc='timestamp')
df_sod['timestamp'] = df_sod['timestamp'].progress_apply(lambda x: round(datetime.fromtimestamp(float(x)).timestamp()))

# Source columns not carried forward; 'distance' is dropped before df_calc_basic runs.
df_sod.drop(['altitude', 'distance', 'speed', 'type'], axis=1, inplace=True)

df_sod = df_sod.astype({'latitude': 'float', 'longitude': 'float'})

df_sod = remove_substandard_trips(df_sod)
df_sod = df_calc_basic(df_sod)

print('Shape after: ', df_sod.shape)

Shape before:  (594498, 8)
Removed 0 substandard trips.


distance: 100%|██████████| 594498/594498 [00:02<00:00, 263192.40it/s]
duration: 100%|██████████| 594498/594498 [00:01<00:00, 356175.06it/s]
start: 100%|██████████| 455045/455045 [00:01<00:00, 405622.50it/s]
end: 100%|██████████| 455045/455045 [00:01<00:00, 398923.38it/s]
stop: 100%|██████████| 455045/455045 [00:01<00:00, 344268.32it/s]

Shape after:  (455045, 10)





In [64]:
# Derive the generic frame from the detail frame via calc_context.
df_sod_generic = calc_context(df_sod)

print('Shape before: ', df_sod_generic.shape)

# Rows count as duplicates when every column except the start/end timestamps
# matches; only the first occurrence is kept.
df_sod_generic = df_sod_generic.drop_duplicates(
    subset=[cname for cname in df_sod_generic.columns if cname not in ('startts', 'endts')],
    keep='first',
)

# Turn the current index into a regular column and renumber the rows from 0.
df_sod_generic = df_sod_generic.reset_index()

print('Shape after: ', df_sod_generic.shape)

Shape before:  (1487, 7)
Shape after:  (1466, 8)


### Removing overall columns & records

In [65]:
''' DETAIL '''

# Shape before the final reduction, for comparison with the print below.
print('Shape before:', df_sod.shape)

# Reduce the detail frame with the help of the generic (per-trip) frame.
# The recorded output shows fewer rows and only 7 columns afterwards.
# NOTE(review): presumably keeps only rows whose trip survives in df_sod_generic
# and drops helper columns — confirm in _lib.data_preparation.
df_sod = get_df_detail_final(df_sod, df_sod_generic)

print('Shape after:', df_sod.shape)

Shape before: (455045, 10)
Shape after: (452767, 7)


In [66]:
''' GENERIC '''

# Shape before the final reduction, for comparison with the print below.
print('Shape before:', df_sod_generic.shape)

# Finalise the generic frame. The second argument is an empty list here;
# NOTE(review): it appears to be a list of extra column names to keep
# (other datasets pass survey columns) — confirm in _lib.data_preparation.
df_sod_generic = get_df_generic_final(df_sod_generic, [])

print('Shape after:', df_sod_generic.shape)

Shape before: (1466, 8)
Shape after: (1466, 8)


In [67]:
# Sanity check of the final detail frame: column list, non-null counts and dtypes.
df_sod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 452767 entries, 0 to 455044
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   tripid     452767 non-null  object 
 1   latitude   452767 non-null  float64
 2   longitude  452767 non-null  float64
 3   timestamp  452767 non-null  int64  
 4   stop       452767 non-null  float64
 5   distance   452767 non-null  float64
 6   duration   452767 non-null  float64
dtypes: float64(5), int64(1), object(1)
memory usage: 27.6+ MB


In [68]:
# Sanity check of the final generic frame: column list, non-null counts and dtypes.
df_sod_generic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1466 entries, 0 to 1465
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tripid         1466 non-null   object 
 1   speedmin       1466 non-null   float64
 2   speedmax       1466 non-null   float64
 3   speedmean      1466 non-null   float64
 4   speedavg_real  1466 non-null   float64
 5   distance       1466 non-null   float64
 6   startts        1466 non-null   int64  
 7   endts          1466 non-null   int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 91.8+ KB


### Saving operations

In [69]:
# Write both prepared frames as ';'-separated CSV files without the index column.
# File names are derived from SHORT_NAME, which must still hold this dataset's prefix.
df_sod.to_csv(f'{DATA_AFTER_PREPARATION_DIR}/{SHORT_NAME}.csv', index=False, sep=';')
df_sod_generic.to_csv(f'{DATA_AFTER_PREPARATION_DIR}/{SHORT_NAME}_generic.csv', index=False, sep=';')