In [1]:
import os
import pandas as pd
import gensim

from _lib.settings import DATA_W2V_VECTORS_DIR, DATA_W2V_KEYED_VECTORS_DIR, DATA_TRIPS_AS_HEXES_DIR, DATA_FLAIR_TRIPS_DIR, DATA_FLAIR_CORPUS_DIR
from _lib.settings import SELECTED_RESOLUTIONS
from _lib.helper import get_file_paths
from _lib.w2v import csv2kv

# Converting trained H3 vectors into KeyedVectors (Flair) format

In [2]:
# Run csv2kv on every file that get_file_paths finds under DATA_W2V_VECTORS_DIR.
# DATA_W2V_KEYED_VECTORS_DIR is passed as the second argument
# (presumably the output directory — confirm in _lib.w2v).
for path2vectors_file in get_file_paths(DATA_W2V_VECTORS_DIR):
    print("Working on:", path2vectors_file)
    csv2kv(path2vectors_file, DATA_W2V_KEYED_VECTORS_DIR)

Working on: /media/yyeliseyenka/Zalman Data/lvrobi-gityy/data/word2vec/vectors/10.csv
Working on: /media/yyeliseyenka/Zalman Data/lvrobi-gityy/data/word2vec/vectors/10_g.csv
Working on: /media/yyeliseyenka/Zalman Data/lvrobi-gityy/data/word2vec/vectors/8.csv
Working on: /media/yyeliseyenka/Zalman Data/lvrobi-gityy/data/word2vec/vectors/8_g.csv
Working on: /media/yyeliseyenka/Zalman Data/lvrobi-gityy/data/word2vec/vectors/9.csv
Working on: /media/yyeliseyenka/Zalman Data/lvrobi-gityy/data/word2vec/vectors/9_g.csv


# Removing trips that contain hexes without embeddings

In [2]:
# For every resolution: keep only the trips whose every hex row has a matching
# entry in the trained vectors file, then write one row per surviving trip as
# "tripid;space-separated hex ids".
for resolution in SELECTED_RESOLUTIONS:
    print(f"Working on {resolution} resolution")
    df_trips = pd.read_csv(f'{DATA_TRIPS_AS_HEXES_DIR}/{resolution}.csv', sep=';', usecols=[0, 1])
    # Number of hex rows per trip before any filtering.
    df_trips_count = df_trips.groupby('tripid').size().reset_index(name='count')

    # The first column of the vectors file holds the hex ids that received a vector.
    df_hexes = (
        pd.read_csv(f'{DATA_W2V_VECTORS_DIR}/{resolution}.csv', sep=' ', header=None, usecols=[0])
        .rename(columns={0: 'hexid'})
    )

    # Number of hex rows per trip that do have a vector (inner merge drops the rest).
    df_merged_count = (
        pd.merge(df_trips, df_hexes, on='hexid')
        .groupby('tripid')
        .size()
        .reset_index(name='count')
    )

    # Merging on (tripid, count) keeps exactly the trips where both counts agree,
    # i.e. the trips that lost no hex in the merge above.
    df_trips_features = pd.merge(df_merged_count, df_trips_count, on=['tripid', 'count'])[['tripid']]

    df_merged = pd.merge(df_trips, df_trips_features, on='tripid')

    # Aggregate once per trip instead of broadcasting the joined string to every
    # row and de-duplicating afterwards; sort=False keeps first-appearance order.
    df_merged = (
        df_merged.groupby('tripid', sort=False)['hexid']
        .agg(' '.join)
        .reset_index(name='hexes')
    )

    df_merged.to_csv(f'{DATA_FLAIR_TRIPS_DIR}/{resolution}.csv', sep=';', index=False)

Working on 8 resolution
Working on 9 resolution
Working on 10 resolution


# Creating corpus

## Joining generic data

In [8]:
from _lib.settings import DATA_AFTER_PREPARATION_DIR

# Stack every prepared "generic" CSV into a single frame and persist it.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated frame
# on every iteration; collecting the frames and concatenating once avoids both.
# The comprehension variable also stays local, so it cannot clobber the `path`
# name that is imported from `os` further down in this notebook.
generic_frames = [
    pd.read_csv(generic_csv, sep=';')
    for generic_csv in get_file_paths(DATA_AFTER_PREPARATION_DIR, includes=['generic'])
]

# pd.concat raises on an empty list; fall back to the empty frame the loop used to yield.
df_generic = pd.concat(generic_frames, ignore_index=True) if generic_frames else pd.DataFrame()

df_generic.to_csv(f"{DATA_FLAIR_TRIPS_DIR}/generic.csv", sep=';', index=False)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [10]:
# Total number of rows in the joined generic table.
len(df_generic)

261145

In [58]:
# Number of rows that carry a 'typeoftrip' label.
int(df_generic['typeoftrip'].notna().sum())

63696

In [56]:
# Row count per raw 'typeoftrip' label; rows without a label form no group.
# Hyphenated and unhyphenated spellings of the same label are counted separately here.
df_generic.groupby('typeoftrip').size()

typeoftrip
home-to-school       85
home-to-work        617
hometoschool       3001
hometowork        29515
leisure           21386
other              9092
dtype: int64

In [54]:
# Number of rows with a known 'yearofbirth'.
int(df_generic['yearofbirth'].notna().sum())

194313

In [55]:
# Number of rows with a known 'profession'.
int(df_generic['profession'].notna().sum())

173390

## Corpus

In [13]:
from sklearn.model_selection import train_test_split

from os import path

In [23]:
def read_generic_dataframe(path=f"{DATA_FLAIR_TRIPS_DIR}/generic.csv", maxspeed=60, minspeed=2, maxtime=300, mintime=1):
    """Load the joined generic trip table and keep only plausible trips.

    Args:
        path: ';'-separated CSV containing at least the columns 'speedavg_real',
            'startts', 'endts', 'typeoftrip' and 'tripid'.
        maxspeed: Inclusive upper bound on 'speedavg_real'.
        minspeed: Inclusive lower bound on 'speedavg_real'.
        maxtime: Inclusive upper bound on the derived 'ttime' column.
        mintime: Inclusive lower bound on the derived 'ttime' column.

    Returns:
        A new DataFrame with the rows that pass both filters and with:
        - 'ttime': (endts - startts) / 60 rounded to one decimal
          (minutes if the timestamps are in seconds — TODO confirm units);
        - 'typeoftrip': non-alphabetic characters removed from non-null labels;
        - 'city': the first three characters of 'tripid'.
    """
    df_generic = pd.read_csv(path, sep=';')

    # .copy() makes the filtered frame independent of the one it was sliced from,
    # so the column assignments below do not raise SettingWithCopyWarning.
    speed_ok = df_generic['speedavg_real'].between(minspeed, maxspeed)
    df_generic = df_generic[speed_ok].copy()

    df_generic['ttime'] = ((df_generic['endts'] - df_generic['startts']) / 60).round(1)
    df_generic = df_generic[df_generic['ttime'].between(mintime, maxtime)].copy()

    # Only non-null labels are cleaned; index alignment on assignment leaves the rest as NaN.
    has_label = df_generic['typeoftrip'].notna()
    df_generic['typeoftrip'] = df_generic.loc[has_label, 'typeoftrip'].apply(
        lambda label: ''.join(filter(str.isalpha, label))
    )
    df_generic['city'] = df_generic['tripid'].str[:3]

    return df_generic


def read_trips_dataframe(path, min_len=1):
    """Read a ';'-separated trips CSV and drop trips that are too short.

    The 'hexes' column is split on whitespace; only rows with strictly more
    than `min_len` tokens are returned. The original row index is preserved.
    """
    trips = pd.read_csv(path, sep=';')
    hexes_per_trip = trips['hexes'].str.split().str.len()
    is_long_enough = hexes_per_trip > min_len
    return trips[is_long_enough]


def split_into_corpus(df_x, df_Y, df_stratify, slice=0, random_state=8):
    """Split features and labels into stratified train / test / dev frames.

    Args:
        df_x: Feature frame.
        df_Y: Label frame sharing its index with `df_x`.
        df_stratify: Frame sharing the same index whose columns drive stratification.
        slice: When > 0, the data is first subsampled by passing this value as
            `train_size` to `train_test_split`; 0 keeps everything.
        random_state: Seed forwarded to every `train_test_split` call.

    Returns:
        Tuple `(train, test, dev)`, each being the features joined with the labels.
        Of the (possibly subsampled) data, test takes 20%, dev takes 25% of the
        remaining 80%, and train receives the rest.
    """
    if slice > 0:
        # Only the sampled part is used; the complement is discarded.
        df_x, _, df_Y, _ = train_test_split(
            df_x, df_Y, train_size=slice, random_state=random_state, stratify=df_stratify
        )

    # The indexes hold labels, not positions, so the stratification rows are looked
    # up with .loc. With .iloc this only worked for a default RangeIndex.
    x_test, x_train, Y_test, Y_train = train_test_split(
        df_x, df_Y, train_size=0.2, random_state=random_state,
        stratify=df_stratify.loc[df_Y.index],
    )

    x_dev, x_train, Y_dev, Y_train = train_test_split(
        x_train, Y_train, train_size=0.25, random_state=random_state,
        stratify=df_stratify.loc[Y_train.index],
    )

    return x_train.join(Y_train), x_test.join(Y_test), x_dev.join(Y_dev)


def create_corpus(resolutions, column_name, slice):
    """Write train/test/dev CSVs for one target column at each resolution.

    Args:
        resolutions: Iterable of resolution identifiers; each must have a
            `{DATA_FLAIR_TRIPS_DIR}/{resolution}.csv` trips file.
        column_name: Column of the generic table used as the label.
        slice: Forwarded to `split_into_corpus`; when > 0 it is also encoded
            in the output directory name as a percentage suffix.

    Side effects:
        Prints the merged row count per resolution and writes `train.csv`,
        `test.csv` and `dev.csv` into
        `{DATA_FLAIR_CORPUS_DIR}/{resolution}_{column_name}[_{percent}]`.
    """
    # The generic table does not depend on the resolution: load and filter it once.
    df_generic = read_generic_dataframe()
    df_labels = df_generic.loc[df_generic[column_name].notna(), ['tripid', column_name]]

    for resolution in resolutions:
        df_trips = read_trips_dataframe(f"{DATA_FLAIR_TRIPS_DIR}/{resolution}.csv")

        df = pd.merge(df_trips, df_labels, on=['tripid'])
        # City code (first three characters of the trip id) is used for stratification only.
        df['city'] = df['tripid'].str[:3]
        df = df.drop(columns=['tripid'])
        print('DataFrame size:', df.shape[0])

        df_x, df_Y, df_stratify = df[['hexes']], df[[column_name]], df[[column_name, 'city']]

        train, test, dev = split_into_corpus(df_x, df_Y, df_stratify, slice)

        # NOTE(review): int() truncates, so a slice such as 0.29 yields the suffix "_28".
        path2corpus = f"{DATA_FLAIR_CORPUS_DIR}/{resolution}_{column_name}{'_'+str(int(slice*100)) if slice > 0 else ''}"

        # Creates missing parents as well and is a no-op when the directory already exists.
        os.makedirs(path2corpus, exist_ok=True)

        train.to_csv(f'{path2corpus}/train.csv', sep=';', index=False)
        test.to_csv(f'{path2corpus}/test.csv', sep=';', index=False)
        dev.to_csv(f'{path2corpus}/dev.csv', sep=';', index=False)

### typeoftrip

In [25]:
# Build the 'typeoftrip' corpus from all available rows (slice=0 skips subsampling).
create_corpus(resolutions=SELECTED_RESOLUTIONS, column_name='typeoftrip', slice=0)

  df_generic = read_generic_dataframe()


DataFrame size: 46909


  df_generic = read_generic_dataframe()


DataFrame size: 49003


  df_generic = read_generic_dataframe()


DataFrame size: 45242


### typeoftrip. SLICE = 0.2

In [24]:
# Build the 'typeoftrip' corpus from a stratified 20% subsample of the rows.
create_corpus(resolutions=SELECTED_RESOLUTIONS, column_name='typeoftrip', slice=0.2)

  df_generic = read_generic_dataframe()


DataFrame size: 46909


  df_generic = read_generic_dataframe()


DataFrame size: 49003


  df_generic = read_generic_dataframe()


DataFrame size: 45242
