In [7]:
import pandas as pd
import requests
import json
import os
import re
import datetime
from math import sin, cos, sqrt, atan2, radians
from copy import deepcopy

# Project root, one literal per platform; every other path below is derived from it.
root_path_windows = r'C:\Users\Yan\Documents\Projects\ETNA\Mappy\group-768539'
root_path_linux = '/home/yanis.bouzidi/ETNA-Projects/Mappy Geolife transport modal/group-768539'

# GeoLife raw data folder
geolife_data_path_windows = root_path_windows + '\\Data'
geolife_data_path_linux = root_path_linux + '/Data'

# Pickles folder
pickles_path_windows = root_path_windows + '\\Pickles'
pickles_path_linux = root_path_linux + '/Pickles'


def files_paths(root_folder_path, files_extensions, files_paths_list_available=None):
    """Recursively collect the files under a folder whose name ends with the given extension(s).

    Args:
        root_folder_path: folder to walk (made absolute first).
        files_extensions: a single extension as a string, or a list of extensions.
            The leading dot is optional ('txt' and '.txt' are equivalent).
            A list passed by the caller is left untouched.
        files_paths_list_available: optional list. Only used when a single
            extension string is given: the list of matches is then appended to
            it (as one nested list) instead of being returned directly.

    Returns:
        * a list of paths when a single extension string is given and
          `files_paths_list_available` is None;
        * otherwise a dict mapping each dot-prefixed extension to its list of paths.
    """
    root_folder_path = os.path.abspath(root_folder_path)

    single_extension = not isinstance(files_extensions, list)
    requested = [files_extensions] if single_extension else files_extensions
    # Build a NEW normalised list so the caller's list is never modified;
    # startswith() also copes with an empty extension string.
    extensions = [ext if ext.startswith('.') else '.' + ext for ext in requested]

    files_paths_list = {extension: [] for extension in extensions}
    for dirpath, _subdirs, files in os.walk(root_folder_path):
        for file in files:
            # Iterating the dict keys visits each distinct extension once.
            for extension in files_paths_list:
                if file.endswith(extension):
                    files_paths_list[extension].append(os.path.join(dirpath, file))

    if single_extension:
        matches = files_paths_list[extensions[0]]
        if files_paths_list_available is None:
            return matches
        files_paths_list_available.append(matches)
    return files_paths_list


def date_format(date_string):
    """Pick the strptime pattern matching a GeoLife date-time string.

    Two layouts occur in the files: the date part is separated either by
    slashes or by dashes. Any string containing a '/' is treated as the
    slash layout; everything else gets the dash layout.
    """
    slash_pattern = '%Y/%m/%d %H:%M:%S'
    dash_pattern = '%Y-%m-%d %H:%M:%S'
    return slash_pattern if '/' in date_string else dash_pattern


def get_user_number(file_path, search_in_file=False):
    """Extract the three-digit GeoLife user number from a path.

    Users are identified by a folder (or file) named with three digits.

    Args:
        file_path: path using '/' or '\\' separators.
        search_in_file: when False (default) the digits must form a complete
            path component enclosed by separators (".../010/..."). Otherwise
            only a leading separator is required, so a file name such as
            ".../010.pkl" matches too.

    Returns:
        The three digits as a string, or None when nothing matches.
        '/'-separated matches are tried before '\\'-separated ones.
    """
    # Raw strings: '\d' inside a normal string literal is an invalid escape sequence.
    if search_in_file is False:
        patterns = (r'/(\d{3})/', r'\\(\d{3})\\')
    else:
        patterns = (r'/(\d{3})', r'\\(\d{3})')
    for pattern in patterns:
        found = re.search(pattern, file_path)
        if found:
            return found.group(1)
    return None


def get_dir_path_from_label_path(label_path):
    """Strip 'labels.txt' from a path, leaving the folder that contains it.

    Returns None when the path does not contain 'labels.txt' at all.
    """
    marker = 'labels.txt'
    if marker not in label_path:
        return None
    return label_path.replace(marker, '')


def load_from_plt(data_folder_path):
    """Read every labelled user's .plt trajectories and labels.txt into DataFrames.

    Only folders that contain a 'labels.txt' and whose path reveals a user
    number are loaded. Progress is printed once per user.

    Returns:
        {'data': {user: [DataFrame, ...]}, 'labels': {user: DataFrame}}
        where each user's trajectory frames follow the sorted .plt paths.
    """
    # user number -> folder holding that user's labels.txt
    user_folders = {}
    for text_file in files_paths(data_folder_path, '.txt'):
        user = get_user_number(text_file)
        folder = get_dir_path_from_label_path(text_file)
        if folder is not None and user is not None:
            user_folders[user] = folder

    trajectories_by_user = {
        user: sorted(files_paths(folder, '.plt')) for user, folder in user_folders.items()
    }

    column_names = ["latitude", "longitude", "other", "altitude", "timestamp", "date", "time"]
    data_frames = {}
    user_count = len(trajectories_by_user)
    for done, (user, plt_paths) in enumerate(trajectories_by_user.items()):
        print(str(round((done / user_count) * 100, 2)) + '%')
        # The first 6 lines of each .plt file are skipped; the rest is header-less CSV.
        data_frames[user] = [
            pd.read_csv(plt_path, skiprows=6, header=None, names=column_names)
            for plt_path in plt_paths
        ]

    data_frames_labels = {
        user: pd.read_csv(folder + 'labels.txt', delimiter='\t')
        for user, folder in user_folders.items()
    }
    print('100%')
    return {'data': data_frames, 'labels': data_frames_labels}


def add_user_number_to_data_frame(data_frames):
    """Return a deep copy of `data_frames` where every frame has a 'user' column.

    `data_frames` maps a user number to a list of DataFrames; each frame of a
    user gets that user's key as the value of its 'user' column. The input is
    not modified.
    """
    tagged = deepcopy(data_frames)
    for user, frames in tagged.items():
        for frame in frames:
            frame['user'] = user
    return tagged


def add_transport_modes_to_data_frame(data_frames, data_frames_labels):
    """Return a deep copy of `data_frames` with a 'transport' column filled from the labels.

    Args:
        data_frames: dict mapping a user number to a list of trajectory
            DataFrames that have 'date' and 'time' string columns.
        data_frames_labels: dict mapping a user number to that user's labels
            DataFrame. Its columns are read by position: 0 = interval start,
            1 = interval end, 2 = transport mode.

    Returns:
        The deep copy. For every user present in `data_frames_labels`, each
        row whose date-time lies inside a label interval (bounds included)
        gets that label's mode; all other rows keep the string 'None'.
        Users that have no entry in `data_frames_labels` are returned without
        a 'transport' column.

    Raises:
        KeyError: if a user of `data_frames_labels` is missing from `data_frames`.

    Progress is printed as a percentage while running.
    """
    data_frames_copy = deepcopy(data_frames)
    # Denominator of the progress percentage (+1 because k starts at 1).
    length = number_of_data_frames_recursively(data_frames_copy) + 1
    # k variable is only for displaying % on process done
    k = 1
    for key__user_number in data_frames_labels:
        # i is the cursor into this user's labels. It is reset per user, not per
        # file, and only ever moves forward.
        # NOTE(review): this presumably relies on both the rows and the labels
        # being in chronological order - confirm.
        i = 0
        j = len(data_frames_labels[key__user_number].index)
        print(str(round((k / length) * 100, 2)) + '%')
        for x in range(len(data_frames_copy[key__user_number])):
            # Default for rows that match no label interval.
            data_frames_copy[key__user_number][x]['transport'] = 'None'
            for index, row in data_frames_copy[key__user_number][x].iterrows():
                if k % 5000 == 0:
                    print(str(round((k / length) * 100, 2)) + '%')
                date_string_row = row['date'] + ' ' + row['time']
                while i < j:
                    date_string_label_start = data_frames_labels[key__user_number].iat[i, 0]
                    date_string_label_end = data_frames_labels[key__user_number].iat[i, 1]

                    row_date = datetime.datetime.strptime(date_string_row, date_format(date_string_row))
                    label_date_start = datetime.datetime.strptime(date_string_label_start,
                                                                  date_format(date_string_label_start))
                    label_date_end = datetime.datetime.strptime(date_string_label_end,
                                                                date_format(date_string_label_end))

                    if label_date_start <= row_date <= label_date_end:
                        # Row is inside the current label: copy its mode and keep the
                        # cursor here, the next row may belong to the same label.
                        data_frames_copy[key__user_number][x].at[index, 'transport'] = data_frames_labels[key__user_number].iat[i, 2]
                        break
                    elif label_date_end < row_date:
                        # Label ended before this row: try the next label.
                        i = i + 1
                    else:
                        # Row is earlier than the current label: it stays 'None'.
                        break
                k = k + 1
            k = k + 1
        k = k + 1
    print('100%')
    return data_frames_copy


def number_of_data_frames_recursively(data_frames, length=0):
    """Count the size of a nested structure of lists, dicts and DataFrames.

    Every list or dict contributes its number of entries, every DataFrame its
    number of rows, and anything else contributes nothing. `length` is the
    running total the result is added to.
    """
    if isinstance(data_frames, pd.DataFrame):
        return length + len(data_frames.index)
    if isinstance(data_frames, list):
        children = data_frames
    elif isinstance(data_frames, dict):
        children = data_frames.values()
    else:
        return length
    total = length + len(data_frames)
    for child in children:
        total = number_of_data_frames_recursively(child, total)
    return total


def save_data_frames_as_pickles(path_to_save_to, data_frames_=None, data_frames_labels_=None):
    """Write trajectory and label frames as pickles under `path_to_save_to`.

    Layout produced:
        data_frames/<user>/<position>.pkl   one file per trajectory frame
        labels/<user>.pkl                   one file per user

    Either argument may be None, in which case that part is skipped.
    Folders are only created when there is something to write into them.
    """
    if data_frames_ is not None:
        for user, frames in data_frames_.items():
            user_folder = os.path.join(path_to_save_to, "data_frames", user)
            for position, frame in enumerate(frames):
                os.makedirs(user_folder, exist_ok=True)
                frame.to_pickle(os.path.join(user_folder, str(position) + ".pkl"))
    if data_frames_labels_ is not None:
        labels_folder = os.path.join(path_to_save_to, "labels")
        for user, labels in data_frames_labels_.items():
            os.makedirs(labels_folder, exist_ok=True)
            labels.to_pickle(os.path.join(labels_folder, user + ".pkl"))


def load_all_pickles(pickles_folder_path):
    """Load the pickles written by `save_data_frames_as_pickles`.

    Args:
        pickles_folder_path: folder containing the 'data_frames' and 'labels'
            sub-folders.

    Returns:
        {'data_frames': {user: [DataFrame, ...]}, 'labels': {user: DataFrame}}

    Each user's frames are returned in the numeric order of their file names
    (0.pkl, 1.pkl, 2.pkl, ... 10.pkl), i.e. the order they were saved in.
    Directory listings come back in arbitrary order, so without sorting the
    trajectories of a user could be shuffled.

    NOTE: unpickling executes code from the file - only load pickles you created.
    """
    def _saved_order(path):
        # Group by folder (one per user), then numeric stems in numeric order,
        # then any non-numeric names alphabetically.
        folder, file_name = os.path.split(path)
        stem = os.path.splitext(file_name)[0]
        if stem.isdecimal():
            return (folder, 0, int(stem), '')
        return (folder, 1, 0, stem)

    data_frames_ = {}
    data_frames_labels_ = {}
    data_frames_pickles_paths = sorted(
        files_paths(os.path.join(pickles_folder_path, "data_frames"), '.pkl'), key=_saved_order)
    labels_data_frames_pickles_paths = sorted(
        files_paths(os.path.join(pickles_folder_path, 'labels'), '.pkl'))
    for path in data_frames_pickles_paths:
        user_number_ = get_user_number(path)
        data_frames_.setdefault(user_number_, []).append(pd.read_pickle(path))
    for path in labels_data_frames_pickles_paths:
        # Label pickles are named <user>.pkl, so the number is in the file name.
        user_number_ = get_user_number(path, True)
        data_frames_labels_[user_number_] = pd.read_pickle(path)
    return {
        "data_frames": data_frames_,
        'labels': data_frames_labels_
    }


def save_all_as_csv(path_to_save_to, data_frames_=None, data_frames_labels_=None):
    """Write trajectory and label frames as CSV files (input of the ETL).

    Layout produced:
        data_frames/<user>/<position>.csv   one file per trajectory frame
        labels/<user>.csv                   one file per user

    Either argument may be None, in which case that part is skipped. The
    'labels' folder is created as soon as a labels dict is given, even an
    empty one.
    """
    if data_frames_ is not None:
        for user, frames in data_frames_.items():
            user_folder = os.path.join(path_to_save_to, "data_frames", user)
            for position, frame in enumerate(frames):
                os.makedirs(user_folder, exist_ok=True)
                frame.to_csv(os.path.join(user_folder, str(position) + ".csv"))
    if data_frames_labels_ is not None:
        labels_folder = os.path.join(path_to_save_to, "labels")
        os.makedirs(labels_folder, exist_ok=True)
        for user, labels in data_frames_labels_.items():
            labels.to_csv(os.path.join(labels_folder, user + ".csv"))


def save_decomposed(path_to_save_to, data_frames_):
    """Pickle a sequence of frames as decomposed/<position>.pkl under `path_to_save_to`.

    The 'decomposed' folder is only created when there is at least one frame.
    """
    target_folder = os.path.join(path_to_save_to, "decomposed")
    for position, frame in enumerate(data_frames_):
        os.makedirs(target_folder, exist_ok=True)
        frame.to_pickle(os.path.join(target_folder, str(position) + ".pkl"))


def load_decomposed(path_to_pickles):
    """Load the frames stored under <path_to_pickles>/decomposed.

    Files are read in the numeric order of their names (0.pkl, 1.pkl, ...),
    so a non-numeric .pkl file name in that folder raises ValueError.
    """
    def _file_number(pickle_path):
        return int(os.path.splitext(os.path.basename(pickle_path))[0])

    pickle_paths = files_paths(os.path.join(path_to_pickles, "decomposed"), '.pkl')
    return [pd.read_pickle(pickle_path) for pickle_path in sorted(pickle_paths, key=_file_number)]


def merge_data_frames_one_level(data_frame):
    """Concatenate one nesting level of frames; the input itself is left untouched.

    * a list (or any non-dict iterable) of frames -> one frame
    * a dict whose values are all lists of frames -> dict with each list
      concatenated into one frame
    * a dict holding a non-list value -> as soon as such a value is reached,
      all the dict's values are concatenated into one frame and returned

    Concatenated frames get a fresh 0..n-1 index.
    """
    working = deepcopy(data_frame)
    if not isinstance(working, dict):
        return pd.concat(working).reset_index(drop=True)
    for name, value in working.items():
        if not isinstance(value, list):
            return pd.concat(list(working.values())).reset_index(drop=True)
        working[name] = pd.concat(value).reset_index(drop=True)
    return working


def merge_all_data_frames_to_one(data_frames):
    """Flatten a dict of frames, or of lists of frames, into one single frame.

    Each list value is first concatenated into one frame, then all the dict's
    values are concatenated together. The input is not modified.
    """
    flattened = deepcopy(data_frames)
    for name in flattened:
        value = flattened[name]
        if isinstance(value, list):
            flattened[name] = merge_data_frames_one_level(value)
    return merge_data_frames_one_level(flattened)


def decompose_by_travel(data_frame, max_gap_seconds=600):
    """Split a merged frame into one frame per travel.

    A new travel starts whenever two consecutive rows are more than
    `max_gap_seconds` apart or belong to different users - the same rule
    `add_travel_number` applies.

    Args:
        data_frame: frame with 'date', 'time' and 'user' columns, rows in
            travel order.
        max_gap_seconds: largest pause, in seconds, still considered part of
            the same travel (default 600).

    Returns:
        List of slices of `data_frame`, one per travel, in order. The slices
        keep the original index labels. An empty frame yields an empty list.

    Changes compared with the previous implementation:
      * the final travel is now included (it used to be dropped because a
        slice was only emitted when the NEXT travel started);
      * the user is read from the 'user' column instead of column position 8,
        which is 'transport' when 'user' is added before 'transport'.
    """
    travels = []
    row_count = len(data_frame)
    if row_count == 0:
        print('100%')
        return travels

    dates = data_frame['date'].tolist()
    times = data_frame['time'].tolist()
    users = data_frame['user'].tolist()

    travel_start = 0
    previous_moment = None
    for position in range(row_count):
        if position % 5000 == 0:  # log percent
            print(str(round(((position / row_count) * 100), 2)) + '%')

        stamp = dates[position] + ' ' + times[position]
        moment = datetime.datetime.strptime(stamp, date_format(stamp))
        if position > 0:
            gap = (moment - previous_moment).total_seconds()
            if gap > max_gap_seconds or users[position - 1] != users[position]:
                travels.append(data_frame.iloc[travel_start:position])
                travel_start = position
        previous_moment = moment

    # Whatever follows the last break is a travel too.
    travels.append(data_frame.iloc[travel_start:])
    print('100%')
    return travels


def add_travel_number(data_frames):
    """Return a copy of the frame with a 'travel_number' column.

    Rows are numbered from 0; the number increases each time two consecutive
    rows are more than 600 seconds apart or have different 'user' values.
    Date and time are read from column positions 5 and 6. Progress is printed
    every 5000 rows.
    """
    numbered = deepcopy(data_frames)
    numbered['travel_number'] = "None"
    current_travel = 0
    row_total = len(numbered)  # for logs percent
    for index, _row in numbered.iterrows():
        if index % 5000 == 0:  # log percent
            print(str(round(((index / row_total) * 100), 2)) + '%')
        position = int(index)
        if position <= 0:
            numbered.at[index, 'travel_number'] = 0
            continue

        previous_stamp = numbered.iat[position - 1, 5] + ' ' + numbered.iat[index - 1, 6]
        current_stamp = numbered.iat[position, 5] + ' ' + numbered.iat[index, 6]
        previous_moment = datetime.datetime.strptime(previous_stamp, date_format(previous_stamp))
        current_moment = datetime.datetime.strptime(current_stamp, date_format(current_stamp))

        previous_user = numbered.at[index - 1, 'user']
        current_user = numbered.at[index, 'user']
        long_pause = (current_moment - previous_moment).total_seconds() > 600
        if long_pause or previous_user != current_user:
            current_travel = current_travel + 1
        numbered.at[index, 'travel_number'] = current_travel
    print('100%')
    return numbered


def load_and_concatenate(path_to_pickles, only_with_transports=False, with_labels=False):
    """Load all saved pickles and merge the trajectory frames into one frame.

    Args:
        path_to_pickles: folder handed to `load_all_pickles`.
        only_with_transports: when True, rows whose 'transport' is the string
            "None" are removed and the index is rebuilt.
        with_labels: when True, the merged labels are returned as well.

    Returns:
        The merged frame, or {'data_frames': merged frame, 'labels': merged
        labels} when `with_labels` is True.
    """
    loaded = load_all_pickles(path_to_pickles)
    data_frames = merge_all_data_frames_to_one(loaded['data_frames'])
    if only_with_transports is True:
        has_transport = data_frames['transport'] != "None"
        data_frames = data_frames.loc[has_transport].reset_index(drop=True)
    if with_labels is not True:
        return data_frames
    return {
        'data_frames': data_frames,
        'labels': merge_all_data_frames_to_one(loaded['labels'])
    }


def calculate_distance_between_two_coordinates(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points given in decimal degrees.

    Uses the haversine formula with a sphere radius of 6373.0 (an approximate
    Earth radius in kilometres, so the result is presumably in kilometres).
    """
    sphere_radius = 6373.0
    phi_1, lambda_1, phi_2, lambda_2 = (radians(angle) for angle in (lat1, lon1, lat2, lon2))

    delta_lambda = lambda_2 - lambda_1
    delta_phi = phi_2 - phi_1

    haversine = sin(delta_phi / 2) ** 2 + cos(phi_1) * cos(phi_2) * sin(delta_lambda / 2) ** 2
    central_angle = 2 * atan2(sqrt(haversine), sqrt(1 - haversine))
    return sphere_radius * central_angle


def difference_two_dates_and_time(date_1_string, time_1_string, date_2_string, time_2_string):
    """Absolute number of seconds between two (date, time) string pairs.

    Each pair is joined with a space and parsed with the pattern that
    `date_format` selects for it.
    """
    moments = []
    for date_part, time_part in ((date_1_string, time_1_string), (date_2_string, time_2_string)):
        stamp = date_part + ' ' + time_part
        moments.append(datetime.datetime.strptime(stamp, date_format(stamp)))
    first, second = moments
    return abs((first - second).total_seconds())


def calculate_speed_from_distance(distance, seconds):
    """Convert a distance covered in `seconds` into a per-hour speed.

    Speeds below 0.1 are reported as 0. `seconds` must not be 0
    (ZeroDivisionError otherwise).
    """
    per_hour = distance / seconds * 3600
    return 0 if per_hour < 0.1 else per_hour


def calculate_speed_from_data_frame(data_frame, index):
    """Speed between row `index - 1` and row `index` of a trajectory frame.

    Rows are addressed by position. Latitude and longitude are read from
    column positions 0 and 1, date and time from positions 5 and 6.

    When both rows carry the same timestamp no speed can be computed, so the
    previous row's speed is returned instead. That value is read from the
    'speed' column by name, so the result no longer depends on how many
    columns were added before 'speed'. Column position 9 is only used as a
    fallback when the frame has no 'speed' column.
    """
    previous = index - 1
    distance = calculate_distance_between_two_coordinates(
        data_frame.iat[previous, 0], data_frame.iat[previous, 1],
        data_frame.iat[index, 0], data_frame.iat[index, 1])
    seconds = difference_two_dates_and_time(
        data_frame.iat[previous, 5], data_frame.iat[previous, 6],
        data_frame.iat[index, 5], data_frame.iat[index, 6])

    if seconds == 0:
        if 'speed' in data_frame.columns:
            return data_frame['speed'].iat[previous]
        return data_frame.iat[previous, 9]
    return calculate_speed_from_distance(distance, seconds)


def add_speed_to_data_frame(data_frames):
    """Return a copy of the merged frame with a numeric 'speed' column.

    The speed of a row is computed from the previous row. The first row, and
    the first row of each user, get 0.0. The frame is expected to have a
    0..n-1 index and a 'user' column. Progress is printed every 5000 rows.

    The column is initialised with the float 0.0. It used to be initialised
    with the string "0", which left a string in the first row and made the
    whole column object-typed, breaking numeric comparisons downstream.
    """
    data_frames_copy = deepcopy(data_frames)
    data_frames_copy['speed'] = 0.0
    length_of_data_frame = len(data_frames_copy)  # for logs percent
    for index in data_frames_copy.index:
        if index % 5000 == 0:  # log percent
            print(str(round(((index / length_of_data_frame) * 100), 2)) + '%')

        if index == 0:
            continue
        if data_frames_copy.at[index - 1, 'user'] != data_frames_copy.at[index, 'user']:
            # First point of another user: no previous point to measure from, stays 0.0.
            continue
        # Must stay sequential: a zero-second interval reuses the previous row's speed.
        data_frames_copy.at[index, 'speed'] = calculate_speed_from_data_frame(data_frames_copy, index)
    print('100%')
    return data_frames_copy


def drop_rows_without_transport_mode(data_frame):
    """Keep only the rows whose 'transport' is not the string "None"; the index is rebuilt."""
    labelled = data_frame['transport'] != "None"
    return data_frame[labelled].reset_index(drop=True)


def save_merged_pickle(path_to_save_to, data_frames):
    """Pickle the merged frame to <path_to_save_to>/Pickles/joined/merged.pkl.

    Missing folders are created.
    """
    joined_folder = os.path.join(path_to_save_to, "Pickles", 'joined')
    # makedirs creates the intermediate "Pickles" folder as well.
    os.makedirs(joined_folder, exist_ok=True)
    data_frames.to_pickle(os.path.join(joined_folder, "merged.pkl"))


def load_merged_pickle(path_to_pickles):
    """Load the merged frame, accepting several ways of pointing at it.

    `path_to_pickles` may use '/' or '\\' separators and may be:
      * a pickle file (any name ending in '.pkl')  -> loaded as is
      * the 'joined' folder                        -> joined/merged.pkl
      * the 'Pickles' folder                       -> Pickles/joined/merged.pkl
      * anything else (e.g. the project root)      -> <path>/Pickles/joined/merged.pkl

    Paths to an arbitrary '*.pkl' file are now recognised. The previous check
    looked for the misspelt extension '.plk', so such paths were treated as a
    project root.

    NOTE: unpickling executes code from the file - only load pickles you created.
    """
    # Last path component, whichever separator style is used and with or
    # without a trailing separator.
    last_component = re.split(r'[/\\]', path_to_pickles.rstrip('/\\'))[-1]

    if last_component.endswith('.pkl'):
        return pd.read_pickle(path_to_pickles)
    if last_component == "joined":
        return pd.read_pickle(os.path.join(path_to_pickles, 'merged.pkl'))
    if last_component == "Pickles":
        return pd.read_pickle(os.path.join(path_to_pickles, 'joined', 'merged.pkl'))
    return pd.read_pickle(os.path.join(path_to_pickles, 'Pickles', "joined", 'merged.pkl'))


def request_details_from_coordinates(lat, lon, only_class=False, timeout=10):
    """Ask Nominatim (OpenStreetMap) what is located at the given coordinates.

    Args:
        lat, lon: coordinates, sent as the free-text query "<lat> <lon>".
        only_class: when True return only the 'class' field of the first hit.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The first hit (a dict), or just its 'class' when `only_class` is True.
        False when the request fails, times out, the answer is not valid
        JSON, there is no hit, or the first hit has no 'class'.
    """
    try:
        # `params` lets requests do the URL encoding; the timeout keeps one
        # stalled request from blocking a whole travel's worth of lookups.
        response = requests.get(
            "https://nominatim.openstreetmap.org/search.php",
            params={'q': str(lat) + " " + str(lon), 'format': 'json'},
            timeout=timeout,
        )
    except requests.RequestException:
        return False
    if not response.ok:
        return False
    try:
        content = response.json()
    except ValueError:
        return False
    if isinstance(content, list) and len(content) > 0 and 'class' in content[0]:
        first_hit = content[0]
        return first_hit['class'] if only_class else first_hit
    return False


def _speed_window(minimum, maximum, ideal):
    """Build one {'min', 'max', 'ideal'} speed entry (a fresh dict on every call)."""
    return {'min': minimum, 'max': maximum, 'ideal': ideal}


# Speed windows per transport mode, looked up as [class][type][mode].
# Class and type are matched against the 'class' and 'type' fields of a
# Nominatim hit; 'default' is the fallback at both levels.
# NOTE(review): speeds are presumably km/h - confirm against the speed column.
transports_taken_into_account = {
    'railway': {
        'default': {
            'train': _speed_window(30, 400, 140),
            'airplane': _speed_window(400, 1100, 800),
        },
    },
    'highway': {
        'secondary': {
            'car': _speed_window(20, 120, 70),
            'train': _speed_window(30, 400, 140),
            'bike': _speed_window(10, 50, 25),
            'subway': _speed_window(25, 70, 45),
            'airplane': _speed_window(400, 1100, 800),
        },
        'residential': {
            'car': _speed_window(15, 40, 20),
            'bike': _speed_window(5, 30, 14),
            'walk': _speed_window(0, 8, 4),
            'subway': _speed_window(25, 70, 45),
            'airplane': _speed_window(400, 1100, 800),
        },
        'tertiary': {
            'car': _speed_window(10, 40, 20),
            'train': _speed_window(30, 400, 140),
            'bike': _speed_window(10, 50, 25),
            'subway': _speed_window(25, 70, 45),
            'airplane': _speed_window(400, 1100, 800),
        },
        'default': {
            'car': _speed_window(20, 60, 40),
            'walk': _speed_window(0, 8, 4),
            'airplane': _speed_window(400, 1100, 800),
            'train': _speed_window(30, 400, 140),
            'bike': _speed_window(10, 50, 25),
        },
    },
    'amenity': {
        'default': {
            'walk': _speed_window(0, 8, 4),
            'subway': _speed_window(25, 70, 45),
            'airplane': _speed_window(400, 1100, 800),
        },
    },
    'building': {
        'default': {
            'car': _speed_window(20, 60, 40),
            'train': _speed_window(30, 280, 100),
            'bike': _speed_window(10, 30, 20),
            'walk': _speed_window(0, 8, 4),
            'subway': _speed_window(25, 70, 45),
            'airplane': _speed_window(400, 1100, 800),
        },
    },
    'default': {
        'default': {
            'car': _speed_window(30, 120, 70),
            'train': _speed_window(30, 400, 140),
            'bike': _speed_window(10, 50, 25),
            'walk': _speed_window(0, 8, 4),
            'subway': _speed_window(25, 70, 45),
            'airplane': _speed_window(400, 1100, 800),
        },
    },
}


def find_transport_for_travel(data_frame):
    """Guess the transport mode of one travel from its points.

    For every row, Nominatim is asked what is located at the point. The answer
    (class/type) and the row's speed give a score per transport mode, and the
    scores are accumulated over the whole travel.

    Args:
        data_frame: one travel. Latitude and longitude are read from column
            positions 0 and 1, the speed from the 'speed' column. Any index
            is accepted, including the original labels kept by a slice.

    Returns:
        The mode with the highest accumulated score, or None when no mode
        scored above 0.

    Note: one HTTP request is sent per row.

    Changes compared with the previous implementation:
      * values are read from the iterated row instead of `iat[index, ...]`,
        which used index labels as positions and failed on sliced travels;
      * the continuity bonus from `calculate_additional_from_last_results`
        is added to the totals (the per-point scores used to be added a
        second time instead);
      * the per-point debug print is gone.
    """
    results = {
        'car': 0,
        'train': 0,
        'bike': 0,
        'walk': 0,
        'subway': 0,
        'airplane': 0
    }
    stations = {
        'train': 0,
        'subway': 0,
        'airplane': 0
    }
    last_requests_results = []

    for _index, row in data_frame.iterrows():
        lat = row.iat[0]
        lon = row.iat[1]
        speed = row['speed']

        request_response = request_details_from_coordinates(lat, lon)
        if request_response is False:
            class_ = 'default'
            type_ = 'default'
        else:
            class_ = request_response['class']
            type_ = request_response.get('type', 'default')
        if type_ == 'station' and class_ in stations:
            stations[class_] = stations[class_] + 1

        results_new = get_score_per_request_and_speed(speed, class_, type_)

        # Sliding window of the leading mode over the last 10 points.
        if len(last_requests_results) >= 10:
            last_requests_results.pop(0)
        last_requests_results.append(get_greatest(results))

        for key in results_new:
            results[key] += results_new[key]

        additional_results = calculate_additional_from_last_results(last_requests_results, speed)
        # Only a {mode: bonus} mapping can be attributed to a mode; any other
        # return value is ignored.
        if isinstance(additional_results, dict):
            for key, bonus in additional_results.items():
                results[key] += bonus

        # NOTE(review): the running station counts are re-added at every row,
        # so early stations weigh more - confirm this is intended.
        for key in stations:
            results[key] = results[key] + stations[key]

    return get_greatest(results)


def get_score_per_request_and_speed(speed, class_, type_):
    """Score every transport mode for one point.

    The speed windows are taken from `transports_taken_into_account` for the
    given class and type, each falling back to 'default' when unknown. Modes
    without a window for that class/type keep a score of 0.
    """
    scores = dict.fromkeys(('car', 'train', 'bike', 'walk', 'subway', 'airplane'), 0)
    known_class = class_ if class_ in transports_taken_into_account else 'default'
    windows_by_type = transports_taken_into_account[known_class]
    known_type = type_ if type_ in windows_by_type else 'default'
    for mode, window in windows_by_type[known_type].items():
        scores[mode] = calculate_points_from_speed(speed, window['min'], window['max'], window['ideal'])
    return scores


def calculate_points_from_speed(speed, min, max, ideal):
    """Score, between 0 and 1, how well a speed fits a transport's speed window.

    The score is 0 outside [min, max], 1 at `ideal`, and falls off
    quadratically towards both bounds.

    Args:
        speed: measured speed; numeric strings are accepted.
        min, max, ideal: the window. The names shadow the builtins but are
            kept because callers may pass them by keyword.

    Changes compared with the previous implementation:
      * bounds are compared without truncating to int, so a speed just above
        `max` (e.g. 8.5 for max 8) scores 0 instead of a small positive value;
      * the debug print is gone;
      * a window with max == ideal no longer divides by zero;
      * NaN speeds score 0.
    """
    speed = float(speed)
    if speed != speed:  # NaN: nothing to score
        return 0
    if speed < min or speed > max:
        return 0
    if speed < ideal:
        return ((speed - min) / (ideal - min)) ** 2
    diff_max = max - ideal
    if diff_max == 0:
        # Only reachable with speed == ideal == max.
        return 1.0
    diff_speed = speed - ideal
    return ((diff_max - diff_speed) / diff_max) ** 2


def get_greatest(results):
    """Return the key holding the largest value strictly above 0.

    On ties the key met first wins. None is returned when no value is above 0.
    """
    best_key = None
    best_value = 0
    for key, value in results.items():
        if value > best_value:
            best_key, best_value = key, value
    return best_key


# Speed windows per transport mode used for the continuity bonus in
# calculate_additional_from_last_results (there is no 'car' entry).
transports_taken_into_account_for_last_results = {
    'train': dict(min=80, max=400, ideal=140),
    'bike': dict(min=10, max=50, ideal=25),
    'walk': dict(min=0, max=8, ideal=4),
    'airplane': dict(min=400, max=1100, ideal=800),
    'subway': dict(min=25, max=70, ideal=45),
}


def calculate_additional_from_last_results(last_requests_results, speed):
    """Bonus for the mode that best fits the current speed when the leader just changed.

    Args:
        last_requests_results: recent leading modes, oldest first (the caller
            keeps a window of at most 10).
        speed: speed of the current point.

    Returns:
        None when the history holds fewer than two entries, when its last two
        entries are equal, or when no mode fits the speed. Otherwise a
        one-entry dict {mode: bonus}, where the bonus is that mode's speed
        score plus (10 - number of changes in the history) squared.

    Changes compared with the previous implementation: it used the chained
    assignment `returned_result = {}[greatest] = ...`, which returned a bare
    number with no mode attached, and raised KeyError when no mode scored.
    """
    if len(last_requests_results) < 2:
        return None
    if last_requests_results[-2] == last_requests_results[-1]:
        return None

    number_of_changes_in_chain = number_of_changes(last_requests_results)
    # Pre-filled in a fixed order so ties are resolved as before.
    results = {
        'train': 0,
        'bike': 0,
        'walk': 0,
        'subway': 0,
        'airplane': 0
    }
    for key, value in transports_taken_into_account_for_last_results.items():
        results[key] = calculate_points_from_speed(speed, value['min'], value['max'], value['ideal'])

    greatest = get_greatest(results)
    if greatest is None:
        return None
    return {greatest: results[greatest] + ((10 - number_of_changes_in_chain) ** 2)}


def number_of_changes(last_requests_results):
    """Count the adjacent pairs of the sequence whose two values differ.

    :param last_requests_results: indexable sequence of previous results
    :return: number of positions i where element i differs from element i + 1
        (0 for an empty or single-element sequence)
    """
    last_index = len(last_requests_results) - 1
    return sum(
        1
        for position in range(last_index)
        if last_requests_results[position] != last_requests_results[position + 1]
    )


In [2]:
#####################################
### paths to files, please set those
#####################################
# Machine-specific absolute paths, one Windows and one Linux variant each.
# Adjust them to the local checkout before running the cells below.

# project root folder
root_path_windows = 'C:\\Users\\Yan\\Documents\\Projects\\ETNA\\Mappy\\group-768539'
root_path_linux = '/home/yanis.bouzidi/ETNA-Projects/Mappy Geolife transport modal/group-768539'

# GeoLife data folder ("Data" directory under the project root)
geolife_data_path_windows = 'C:\\Users\\Yan\\Documents\\Projects\\ETNA\\Mappy\\group-768539\\Data'
geolife_data_path_linux = '/home/yanis.bouzidi/ETNA-Projects/Mappy Geolife transport modal/group-768539/Data'

# pickles folder ("Pickles" directory under the project root)
pickles_path_windows = 'C:\\Users\\Yan\\Documents\\Projects\\ETNA\\Mappy\\group-768539\\Pickles'
pickles_path_linux = '/home/yanis.bouzidi/ETNA-Projects/Mappy Geolife transport modal/group-768539/Pickles'

In [193]:
# Load the .plt files of the GeoLife data folder (Windows path variant).
# The result is indexed with the 'data' and 'labels' keys in the next cell.
data_frames_and_labels = load_from_plt(geolife_data_path_windows)

0.0%
1.45%
2.9%
4.35%
5.8%
7.25%
8.7%
10.14%
11.59%
13.04%
14.49%
15.94%
17.39%
18.84%
20.29%
21.74%
23.19%
24.64%
26.09%
27.54%
28.99%
30.43%
31.88%
33.33%
34.78%
36.23%
37.68%
39.13%
40.58%
42.03%
43.48%
44.93%
46.38%
47.83%
49.28%
50.72%
52.17%
53.62%
55.07%
56.52%
57.97%
59.42%
60.87%
62.32%
63.77%
65.22%
66.67%
68.12%
69.57%
71.01%
72.46%
73.91%
75.36%
76.81%
78.26%
79.71%
81.16%
82.61%
84.06%
85.51%
86.96%
88.41%
89.86%
91.3%
92.75%
94.2%
95.65%
97.1%
98.55%
100%


In [194]:
# Split the loader result into its 'data' entry and its 'labels' entry
data_frames = data_frames_and_labels['data']
data_frames_labels = data_frames_and_labels['labels']

In [195]:
# Add the user number to the data set.
# NOTE(review): `data_frames` is rebound to the result, so re-running this cell
# feeds it its own output — confirm that is harmless.
data_frames = add_user_number_to_data_frame(data_frames)

In [196]:
# add transport type to the data set
# takes a while
# The frames carrying the user number are bound to `data_frames` by the
# previous step; `data_frames_with_user_number` is never defined in the
# visible notebook and would raise NameError on a fresh kernel.
data_frames = add_transport_modes_to_data_frame(data_frames, data_frames_labels)

0.0%
0.04%
0.08%
0.12%
0.16%
0.2%
0.24%
0.28%
0.32%
0.36%
0.4%
0.44%
0.48%
0.52%
0.56%
0.6%
0.64%
0.68%
0.72%
0.76%
0.8%
0.84%
0.88%
0.92%
0.96%
1.0%
1.04%
1.08%
1.12%
1.16%
1.2%
1.24%
1.28%
1.32%
1.36%
1.4%
1.44%
1.48%
1.52%
1.56%
1.6%
1.64%
1.68%
1.72%
1.76%
1.8%
1.84%
1.88%
1.92%
1.96%
2.0%
2.04%
2.08%
2.12%
2.16%
2.2%
2.23%
2.27%
2.31%
2.35%
2.39%
2.43%
2.47%
2.51%
2.55%
2.59%
2.63%
2.67%
2.71%
2.75%
2.79%
2.83%
2.87%
2.91%
2.95%
2.99%
3.03%
3.07%
3.11%
3.15%
3.19%
3.23%
3.27%
3.31%
3.35%
3.39%
3.43%
3.47%
3.51%
3.55%
3.59%
3.63%
3.67%
3.71%
3.75%
3.79%
3.83%
3.87%
3.91%
3.95%
3.99%
4.03%
4.07%
4.11%
4.15%
4.19%
4.23%
4.27%
4.31%
4.35%
4.39%
4.43%
4.47%
4.51%
4.55%
4.59%
4.63%
4.67%
4.71%
4.75%
4.79%
4.83%
4.87%
4.91%
4.95%
4.99%
5.03%
5.07%
5.11%
5.15%
5.19%
5.23%
5.27%
5.31%
5.35%
5.39%
5.43%
5.47%
5.51%
5.55%
5.59%
5.63%
5.67%
5.71%
5.75%
5.79%
5.83%
5.87%
5.91%
5.95%
5.99%
6.03%
6.07%
6.11%
6.15%
6.19%
6.23%
6.27%
6.31%
6.35%
6.39%
6.43%
6.47%
6.51%
6.55%
6.59%
6.62%
6.66%
6.7%

46.93%
46.97%
47.01%
47.05%
47.09%
47.13%
47.17%
47.2%
47.21%
47.25%
47.29%
47.33%
47.37%
47.41%
47.45%
47.49%
47.53%
47.57%
47.61%
47.65%
47.69%
47.73%
47.77%
47.81%
47.85%
47.89%
47.93%
47.97%
47.97%
48.01%
48.05%
48.09%
48.13%
48.17%
48.21%
48.25%
48.29%
48.33%
48.37%
48.41%
48.45%
48.49%
48.53%
48.57%
48.61%
48.65%
48.69%
48.73%
48.77%
48.81%
48.85%
48.89%
48.93%
48.97%
49.01%
49.05%
49.09%
49.13%
49.17%
49.21%
49.25%
49.29%
49.33%
49.37%
49.41%
49.45%
49.49%
49.53%
49.57%
49.61%
49.65%
49.69%
49.73%
49.77%
49.81%
49.85%
49.89%
49.93%
49.97%
50.01%
50.05%
50.09%
50.13%
50.17%
50.21%
50.25%
50.29%
50.33%
50.37%
50.41%
50.45%
50.49%
50.53%
50.57%
50.61%
50.65%
50.69%
50.72%
50.76%
50.8%
50.84%
50.88%
50.92%
50.96%
51.0%
51.04%
51.08%
51.12%
51.16%
51.2%
51.24%
51.28%
51.32%
51.33%
51.36%
51.4%
51.44%
51.48%
51.52%
51.56%
51.6%
51.64%
51.68%
51.72%
51.76%
51.8%
51.84%
51.88%
51.92%
51.96%
52.0%
52.04%
52.08%
52.12%
52.16%
52.2%
52.24%
52.28%
52.32%
52.36%
52.4%
52.44%
52.48%
52.52%
52

93.79%
93.83%
93.87%
93.91%
93.95%
93.99%
94.03%
94.07%
94.11%
94.15%
94.19%
94.23%
94.27%
94.31%
94.35%
94.39%
94.43%
94.47%
94.51%
94.55%
94.59%
94.63%
94.67%
94.71%
94.75%
94.79%
94.83%
94.86%
94.9%
94.94%
94.98%
95.02%
95.06%
95.1%
95.14%
95.18%
95.22%
95.26%
95.3%
95.34%
95.38%
95.42%
95.46%
95.5%
95.54%
95.58%
95.62%
95.66%
95.7%
95.74%
95.78%
95.82%
95.86%
95.9%
95.94%
95.98%
96.02%
96.06%
96.1%
96.14%
96.18%
96.22%
96.26%
96.3%
96.34%
96.38%
96.42%
96.46%
96.5%
96.54%
96.58%
96.62%
96.66%
96.7%
96.74%
96.78%
96.82%
96.86%
96.9%
96.94%
96.98%
97.02%
97.06%
97.1%
97.14%
97.18%
97.22%
97.26%
97.3%
97.34%
97.38%
97.42%
97.46%
97.5%
97.54%
97.58%
97.62%
97.66%
97.7%
97.74%
97.78%
97.82%
97.86%
97.9%
97.94%
97.98%
98.02%
98.06%
98.1%
98.14%
98.18%
98.22%
98.26%
98.3%
98.34%
98.38%
98.42%
98.46%
98.5%
98.54%
98.55%
98.58%
98.59%
98.62%
98.64%
98.65%
98.66%
98.7%
98.74%
98.78%
98.82%
98.86%
98.9%
98.94%
98.98%
99.02%
99.06%
99.1%
99.14%
99.18%
99.22%
99.25%
99.29%
99.33%
99.37%
99.41%


In [206]:
# Merge all the per-file data sets into a single one (rebinds `data_frames`)
data_frames = merge_all_data_frames_to_one(data_frames)

In [213]:
# Drop the rows that have no transport type (rebinds `data_frames`)
data_frames = drop_rows_without_transport_mode(data_frames)

In [216]:
# Add a travel (trip) number to the data set.
# From here on the result is bound to `data_frame` (singular).
data_frame = add_travel_number(data_frames)

0.0%
0.09%
0.18%
0.28%
0.37%
0.46%
0.55%
0.64%
0.74%
0.83%
0.92%
1.01%
1.1%
1.19%
1.29%
1.38%
1.47%
1.56%
1.65%
1.75%
1.84%
1.93%
2.02%
2.11%
2.21%
2.3%
2.39%
2.48%
2.57%
2.67%
2.76%
2.85%
2.94%
3.03%
3.12%
3.22%
3.31%
3.4%
3.49%
3.58%
3.68%
3.77%
3.86%
3.95%
4.04%
4.14%
4.23%
4.32%
4.41%
4.5%
4.6%
4.69%
4.78%
4.87%
4.96%
5.05%
5.15%
5.24%
5.33%
5.42%
5.51%
5.61%
5.7%
5.79%
5.88%
5.97%
6.07%
6.16%
6.25%
6.34%
6.43%
6.52%
6.62%
6.71%
6.8%
6.89%
6.98%
7.08%
7.17%
7.26%
7.35%
7.44%
7.54%
7.63%
7.72%
7.81%
7.9%
8.0%
8.09%
8.18%
8.27%
8.36%
8.45%
8.55%
8.64%
8.73%
8.82%
8.91%
9.01%
9.1%
9.19%
9.28%
9.37%
9.47%
9.56%
9.65%
9.74%
9.83%
9.93%
10.02%
10.11%
10.2%
10.29%
10.38%
10.48%
10.57%
10.66%
10.75%
10.84%
10.94%
11.03%
11.12%
11.21%
11.3%
11.4%
11.49%
11.58%
11.67%
11.76%
11.86%
11.95%
12.04%
12.13%
12.22%
12.31%
12.41%
12.5%
12.59%
12.68%
12.77%
12.87%
12.96%
13.05%
13.14%
13.23%
13.33%
13.42%
13.51%
13.6%
13.69%
13.79%
13.88%
13.97%
14.06%
14.15%
14.24%
14.34%
14.43%
14.52%
14.61%
14.7%

In [4]:
# calculate speed for each row and adds it
# Input is the frame produced by the travel-number step (`data_frame`);
# `data_frame__` is not defined in the visible notebook and would raise NameError
# on a fresh kernel.
data_frame = add_speed_to_data_frame(data_frame)

0.0%
0.04%
0.08%
0.12%
0.16%
0.2%
0.24%
0.28%
0.32%
0.36%
0.4%
0.44%
0.48%
0.52%
0.56%
0.6%
0.64%
0.68%
0.72%
0.76%
0.8%
0.84%
0.88%
0.92%
0.96%
1.0%
1.04%
1.08%
1.12%
1.16%
1.2%
1.24%
1.28%
1.32%
1.36%
1.4%
1.44%
1.48%
1.52%
1.56%
1.6%
1.64%
1.68%
1.72%
1.76%
1.8%
1.84%
1.88%
1.92%
1.96%
2.0%
2.04%
2.08%
2.12%
2.16%
2.2%
2.24%
2.28%
2.32%
2.36%
2.4%
2.44%
2.48%
2.52%
2.56%
2.6%
2.64%
2.68%
2.72%
2.76%
2.8%
2.84%
2.88%
2.92%
2.96%
3.0%
3.04%
3.08%
3.12%
3.16%
3.2%
3.24%
3.28%
3.32%
3.36%
3.4%
3.44%
3.48%
3.52%
3.56%
3.6%
3.63%
3.67%
3.71%
3.75%
3.79%
3.83%
3.87%
3.91%
3.95%
3.99%
4.03%
4.07%
4.11%
4.15%
4.19%
4.23%
4.27%
4.31%
4.35%
4.39%
4.43%
4.47%
4.51%
4.55%
4.59%
4.63%
4.67%
4.71%
4.75%
4.79%
4.83%
4.87%
4.91%
4.95%
4.99%
5.03%
5.07%
5.11%
5.15%
5.19%
5.23%
5.27%
5.31%
5.35%
5.39%
5.43%
5.47%
5.51%
5.55%
5.59%
5.63%
5.67%
5.71%
5.75%
5.79%
5.83%
5.87%
5.91%
5.95%
5.99%
6.03%
6.07%
6.11%
6.15%
6.19%
6.23%
6.27%
6.31%
6.35%
6.39%
6.43%
6.47%
6.51%
6.55%
6.59%
6.63%
6.67%
6.71%
6.75%

48.93%
48.97%
49.01%
49.05%
49.09%
49.13%
49.17%
49.21%
49.25%
49.29%
49.33%
49.37%
49.41%
49.45%
49.49%
49.53%
49.57%
49.61%
49.65%
49.69%
49.73%
49.77%
49.81%
49.85%
49.89%
49.93%
49.97%
50.01%
50.05%
50.09%
50.13%
50.17%
50.21%
50.25%
50.29%
50.33%
50.37%
50.41%
50.45%
50.49%
50.53%
50.57%
50.61%
50.65%
50.69%
50.73%
50.77%
50.81%
50.85%
50.89%
50.93%
50.97%
51.01%
51.05%
51.09%
51.13%
51.17%
51.21%
51.25%
51.29%
51.33%
51.37%
51.41%
51.45%
51.49%
51.53%
51.57%
51.61%
51.65%
51.69%
51.73%
51.77%
51.81%
51.85%
51.89%
51.93%
51.97%
52.01%
52.05%
52.09%
52.13%
52.17%
52.21%
52.25%
52.29%
52.33%
52.37%
52.41%
52.45%
52.49%
52.53%
52.57%
52.61%
52.65%
52.69%
52.73%
52.77%
52.81%
52.85%
52.89%
52.93%
52.97%
53.01%
53.05%
53.09%
53.13%
53.17%
53.21%
53.25%
53.29%
53.33%
53.37%
53.41%
53.45%
53.49%
53.53%
53.57%
53.61%
53.65%
53.69%
53.73%
53.77%
53.81%
53.85%
53.89%
53.93%
53.97%
54.0%
54.04%
54.08%
54.12%
54.16%
54.2%
54.24%
54.28%
54.32%
54.36%
54.4%
54.44%
54.48%
54.52%
54.56%
54.6%
54.

96.31%
96.35%
96.39%
96.43%
96.47%
96.51%
96.55%
96.59%
96.63%
96.67%
96.71%
96.75%
96.79%
96.83%
96.87%
96.91%
96.95%
96.99%
97.03%
97.07%
97.11%
97.15%
97.18%
97.22%
97.26%
97.3%
97.34%
97.38%
97.42%
97.46%
97.5%
97.54%
97.58%
97.62%
97.66%
97.7%
97.74%
97.78%
97.82%
97.86%
97.9%
97.94%
97.98%
98.02%
98.06%
98.1%
98.14%
98.18%
98.22%
98.26%
98.3%
98.34%
98.38%
98.42%
98.46%
98.5%
98.54%
98.58%
98.62%
98.66%
98.7%
98.74%
98.78%
98.82%
98.86%
98.9%
98.94%
98.98%
99.02%
99.06%
99.1%
99.14%
99.18%
99.22%
99.26%
99.3%
99.34%
99.38%
99.42%
99.46%
99.5%
99.54%
99.58%
99.62%
99.66%
99.7%
99.74%
99.78%
99.82%
99.86%
99.9%
99.94%
99.98%
100%


In [5]:
# save data set to one file, pickle
# The frame with speeds is bound to `data_frame`; `data_frame_sp` is not defined
# in the visible notebook and would raise NameError on a fresh kernel.
save_merged_pickle(root_path_windows, data_frame)

In [225]:
# Display the final data set (a cell renders its last expression as output).
data_frame
# Next step: go to the find_speeds file to use the data.

Unnamed: 0,latitude,longitude,other,altitude,timestamp,date,time,user,transport,travel_number,speed
0,39.894178,116.318200,0,-777.0,39535.621296,2008-03-28,14:54:40,010,train,0,0
1,39.894505,116.321132,0,-777.0,39535.621690,2008-03-28,14:55:14,010,train,0,26.7716
2,39.894953,116.326452,0,-777.0,39535.622373,2008-03-28,14:56:13,010,train,0,27.868
3,39.894600,116.332542,0,-777.0,39535.623056,2008-03-28,14:57:12,010,train,0,31.8015
4,39.889622,116.337040,0,-777.0,39535.623738,2008-03-28,14:58:11,010,train,0,41.1102
...,...,...,...,...,...,...,...,...,...,...,...
5440611,40.029320,116.411975,0,289.0,39781.103808,2008-11-29,02:29:29,179,subway,7746,41.8458
5440612,40.029111,116.411963,0,275.0,39781.103831,2008-11-29,02:29:31,179,subway,7746,41.8851
5440613,40.028904,116.411962,0,274.0,39781.103854,2008-11-29,02:29:33,179,subway,7746,41.4445
5440614,40.028697,116.411961,0,274.0,39781.103877,2008-11-29,02:29:35,179,subway,7746,41.4445
