Le but de ce fichier est de mettre en forme les données avant de les utiliser pour le réseau de neurones.

In [1]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import os
import re
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import models, transforms, datasets

# Data loading and cleaning

In [2]:
# Read the train and test CSV files, then discard the first column of each frame
# (presumably an index column written by an earlier export -- TODO confirm).
df_train, df_test = (
    pd.read_csv(csv_path).pipe(lambda raw: raw.drop(raw.columns[0], axis=1))
    for csv_path in ('data_truncate_train_label.csv', 'data_truncate_test_label.csv')
)

In [3]:
# Remove, in place, every row that holds at least one missing value.
df_train.dropna(axis='index', how='any', inplace=True)
df_test.dropna(axis='index', how='any', inplace=True)

# New features
## origin_call, origin_stand and taxi_id

In [4]:
# Distinct raw identifiers found in the training frame, one list per categorical column.
# Only df_train is scanned, so the lists contain training-set values exclusively.
call_id, stand_id, taxi_id = (
    list(set(df_train[column].values))
    for column in ('ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID')
)

In [5]:
# Lookup tables: raw identifier -> its position (0 .. len-1) in the matching id list.
# Judging by the names, the positions are meant as embedding indices -- TODO confirm.
convert_for_embed_call = {raw: position for position, raw in enumerate(call_id)}
convert_for_embed_stand = {raw: position for position, raw in enumerate(stand_id)}
convert_for_embed_taxi = {raw: position for position, raw in enumerate(taxi_id)}

In [6]:
# Replace the raw identifiers by their integer codes, first in df_train, then in df_test.
# NOTE(review): the tables are built from df_train only and applied with Series.map, so a
# df_test identifier missing from a table becomes NaN; such rows are then removed by the
# later dropna on df_test -- confirm this silent loss of test rows is intended.
# NOTE(review): re-running this cell maps already-encoded values a second time.
for frame in (df_train, df_test):
    for column, lookup in (
        ('ORIGIN_CALL', convert_for_embed_call),
        ('ORIGIN_STAND', convert_for_embed_stand),
        ('TAXI_ID', convert_for_embed_taxi),
    ):
        frame[column] = frame[column].map(lookup)
# Do not leave loop names behind: `frame` would keep an extra reference to df_test.
del frame, column, lookup

## First and last five points of the taxi's trajectory

In [7]:
def start_points(poly, num):
    """Return the value at flat position ``num`` counted from the start of a polyline.

    ``poly`` is a string of bracketed, comma-separated numbers, e.g.
    "[[-8.6,41.1],[-8.7,41.2]]". Brackets are stripped and the numbers are read as one
    flat list, so the two members of each pair alternate: the callers in this notebook
    store even positions as *_LON and odd positions as *_LAT.

    Parameters
    ----------
    poly : str, None or NaN
        Serialized polyline. None and float NaN are treated as "no trajectory".
    num : int
        Zero-based position in the flattened list.

    Returns
    -------
    float or None
        The requested value. When ``num`` is past the end, the matching member of the
        last pair is returned instead (second-to-last value for even ``num``, last value
        for odd ``num``). None when the polyline is missing or flattens to a single token
        (e.g. "[]").
    """
    # Missing cells (None / NaN) have no .replace(); report them like an empty polyline.
    if poly is None or (isinstance(poly, float) and poly != poly):
        return None

    coords = poly.replace("[", "").replace("]", "").split(",")
    count = len(coords)
    if count == 1:
        # "[]" flattens to [""]: there is no point to read.
        return None
    if num < count:
        return float(coords[num])
    # Trajectory too short: repeat the last pair, keeping the even/odd alternation.
    return float(coords[-2] if num % 2 == 0 else coords[-1])
        
def end_points(poly, num):
    """Return the value at flat position ``num`` counted from the end of a polyline.

    ``poly`` is a string of bracketed, comma-separated numbers, e.g.
    "[[-8.6,41.1],[-8.7,41.2]]". Brackets are stripped and the numbers are read as one
    flat list; ``num = 0`` is the last value, ``num = 1`` the one before it, and so on.
    The callers in this notebook store odd positions as *_LON and even positions as *_LAT.

    Parameters
    ----------
    poly : str, None or NaN
        Serialized polyline. None and float NaN are treated as "no trajectory".
    num : int
        Zero-based position counted backwards from the last value.

    Returns
    -------
    float or None
        The requested value. When ``num`` reaches past the beginning, the matching member
        of the first pair is returned instead (second value for even ``num``, first value
        for odd ``num``). None when the polyline is missing or flattens to a single token
        (e.g. "[]").
    """
    # Missing cells (None / NaN) have no .replace(); report them like an empty polyline.
    if poly is None or (isinstance(poly, float) and poly != poly):
        return None

    coords = poly.replace("[", "").replace("]", "").split(",")
    count = len(coords)
    if count == 1:
        # "[]" flattens to [""]: there is no point to read.
        return None
    if num < count:
        return float(coords[count - num - 1])
    # Trajectory too short: repeat the first pair, keeping the even/odd alternation.
    return float(coords[1] if num % 2 == 0 else coords[0])

In [8]:
def _add_start_features(frame, n_points=5):
    """Add START_<k>_LON / START_<k>_LAT columns (k = 1..n_points) to ``frame`` in place.

    For each k, the two columns are read from flat positions 2*(k-1) and 2*(k-1)+1 of
    the POLYLINE_TRUNCATE string through ``start_points``. Columns are created in the
    order START_1_LON, START_1_LAT, START_2_LON, ...

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding a POLYLINE_TRUNCATE column; it is modified in place.
    n_points : int, default 5
        Number of leading trajectory points to extract.
    """
    for rank in range(1, n_points + 1):
        lon_pos = 2 * (rank - 1)
        frame['START_{}_LON'.format(rank)] = frame['POLYLINE_TRUNCATE'].apply(
            lambda x, pos=lon_pos: start_points(x, pos))
        frame['START_{}_LAT'.format(rank)] = frame['POLYLINE_TRUNCATE'].apply(
            lambda x, pos=lon_pos + 1: start_points(x, pos))


_add_start_features(df_train)
_add_start_features(df_test)

In [9]:
def _add_end_features(frame, n_points=5):
    """Add END_<k>_LON / END_<k>_LAT columns (k = 1..n_points) to ``frame`` in place.

    Positions are counted backwards from the end of the POLYLINE_TRUNCATE string through
    ``end_points``: END_<n_points> uses offsets 1 and 0 (the last two flat values) and
    END_1 uses offsets 2*n_points-1 and 2*n_points-2. Columns are created in the order
    END_1_LON, END_1_LAT, END_2_LON, ...

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding a POLYLINE_TRUNCATE column; it is modified in place.
    n_points : int, default 5
        Number of trailing trajectory points to extract.
    """
    for rank in range(1, n_points + 1):
        lat_offset = 2 * (n_points - rank)
        frame['END_{}_LON'.format(rank)] = frame['POLYLINE_TRUNCATE'].apply(
            lambda x, offset=lat_offset + 1: end_points(x, offset))
        frame['END_{}_LAT'.format(rank)] = frame['POLYLINE_TRUNCATE'].apply(
            lambda x, offset=lat_offset: end_points(x, offset))


_add_end_features(df_train)
_add_end_features(df_test)

# Data labels

In [10]:
# Split the LABEL string into two numeric columns: LABEL_LAT is its last flat value
# (offset 0 from the end) and LABEL_LON the value just before it (offset 1).
for frame in (df_train, df_test):
    for column, offset in (('LABEL_LON', 1), ('LABEL_LAT', 0)):
        frame[column] = frame['LABEL'].apply(lambda x, k=offset: end_points(x, k))
# Do not leave loop names behind: `frame` would keep an extra reference to df_test.
del frame, column, offset

# Data cleaning and saving

In [11]:
# Discard the two raw string columns, then drop every row that still contains a
# missing value (e.g. a None returned by start_points / end_points).
df_train, df_test = (
    frame.drop(['POLYLINE_TRUNCATE', 'LABEL'], axis=1).dropna(axis=0)
    for frame in (df_train, df_test)
)

In [12]:
# Write both prepared frames to disk; the row index is written as the first CSV column.
df_train.to_csv(path_or_buf='train_avant_tenseur.csv', index=True)
df_test.to_csv(path_or_buf='test_avant_tenseur.csv', index=True)

In [51]:
# Small extract of the training frame saved separately (the file name suggests quick trials).
# NOTE(review): .loc slices by index *label* and includes the end label, and the index
# was not reset after the dropna calls, so this keeps the rows whose label is <= 10000,
# not necessarily 10000 rows. Use .iloc[:10000] if a fixed row count is wanted.
df_try = df_train.loc[:10000, :]
df_try.to_csv(path_or_buf='try_avant_tenseur.csv', index=True)