In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import pyproj 
from tqdm import tqdm
import reverse_geocoder as rg

from shapely.geometry import Point
from shapely.geometry import LineString

from preprocessing.pre_process_stations import *
from preprocessing.pre_process_traffic import *
from preprocessing.helping_functions import *

from features.config import *
from features.financials_part_2 import *
from features.question_1 import *
from features.question_2 import *

In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
class Config:
    PATH = '../data/'

class Params:
    PARAM = 0
config = Config()
p = Params()

## Load data

In [32]:
##### Load new coordinates
df_new_points = gpd.read_file(config.PATH+'new_coordinates/new_coordinates.shp')

##### Load hub data 
df_hub_dense = gpd.read_file(config.PATH+'F-aire-logistiques-donnees-detaillees/Aires_logistiques_denses.shp')
df_hub_enlarged = gpd.read_file(config.PATH+'F-aire-logistiques-donnees-detaillees/Aires_logistiques_elargies.shp')

In [33]:
##### Load traffic data
df_traffic = gpd.read_file(config.PATH+'E-tmja2019-shp/TMJA2019.shp')
df_traffic = preprocess_data(df_traffic)
df_traffic = fix_tmja(df_traffic)

In [34]:
df_new_points = distance_to_hub(df_hub_dense, df_hub_enlarged, df_new_points)

100%|██████████| 13978/13978 [00:48<00:00, 285.60it/s]


In [35]:
df_new_points

Unnamed: 0,index,easting,northing,route,geometry,closest_dense_hub,distance_to_closest_dense_hub,closest_elargie_hub,distance_to_closest_large_hub
0,0,511656.784599997,6204078.3607,31D0044,POINT (511656.785 6204078.361),1.0,0 80409.81839 dtype: float64,5.0,0 77833.926035 dtype: float64
1,1,511677.2349017429,6204069.939337369,31D0044,POINT (511677.235 6204069.939),1.0,0 80403.066689 dtype: float64,5.0,0 77816.517819 dtype: float64
2,2,511367.215000004,6204210.064,31D0044E,POINT (511367.215 6204210.064),1.0,0 80496.520264 dtype: float64,5.0,0 78077.336309 dtype: float64
3,3,511736.9510113234,6204015.573582755,31D0044E,POINT (511736.951 6204015.574),1.0,0 80406.158813 dtype: float64,5.0,0 77773.778673 dtype: float64
4,4,844036.840899996,6510806.4495,69D0301,POINT (844036.841 6510806.450),1.0,0 0.0 dtype: float64,5.0,0 0.0 dtype: float64
...,...,...,...,...,...,...,...,...,...
13973,13973,440432.83714999165,6360261.379555852,P0524,POINT (440432.837 6360261.380),1.0,0 28209.517913 dtype: float64,5.0,0 28209.517913 dtype: float64
13974,13974,458798.853,6325912.317,P0524,POINT (458798.853 6325912.317),1.0,0 36411.21702 dtype: float64,5.0,0 36411.21702 dtype: float64
13975,13975,458011.7067121267,6324604.018405371,P0524,POINT (458011.707 6324604.018),1.0,0 35410.422032 dtype: float64,5.0,0 35410.422032 dtype: float64
13976,13976,548722.843000002,6283728.592,P0542,POINT (548722.843 6283728.592),1.0,0 14101.845404 dtype: float64,5.0,0 11284.702491 dtype: float64


In [39]:
df_new_points = station_distances_all(df_new_points)

100%|██████████| 13978/13978 [03:24<00:00, 68.19it/s]


In [40]:
df_new_points

Unnamed: 0,index,easting,northing,route,geometry,closest_dense_hub,distance_to_closest_dense_hub,closest_elargie_hub,distance_to_closest_large_hub,distance_to_point_0,...,distance_to_point_13968,distance_to_point_13969,distance_to_point_13970,distance_to_point_13971,distance_to_point_13972,distance_to_point_13973,distance_to_point_13974,distance_to_point_13975,distance_to_point_13976,distance_to_point_13977
0,0,511656.784599997,6204078.3607,31D0044,POINT (511656.785 6204078.361),1.0,0 80409.81839 dtype: float64,5.0,0 77833.926035 dtype: float64,0.000000,...,95690.545129,94828.849457,95388.653530,96100.137959,172451.851977,171656.593434,132806.151366,131925.086875,87852.444653,88577.350931
1,1,511677.2349017429,6204069.939337369,31D0044,POINT (511677.235 6204069.939),1.0,0 80403.066689 dtype: float64,5.0,0 77816.517819 dtype: float64,22.116378,...,95688.536479,94826.391777,95385.876020,96096.923993,172467.985764,171672.741600,132822.017253,131941.097209,87851.454304,88576.316975
2,2,511367.215000004,6204210.064,31D0044E,POINT (511367.215 6204210.064),1.0,0 80496.520264 dtype: float64,5.0,0 78077.336309 dtype: float64,318.113679,...,95708.508147,94853.294210,95417.710712,96135.493480,172212.197767,171416.740862,132570.249561,131687.184350,87855.786597,88581.315666
3,3,511736.9510113234,6204015.573582755,31D0044E,POINT (511736.951 6204015.574),1.0,0 80406.158813 dtype: float64,5.0,0 77773.778673 dtype: float64,101.827676,...,95709.100340,94845.358253,95403.697152,96113.172608,172542.203805,171746.989894,132895.666885,132015.055580,87875.602481,88600.312665
4,4,844036.840899996,6510806.4495,69D0301,POINT (844036.841 6510806.450),1.0,0 0.0 dtype: float64,5.0,0 0.0 dtype: float64,452281.574116,...,363624.801267,363619.885487,362567.420238,361204.146118,430660.233341,430766.769758,427310.364436,428586.921838,372524.778669,371801.246730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13973,13973,440432.83714999165,6360261.379555852,P0524,POINT (440432.837 6360261.380),1.0,0 28209.517913 dtype: float64,5.0,0 28209.517913 dtype: float64,171656.593434,...,135824.870412,137820.180687,139177.346669,141066.000282,811.912675,0.000000,38950.848985,39755.050739,132604.649006,132649.143762
13974,13974,458798.853,6325912.317,P0524,POINT (458798.853 6325912.317),1.0,0 36411.21702 dtype: float64,5.0,0 36411.21702 dtype: float64,132806.151366,...,104005.862862,105829.947492,107278.334730,109273.245420,39734.670328,38950.848985,0.000000,1526.841344,99326.686406,99501.842632
13975,13975,458011.7067121267,6324604.018405371,P0524,POINT (458011.707 6324604.018),1.0,0 35410.422032 dtype: float64,5.0,0 35410.422032 dtype: float64,131925.086875,...,104286.422933,106094.691055,107548.090145,109548.402336,40545.358033,39755.050739,1526.841344,0.000000,99495.279941,99681.462427
13976,13976,548722.843000002,6283728.592,P0542,POINT (548722.843 6283728.592),1.0,0 14101.845404 dtype: float64,5.0,0 11284.702491 dtype: float64,87852.444653,...,8906.825468,9249.582246,10578.328584,12414.818037,133210.186501,132604.649006,99326.686406,99495.279941,0.000000,745.502273


## Genetic algorithm deap library


In [41]:
df_stations = df_new_points.merge(df_traffic, how='left', on='route')

MemoryError: Unable to allocate 6.49 MiB for an array with shape (850004, 1) and data type float64

In [20]:
df_stations = sales(df_stations,2030)

In [19]:
df_stations.to_csv('../data/df_stations.csv')

In [27]:
df_stations.columns

Index(['easting', 'northing', 'route', 'geometry_x', 'dateRefere', 'longueur',
       'prD', 'depPrD', 'concession', 'absD', 'cumulD', 'xD', 'yD', 'zD',
       'prF', 'depPrF', 'concessi_1', 'absF', 'cumulF', 'xF', 'yF', 'zF',
       'anneeMesur', 'typeCompta', 'typeComp_1', 'TMJA', 'ratio_PL',
       'geometry_y', 'TMJA_PL', 'percentage_traffic', 'lonD', 'latD', 'lonF',
       'latF', 'Quantity_sold_per_day(in kg)', 'Revenues_day'],
      dtype='object')

In [23]:
import random
import numpy as np
from deap import base, creator, tools

# Define the fitness function that takes a pandas DataFrame as input and returns a fitness score
def fitness(X, data: pd.DataFrame=df_stations):
    index = X[0]

    if type(index)==int:
        print('error')
        index = np.random.choice(list(range(13978)), size=376, replace=False)

    columns_distance = [c for c in data.columns if c.startswith('distance_to_point_')==True]
    print(len(columns_distance))
    if data.iloc[index,:].shape[0]>0:
        
        columns_distance_drop = [columns_distance[i] for i in index]
        data_sub = get_closer_station(data.iloc[index,:].drop(columns=columns_distance_drop))
        data_sub = data_sub[data_sub['distance_closer_station'] < 150000*0.8]
        # constraint profitability: make sure the station is profitable 
        fit = - (data_sub['distance_to_closest_large_hub'].mean() + data_sub['distance_to_closest_dense_hub'].mean() - data_sub['distance_closer_station'].mean())/100 + data_sub['Revenues_day'].mean()

    else:
        fit = 10e10
        
    return fit


In [25]:
columns_distance = [c for c in df_stations.columns if c.startswith('distance_to_point_')==True]
columns_distance

[]

In [24]:

# Define the evaluation function
def evaluate(individual):
    return fitness(individual),

# Set up the DEAP toolbox
creator.create("FitnessMax", base.Fitness, weights=[1.0,])
creator.create("Individual", list, fitness=creator.FitnessMax)
toolbox = base.Toolbox()

# Register the genetic operators
toolbox.register("individual", tools.initRepeat, creator.Individual, lambda: np.random.choice(list(range(3089)), size=376, replace=False),n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutUniformInt, low=0, up=3, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", evaluate)

# Set the number of generations and the population size
num_generations = 50
population_size = 50

# Create the initial population
population = toolbox.population(n=population_size)

# Evaluate the initial population
fitnesses = list(map(toolbox.evaluate, population))
for ind, fit in zip(population, fitnesses):        
    ind.fitness.values = fit

# Set up the hall of fame and statistics objects
hof = tools.HallOfFame(1)
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("max", np.max)
stats.register("min", np.min)
stats.register("argmin", np.argmin)

avg_fitness = []
min_fitness = []
max_fitness = []
arg_min_fitness = []

# Start the evolution process
for generation in range(num_generations):
    
    population = [x for x in population if type(x)!=int]

    # Select the next generation individuals
    offspring = toolbox.select(population, len(population))

    # Clone the selected individuals
    offspring = list(map(toolbox.clone, offspring))

    # Apply crossover and mutation on the offspring
    for child1, child2 in zip(offspring[::2], offspring[1::2]):
        if len(child1[0])==1:
            child1[0] = np.random.choice(list(range(3089)), size=376, replace=False)
        elif len(child1[0])==1:
            child2[0] = np.random.choice(list(range(3089)), size=376, replace=False)
        else:
            toolbox.mate(child1[0], child2[0])
        del child1.fitness.values
        del child2.fitness.values

    for mutant in offspring:
        toolbox.mutate(mutant)
        del mutant.fitness.values

    # Evaluate the new individuals
    fresh_individuals = [ind for ind in offspring if not ind.fitness.valid]
    fitnesses = map(toolbox.evaluate, fresh_individuals)
    for ind, fit in zip(fresh_individuals, fitnesses):
        ind.fitness.values = fit

    # Add the new individuals to the population
    population[:] = offspring

    # Update the hall of fame and statistics
    #hof.update(population)
    record = stats.compile(population)
    print("Generation {}:".format(generation + 1))
    avg_fitness.append(record["avg"])
    min_fitness.append(record["min"])
    max_fitness.append(record["max"])
    arg_min_fitness.append(record["argmin"])

0


ValueError: Cannot set a DataFrame with multiple columns to the single column distance_closer_station

In [15]:
fig = plt.figure(figsize=(10,6))
plt.plot(np.arange(len(avg_fitness)),avg_fitness,label='mean')
plt.xlabel('Generations')
plt.ylabel('Average fitness function')
plt.legend()
plt.title('Evolution of fitness function')
plt.show()

NameError: name 'avg_fitness' is not defined

<Figure size 1000x600 with 0 Axes>

In [427]:
results = pd.DataFrame(population) # the best individual per generation

In [428]:
results

Unnamed: 0,0
0,"[625, 1820, 2820, 2110, 2458, 2556, 2018, 2354..."
1,3
2,"[625, 1820, 2820, 2110, 2458, 2556, 2018, 2354..."
3,"[625, 1820, 2820, 2110, 2458, 2556, 2018, 2354..."
4,1
5,"[625, 1820, 2820, 2110, 2458, 2556, 2018, 2354..."
6,"[625, 1820, 2820, 2110, 2458, 2556, 2018, 2354..."
7,"[625, 1820, 2820, 2110, 2458, 2556, 2018, 2354..."
8,"[625, 1820, 2820, 2110, 2458, 2556, 2018, 2354..."
9,"[625, 1820, 2820, 2110, 2458, 2556, 2018, 2354..."


In [429]:
results.to_csv(config.PATH+'results_part_2.csv')

In [431]:
best_gen = np.argmin(min_fitness)
best_gen

In [375]:
arg_min_fitness[best_gen]

7

In [None]:
list_of_station_points = pd.Series(results.iloc[best_gen].values[0].replace('[ ', '').replace(']', '').replace('\n', '').replace('   ', ' ').replace('  ', ' ').split(' ')).astype(int)

In [None]:
### NEXT STEPS

# optimisation sur les valeurs des coordonnées des routes qu'on a choisi 
# (pour optimiser sur ces valeurs et non sur les stations existantes)

# fonction qui mesure distance entre deux stations sur reseaux routier (et non a vol d'oiseau)
