In [39]:
import pandas as pd
import numpy as np
import time
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor

In [40]:
# Load the labelled training set; later cells read its `trip_duration` column as the target.
dfValue = pd.read_csv(
    'train.csv',
    engine='c',
    memory_map=True,
)

In [41]:
# Load the second CSV shipped with the data set (not used by the cells shown below).
dfTest = pd.read_csv(
    'test.csv',
    engine='c',
    memory_map=True,
)

In [42]:
# Split the raw frame into the regression target (Y) and every other column (X).
Y = dfValue['trip_duration']
X = dfValue.drop(columns=['trip_duration'])

In [43]:
# Parse the two timestamp columns once so their components can be extracted below.
pickupDatetime = pd.to_datetime(X.pickup_datetime)
dropoffDatetime = pd.to_datetime(X.dropoff_datetime)

# Drop the identifier and the raw timestamp columns; X itself is left untouched.
XDrop = X.drop(['id', 'pickup_datetime', 'dropoff_datetime'], axis=1)

# Re-create numeric columns from each timestamp (hour, day of week, ISO week, day of year).
# The pickup columns are added first, then the dropoff columns, in the same order as before.
#
# NOTE(review): the dropoff-derived columns are computed from `dropoff_datetime`.
# If `trip_duration` is simply dropoff minus pickup, these features leak the target
# and would not be available at prediction time -- please confirm before relying on
# the scores produced with them.
for prefix, stamps in (('pickup', pickupDatetime), ('dropoff', dropoffDatetime)):
    XDrop[prefix + 'Hour'] = stamps.dt.hour
    XDrop[prefix + 'DayOfWeek'] = stamps.dt.dayofweek
    # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in pandas 2.0;
    # `isocalendar().week` is its replacement. It yields a nullable UInt32 column, so
    # cast to a plain integer dtype (assumes no missing timestamps -- TODO confirm).
    XDrop[prefix + 'WeekOfYear'] = stamps.dt.isocalendar().week.astype('int64')
    XDrop[prefix + 'DayOfYear'] = stamps.dt.dayofyear

# 60 % of the rows go to training, the rest to evaluation; the seed is fixed for reproducibility.
XTrain, XTest, YTrain, Ytest = train_test_split(XDrop, Y, train_size=0.6, random_state=19061996)

In [44]:
## PREPROCESSING: ONE-HOT ENCODING OF `store_and_fwd_flag` ##
# (A LabelEncoder-based variant was tried earlier; one-hot encoding is used instead.)

# The keyword that requests a dense array was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old spelling was removed in 1.4.
# Try the current name first and fall back for older installations.
try:
    OHEncoder = OneHotEncoder(sparse_output=False)
except TypeError:
    OHEncoder = OneHotEncoder(sparse=False)

# Fit on the training split only, then apply the fitted encoder to the test split.
OHTrainValues = OHEncoder.fit_transform(XTrain[['store_and_fwd_flag']])
OHTestValues = OHEncoder.transform(XTest[['store_and_fwd_flag']])

# Give the encoded columns string names. With the default integer names (0, 1, ...)
# the concatenated frame would mix int and str column labels, which recent
# scikit-learn versions reject when the frame is passed to `fit`.
OHColumnNames = [
    'store_and_fwd_flag_{}'.format(category)
    for category in OHEncoder.categories_[0]
]

# Re-attach the original row index so the concat below aligns row by row.
OHColumnsTrain = pd.DataFrame(OHTrainValues, columns=OHColumnNames, index=XTrain.index)
OHColumnsTest = pd.DataFrame(OHTestValues, columns=OHColumnNames, index=XTest.index)

# Remove the raw categorical column and append its encoded replacement.
XTrainCategoricalRemoved = XTrain.loc[:, XTrain.columns != 'store_and_fwd_flag']
XTestCategoricalRemoved = XTest.loc[:, XTest.columns != 'store_and_fwd_flag']

XTrainFinal = pd.concat([XTrainCategoricalRemoved, OHColumnsTrain], axis=1)
XTestFinal = pd.concat([XTestCategoricalRemoved, OHColumnsTest], axis=1)

In [45]:
# n_estimators=150, max_depth=10 --> best results on 10_000 rows with the one-hot encoder
def train (n, d):
    """Fit a random forest with the given size and report its error.

    A ``RandomForestRegressor`` with ``n_estimators=n`` and ``max_depth=d`` is
    fitted on the module-level ``XTrainFinal`` / ``YTrain`` and scored with the
    mean absolute error on ``XTestFinal`` / ``Ytest``. Only the ``fit`` call is
    timed. A start line and a result line are printed.

    Parameters
    ----------
    n : int
        Number of trees in the forest.
    d : int
        Maximum depth of each tree.

    Returns
    -------
    float
        Mean absolute error on the held-out split, so callers can collect and
        compare results instead of parsing the printed output.
    """
    # Each message carries its own newline and is printed with end='' so the
    # whole line goes out in a single print call (presumably to keep lines from
    # interleaving when several threads train at once).
    print("Starting n={} d={}\n".format(n, d), end='')

    model = RandomForestRegressor(random_state=19061996, n_estimators=n, max_depth=d)

    start = time.time()
    model.fit(XTrainFinal, YTrain)
    elapsed = time.time() - start

    preds = model.predict(XTestFinal)
    mean = mean_absolute_error(Ytest, preds)

    print("n={} d={} t={}s e={}\n".format(n, d, elapsed, mean), end='')
    return mean

In [46]:
# Candidate values for the grid: N = number of trees, D = maximum tree depth.
N = [2, 4, 5, 10]
D = [2, 3, 6, 7]

# Selection grid: rows follow N, columns follow D.
#
# +---------> D
# |
# |
# N
#
# A 1 marks an (N[i], D[j]) combination that should be trained.
mask = [
    [0, 1, 1, 0],
    [1, 1, 0, 0],
    [0, 0, 1, 1],
    [1, 0, 0, 1],
]

# Sanity check: the mask must have one row per N value and one column per D value.
assert len(mask) == len(N)
for row in mask:
    assert len(row) == len(D)

In [47]:
import threading  # Training runs are getting long, so spread them over several threads

# Guards every read/write of `mask` so that each grid cell is claimed by exactly one thread.
lock = threading.Lock()

def threadFunc():
    """Worker loop: claim every still-pending grid cell and train it.

    The function walks the whole N x D grid. For each cell it takes ``lock``,
    checks whether ``mask[i][j]`` is still 1 and, if so, sets it to 0 to claim
    it. Training then happens outside the lock so other workers can keep
    claiming cells in the meantime.

    Note that ``mask`` is consumed: once every cell has been claimed, calling
    this function again does nothing until ``mask`` is rebuilt.
    """
    for i in range(len(N)):
        for j in range(len(D)):
            # The `with` block releases the lock exactly once, and only the lock
            # this thread acquired. The previous `if lock.locked(): lock.release()`
            # could release a lock that another thread was holding at that moment,
            # because threading.Lock has no notion of an owner.
            with lock:
                claimed = mask[i][j] == 1
                if claimed:
                    mask[i][j] = 0

            if claimed:
                train(N[i], D[j])

In [48]:
# Show the size of both splits before the (long) training runs start.
print(XTrainFinal.shape, ' ', XTestFinal.shape)

# Total number of workers, counting the current thread.
threadNum = 4

# Start threadNum - 1 background workers ...
threads = [threading.Thread(target=threadFunc) for _ in range(threadNum - 1)]
for worker in threads:
    worker.start()

# ... and let the current thread act as the last worker (runs synchronously).
threadFunc()

# Wait until every background worker has finished its share of the grid.
for worker in threads:
    worker.join()

(875186, 16)   (583458, 16)
Starting n=2 d=3
Starting n=2 d=6
Starting n=4 d=2
Starting n=4 d=3
n=2 d=3 t=13.756820917129517s e=550.8763485547739
Starting n=5 d=6
n=4 d=2 t=17.968613862991333s e=556.4648143704406
Starting n=5 d=7
n=2 d=6 t=24.574275970458984s e=537.823595540267
Starting n=10 d=2
n=4 d=3 t=24.696449041366577s e=544.4455334762582
Starting n=10 d=7
n=10 d=2 t=35.70218777656555s e=558.2111541509372
n=5 d=6 t=51.17058515548706s e=529.3804401539129
n=5 d=7 t=55.00074005126953s e=526.7995014695272
