In [2]:
import pandas as pd
import numpy as np
import time
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor

In [3]:
# Read train.csv with the C parser, memory-mapping the file while parsing.
dfValue = pd.read_csv(
    'train.csv',
    engine='c',
    memory_map=True,
)

In [4]:
# Read test.csv with the same reader options as the training file.
dfTest = pd.read_csv(
    'test.csv',
    engine='c',
    memory_map=True,
)

In [5]:
# Features: every column of the training frame except the target.
X = dfValue.loc[:, ~dfValue.columns.isin(['trip_duration'])]
# Target: the trip duration column on its own.
Y = dfValue.trip_duration

In [6]:
pickupDatetime  = pd.to_datetime(X.pickup_datetime)
dropoffDatetime = pd.to_datetime(X.dropoff_datetime)

# The identifier and the raw timestamps are replaced by the numeric date parts below.
XDrop = X.drop(['id','pickup_datetime','dropoff_datetime'],axis=1)


def _weekOfYear(datetimes):
    """Return the ISO week number of each timestamp in a datetime Series.

    `Series.dt.weekofyear` is deprecated and absent from recent pandas
    releases, while `Series.dt.isocalendar()` does not exist in old ones, so
    the newer accessor is used when available and the old one otherwise.

    Parameters
    ----------
    datetimes : pandas.Series
        Series of datetime64 values (assumed to contain no NaT).

    Returns
    -------
    pandas.Series
        Week numbers; cast to int64 on the `isocalendar` path so the column
        has a plain numpy integer dtype.
    """
    accessor = datetimes.dt
    if hasattr(accessor, 'isocalendar'):
        return accessor.isocalendar().week.astype('int64')
    return accessor.weekofyear


# Rebuild pickup columns by splitting the date into its components
XDrop['pickupHour'] = pickupDatetime.dt.hour
XDrop['pickupDayOfWeek'] = pickupDatetime.dt.dayofweek
XDrop['pickupWeekOfYear'] = _weekOfYear(pickupDatetime)
XDrop['pickupDayOfYear'] = pickupDatetime.dt.dayofyear

# Same for the dropoff
# NOTE(review): the target is trip_duration, which is presumably dropoff - pickup.
# If so, these dropoff-derived features leak the target, and they can only be
# computed at prediction time if the data to predict on also has a
# dropoff_datetime column — confirm before trusting the scores.
XDrop['dropoffHour'] = dropoffDatetime.dt.hour
XDrop['dropoffDayOfWeek'] = dropoffDatetime.dt.dayofweek
XDrop['dropoffWeekOfYear'] = _weekOfYear(dropoffDatetime)
XDrop['dropoffDayOfYear'] = dropoffDatetime.dt.dayofyear

# 60 % of the rows go to training, the rest is held out; the seed fixes the split.
XTrain, XTest, YTrain, Ytest = train_test_split(XDrop,Y,train_size=0.6,random_state=19061996)

In [7]:
#pipeline = Pipeline(steps=[('preprocessor', preprocessor),('model', model)]).fit(XTrain,YTrain)


## PREPROCESSING LABEL ENCODER ##
# (disabled alternative to the one-hot encoding used below)
# labelEncoder = LabelEncoder()
# XTrainFinal = XTrain.copy()
# XTestFinal = XTest.copy()

# XTrainFinal.store_and_fwd_flag = labelEncoder.fit_transform(XTrain.store_and_fwd_flag)
# XTestFinal.store_and_fwd_flag = labelEncoder.transform(XTest.store_and_fwd_flag)

## PREPROCESSING ONE HOT ENCODER
# The keyword that requests a dense array was renamed from `sparse` to
# `sparse_output` in newer scikit-learn releases: try the current spelling
# first and fall back to the old one, so the encoder is dense either way.
try:
    OHEncoder = OneHotEncoder(sparse_output=False)
except TypeError:
    OHEncoder = OneHotEncoder(sparse=False)

# Fit on the training split only, then apply the learned mapping to the test split.
OHValuesTrain = OHEncoder.fit_transform(XTrain[['store_and_fwd_flag']])
OHValuesTest  = OHEncoder.transform(XTest[['store_and_fwd_flag']])

# Label the encoded columns after the learned categories instead of 0, 1, ...:
# integer labels next to the string labels of the other columns give a frame
# with mixed label types, which recent scikit-learn releases refuse to fit on.
OHColumnNames = ['store_and_fwd_flag_{}'.format(category) for category in OHEncoder.categories_[0]]

# Reuse the row index of each split so the concatenation below aligns row by row.
OHColumnsTrain = pd.DataFrame(OHValuesTrain, index=XTrain.index, columns=OHColumnNames)
OHColumnsTest  = pd.DataFrame(OHValuesTest, index=XTest.index, columns=OHColumnNames)

# Drop the raw categorical column, now replaced by its encoded columns.
XTrainCategoricalRemoved = XTrain.loc[:, XTrain.columns != 'store_and_fwd_flag']
XTestCategoricalRemoved = XTest.loc[:, XTest.columns != 'store_and_fwd_flag']

XTrainFinal = pd.concat([XTrainCategoricalRemoved, OHColumnsTrain], axis=1)
XTestFinal = pd.concat([XTestCategoricalRemoved, OHColumnsTest], axis=1)

In [8]:
#n_estimators=150 max_depth=10 --> best results for 10_000 rows with OHEncoder
def train (nList, dList):
    """Fit and score one random forest per (n_estimators, max_depth) pair.

    For every n in nList and every d in dList, a RandomForestRegressor is
    fitted on the notebook-level XTrainFinal / YTrain and scored with the mean
    absolute error of its predictions on XTestFinal against Ytest. One
    "Starting" line and one result line are printed per pair, and an empty
    line once all depths of a given n are done.

    Parameters
    ----------
    nList : sequence of int
        Values tried for n_estimators.
    dList : sequence of int
        Values tried for max_depth; iterated again for each n.

    Returns
    -------
    None
    """
    for estimatorCount in nList:
        for depth in dList:
            # The newline is part of the message and end='' is used, presumably
            # so each message is emitted as one piece when several callers print.
            print("Starting n={} d={}\n".format(estimatorCount, depth), end='')

            forest = RandomForestRegressor(
                n_estimators=estimatorCount,
                max_depth=depth,
                random_state=19061996,
            )

            # Only the fit is timed; prediction and scoring are excluded.
            fitBegan = time.time()
            forest.fit(XTrainFinal, YTrain)
            fitSeconds = time.time() - fitBegan

            error = mean_absolute_error(Ytest, forest.predict(XTestFinal))

            print("n={} d={} t={}s e={}\n".format(estimatorCount, depth, fitSeconds, error), end='')
        print('')

In [9]:
import threading # Trainings are getting long, so it is time to move to threads
print(XTrainFinal.shape, ' ', XTestFinal.shape)

threadNum = 2           # number of workers, the calling thread included
N = [10,11,12]          # n_estimators values to try
D = [10,15,17,20,30]    # max_depth values to try

for n in N:
    # Deal the depths out round-robin: worker i gets D[i], D[i+threadNum], ...
    threadArgs = [D[i::threadNum] for i in range(threadNum)]

    # Every share but the last one runs on its own background thread.
    threadList = [threading.Thread(target=train,args=([n],share)) for share in threadArgs[:-1]]
    for t in threadList:
        t.start()

    # The last share runs on the calling thread. It is a plain call rather than
    # Thread.run() on an unstarted Thread kept in threadList: joining a thread
    # that was never started raises RuntimeError.
    train([n], threadArgs[-1])

    # Wait for the background workers before moving on to the next n.
    for t in threadList:
        t.join()


(875186, 16)   (583458, 16)
Starting n=10 d=10
Starting n=10 d=15
n=10 d=10 t=93.8547751903534s e=500.7646538195638
Starting n=10 d=17
n=10 d=15 t=139.27649903297424s e=456.1972427538797
Starting n=10 d=20
n=10 d=17 t=158.70106482505798s e=444.2230268258811
Starting n=10 d=30
n=10 d=20 t=179.82802605628967s e=428.9234625010339



KeyboardInterrupt: 