In [None]:
#@title libraries
import os
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.feature_selection import f_classif
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [None]:
#@title categories encoder
def factor_to_integer(df, colname, start_value=0):
    """Encode a text/object column of ``df`` in place as integer codes.

    Every distinct value of ``df[colname]`` receives one code. Codes are
    consecutive integers starting at ``start_value`` and are handed out in
    order of first appearance in the column. A column that is neither of
    object nor of string dtype (for example one that was already encoded)
    is left untouched, so calling the function twice is harmless.

    All codes are computed from the original values in a single pass. The
    column is never compared against values that were already replaced, so
    an original value that happens to equal an assigned code (e.g. an object
    column holding the integers 1 and 0) cannot be merged with another
    category.

    Args:
        df: DataFrame that is modified in place.
        colname: Name of the column to encode.
        start_value: Code given to the first distinct value encountered.

    Raises:
        ValueError: If the column contains missing values. They cannot be
            represented as an integer code; fill them before encoding.
    """
    column = df[colname]
    is_text = column.dtype == object or pd.api.types.is_string_dtype(column.dtype)
    if not is_text:
        return

    # factorize numbers the distinct values by order of first appearance and
    # marks missing values with -1.
    codes, _ = pd.factorize(column)
    if (codes < 0).any():
        raise ValueError(
            "column %r contains missing values; fill them before encoding" % (colname,)
        )
    df[colname] = (codes + start_value).astype(int)

In [None]:
#@title data collection

# Options shared by the two daily files (train / test): parse the 'Date'
# column as dates and read both holiday columns as text.
daily_csv_options = dict(
    parse_dates=['Date'],
    dtype={'StateHoliday': str, 'SchoolHoliday': str},
    low_memory=False,
)

data = pd.read_csv("/content/drive/MyDrive/AppMachine/train.csv", **daily_csv_options)
data2 = pd.read_csv(
    "/content/drive/MyDrive/AppMachine/store.csv",
    dtype={'StoreType': str, 'Assortment': str},
    low_memory=False,
)
data3 = pd.read_csv("/content/drive/MyDrive/AppMachine/test.csv", **daily_csv_options)

# DC and DC2 both hold the train rows left-joined with the store table on 'Store'
DC = data.merge(data2, how="left", on=["Store"])
DC2 = data.merge(data2, how="left", on=["Store"])

# Same join for the test rows
test = data3.merge(data2, how="left", on=["Store"])

In [None]:
# Display the column names of DC (train rows joined with the store table).
DC.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval', 'Year', 'Month'],
      dtype='object')

In [None]:
# Display the column names of test (test rows joined with the store table).
test.columns

Index(['Id', 'Store', 'DayOfWeek', 'Date', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval'],
      dtype='object')

In [None]:
# Split the date into separate year and month columns
train_dates = pd.DatetimeIndex(DC['Date'])
DC['Year'] = train_dates.year
DC['Month'] = train_dates.month


# Number of missing values per column
DC.isna().sum()

Store                             0
DayOfWeek                         0
Date                              0
Sales                             0
Customers                         0
Open                              0
Promo                             0
StateHoliday                      0
SchoolHoliday                     0
StoreType                         0
Assortment                        0
CompetitionDistance            2642
CompetitionOpenSinceMonth    323348
CompetitionOpenSinceYear     323348
Promo2                            0
Promo2SinceWeek              508031
Promo2SinceYear              508031
PromoInterval                508031
Year                              0
Month                             0
dtype: int64

In [None]:
#@title data preprocessing

# Rows without a competitor distance get the sentinel value -1.
# The filled column is assigned back to DC: calling fillna(inplace=True) on
# DC['CompetitionDistance'] acts on a selected column, and under pandas
# Copy-on-Write that call leaves DC itself unchanged.
DC['CompetitionDistance'] = DC['CompetitionDistance'].fillna(-1)

# Replace the text categories by integer codes (done in place on DC)
factor_to_integer(DC, 'StateHoliday')
factor_to_integer(DC, 'Assortment')
factor_to_integer(DC, 'StoreType')

# Features for the Customers model: drop both targets, the raw date and the
# columns that are not used as predictors.
X = DC.drop(['Sales','Customers','Date','SchoolHoliday','CompetitionOpenSinceYear','CompetitionOpenSinceMonth','Promo2','Promo2SinceWeek','Promo2SinceYear','PromoInterval'], axis=1)
y = DC['Customers']

print(X.columns)


Index(['Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'StoreType',
       'Assortment', 'CompetitionDistance', 'Year', 'Month'],
      dtype='object')


In [None]:
#@title Random Forest -- predict Customers
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

# Build a random forest of 100 trees and fit it on the training split
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf.fit(X_train, y_train)

# Predict on the validation split, rounded to whole numbers
predictions = rf.predict(X_val).round()

# Mean absolute error between the predicted and the actual Customers.
# The value is a number of customers, so it is labelled as such.
erreurs = abs(predictions - y_val)
print('Mean Absolute Error:', round(np.mean(erreurs), 2), 'customers.')

In [None]:
# Print the column names of DC2.
print(DC2.columns)

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval'],
      dtype='object')


In [None]:
#@title predict sales
# Work on a copy of DC and (re)derive the year and month from the date
DC2 = DC.copy()
sales_dates = pd.DatetimeIndex(DC2['Date'])
DC2['Year'] = sales_dates.year
DC2['Month'] = sales_dates.month

# Features for the Sales model: everything except the target, the raw date
# and the columns that are not used as predictors ('Customers' is kept).
sales_excluded_columns = [
    'Sales', 'Date', 'SchoolHoliday',
    'CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth',
    'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval',
]
X2 = DC2.drop(columns=sales_excluded_columns)
y2 = DC2['Sales']
print(X2.columns)


Index(['Store', 'DayOfWeek', 'Customers', 'Open', 'Promo', 'StateHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance', 'Year', 'Month'],
      dtype='object')


In [None]:
# Disabled experiments, kept for reference: integer casts of two columns and
# re-encoding of the categorical columns directly on X2.
#X2['CompetitionDistance']=X2.CompetitionDistance.values.astype(np.int64)
#X2['Promo2SinceWeek']=X2.Promo2SinceWeek.values.astype(np.int64)
#factor_to_integer(X2, 'StateHoliday')
#factor_to_integer(X2, 'StoreType')
# Print the dtype of every feature column of X2.
print(X2.dtypes)

Store                    int64
DayOfWeek                int64
Customers                int64
Open                     int64
Promo                    int64
StateHoliday             int64
StoreType                int64
Assortment               int64
CompetitionDistance    float64
Year                     int64
Month                    int64
dtype: object


In [None]:
#@title predict Sales
X2_train, X2_val, y2_train, y2_val = train_test_split(X2, y2, random_state=42)

# Build a random forest of 100 trees and fit it on the training split
rf2 = RandomForestRegressor(n_estimators = 100 ,random_state = 42)
rf2.fit(X2_train, y2_train)

# Predict on the validation split
predictions2 = rf2.predict(X2_val)

# Mean absolute error between the predicted and the actual Sales.
# The value is expressed in the unit of the Sales column, not in degrees.
erreurs2 = abs(predictions2 - y2_val)
print('Mean Absolute Error:', round(np.mean(erreurs2), 2), 'sales units.')

Mean Absolute Error: 284.24 degrees.


In [None]:
import math
from sklearn import metrics
from sklearn.metrics import mean_squared_error

# Root mean squared error of the Sales model on the validation split
predictions = rf2.predict(X2_val)
validation_mse = mean_squared_error(y2_val, predictions)
print(math.sqrt(validation_mse))

474.69249744637625


In [None]:
# Split the test dates into separate year and month columns
test_dates = pd.DatetimeIndex(test['Date'])
test['Year'] = test_dates.year
test['Month'] = test_dates.month
print(test.columns)

Index(['Id', 'Store', 'DayOfWeek', 'Date', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'Year', 'Month'],
      dtype='object')


In [None]:
#@title Test dataset

# Fill the missing values: median for Open, Promo and DayOfWeek, sentinel -1
# for CompetitionDistance.
# Each filled column is assigned back to test: calling fillna(inplace=True)
# on test[...] acts on a selected column, and under pandas Copy-on-Write that
# call leaves test itself unchanged.
test['Open'] = test['Open'].fillna(test['Open'].median())
test['Promo'] = test['Promo'].fillna(test['Promo'].median())
test['CompetitionDistance'] = test['CompetitionDistance'].fillna(-1)
test['DayOfWeek'] = test['DayOfWeek'].fillna(test['DayOfWeek'].median())

# Replace the text categories by integer codes (done in place on test).
# NOTE(review): factor_to_integer numbers the categories by order of first
# appearance in the frame it receives, so these codes only agree with the
# training codes if the categories appear in the same order here - confirm.
factor_to_integer(test, 'StateHoliday')
factor_to_integer(test, 'Assortment')
factor_to_integer(test, 'StoreType')

# Keep only the predictor columns
XX = test.drop(['Id','Date','Promo2SinceWeek','SchoolHoliday','Promo2','CompetitionOpenSinceMonth','CompetitionOpenSinceYear','Promo2SinceYear','PromoInterval'], axis=1)


In [None]:
# Bare expression that is not the last statement of the cell: it is evaluated
# but the notebook does not display it.
XX.columns


# Print the dtype of every test feature column.
print(XX.dtypes)

Store                    int64
DayOfWeek                int64
Open                   float64
Promo                    int64
StateHoliday             int64
StoreType                int64
Assortment               int64
CompetitionDistance    float64
Year                     int64
Month                    int64
dtype: object


In [None]:
# Predict the number of customers for the test rows, rounded to whole numbers.
# Only the columns the Customers model was trained on (X.columns) are passed,
# in the training order, so the cell can be re-run after XX has gained the
# 'Customers' column without handing the model an extra feature.
XX['Customers'] = rf.predict(XX[X.columns]).round()


In [None]:
# Display the first rows of XX, now including the predicted 'Customers'.
XX.head()

Unnamed: 0,Store,DayOfWeek,Open,Promo,StateHoliday,StoreType,Assortment,CompetitionDistance,Year,Month,Customers
0,1,4,1.0,1,0,0,0,1270.0,2015,9,510.0
1,3,4,1.0,1,0,1,0,14130.0,2015,9,789.0
2,7,4,1.0,1,0,1,1,24000.0,2015,9,1000.0
3,8,4,1.0,1,0,1,0,7520.0,2015,9,924.0
4,9,4,1.0,1,0,1,1,2030.0,2015,9,647.0


In [None]:
# Display the feature columns (and their order) of the Sales training split.
X2_train.columns

Index(['Store', 'DayOfWeek', 'Customers', 'Open', 'Promo', 'StateHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance', 'Year', 'Month'],
      dtype='object')

In [None]:
# Put the test features in the column order of the Sales training split.
# .copy() makes XX an independent frame, so adding a column to it afterwards
# does not raise pandas' SettingWithCopyWarning.
XX = XX[X2_train.columns].copy()
print (XX.columns)

Index(['Store', 'DayOfWeek', 'Customers', 'Open', 'Promo', 'StateHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance', 'Year', 'Month'],
      dtype='object')


In [None]:
# Predict Sales for the test rows.
# Only the columns the Sales model was trained on (X2_train.columns) are
# passed, so the cell can be re-run after XX has gained the 'Sales' column
# without handing the model an extra feature.
XX['Sales'] = rf2.predict(XX[X2_train.columns])

In [None]:
# Display the first rows of XX, now including the predicted 'Sales'.
XX.head()

Unnamed: 0,Store,DayOfWeek,Customers,Open,Promo,StateHoliday,StoreType,Assortment,CompetitionDistance,Year,Month,Sales
0,1,4,510.0,1.0,1,0,0,0,1270.0,2015,9,4510.74
1,3,4,789.0,1.0,1,0,1,0,14130.0,2015,9,7808.11
2,7,4,1000.0,1.0,1,0,1,1,24000.0,2015,9,9829.67
3,8,4,924.0,1.0,1,0,1,0,7520.0,2015,9,8397.12
4,9,4,647.0,1.0,1,0,1,1,2030.0,2015,9,7575.26


In [None]:
# Move the index of the predicted Sales into an 'index' column, then shift
# that column by one so that it starts one higher than the original index.
result = XX['Sales'].reset_index()
result['index'] += 1

In [None]:
# Display the first rows of the result frame ('index' and 'Sales' columns).
result.head()

Unnamed: 0,index,Sales
0,1,4510.74
1,2,7808.11
2,3,9829.67
3,4,8397.12
4,5,7575.26


In [None]:
# Write the result frame to a CSV file; index=False leaves out the row labels.
result.to_csv('/content/drive/MyDrive/AppMachine/result.csv',index=False)