In [2]:
import pandas as pd
import numpy as np


"""Loading the dataset in normal format takes up a lot of space approx. 1.4 GB when combined, This function changes
the datatypes of all columns to reduce memory usage.

As the dataset is really big effective memory management is required to work with it so as to save time and resources.

"""
def reduce_mem_usage(df):
    """ 
    iterate through all the columns of a dataframe and 
    modify the data type to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage of dataframe is {:.2f}' 
                     'MB').format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max <\
                  np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max <\
                   np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max <\
                   np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max <\
                   np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max <\
                   np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max <\
                   np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')  
        end_mem = df.memory_usage().sum() / 1024**2
        print(('Memory usage after optimization is: {:.2f}' 
                              'MB').format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) 
                                             / start_mem))
    
    return df










dataset=pd.read_csv("2015_sales_data.csv") 
dataset=reduce_mem_usage(dataset)

dataset1=pd.read_csv("2016_sales_data.csv") 
dataset1=reduce_mem_usage(dataset1)

dataset2=pd.read_csv("2017_sales_data.csv") 
dataset2=reduce_mem_usage(dataset2)

dataset3=pd.read_csv("2018_sales_data.csv") 

dataset3=reduce_mem_usage(dataset3)






dataset=dataset.append(dataset1)
dataset=dataset.append(dataset2)
dataset=dataset.append(dataset3)


"""
Extracting year ,month and day from the date column and also a feature day_of_week,
which is later used to create a feature is_weekend
"""



dataset['year'] = pd.DatetimeIndex(dataset['date']).year
dataset['month'] = pd.DatetimeIndex(dataset['date']).month
dataset['day'] = pd.DatetimeIndex(dataset['date']).day
import datetime as dt
dataset['date']= pd.to_datetime(dataset['date']) 
dataset['day_of_week'] = dataset.date.dt.dayofweek


del dataset['date']




"""
Creating a features is_weekend which is a categorical feature that tells whether its a weekend or not.
The EDA shows that the footfall and sales increases during weekend and to capture that A feature is_weekend is created.


Not creating categorical feature of products for If the product ID was assigned sequentially so the
latest product added would have the largest product ID.

This is just a Hypothesis as I do not know how the product IDs were assigned.



"""



def getNumber(x):
    if x>=5:
        return 1
    else:
        return 0
dataset["is_weekend"]=dataset["day_of_week"].apply(getNumber)




"""
The EDA shows that the footfall increases during weekend and during certain period of time every year,
such as christmas. Normally a Feature would be created for festivals or other such periods, but I plan on using 
a Tree based model. The advantage of a Tree based model is that it automatically calculates these things and
create the splits for the tree automatically.

This can be verified by plotting the trees using graphviz library.

This is why the day,year,month,day_of_week have been extracted from the columns so the tree algorithm can split
based on these features.

"""







"""
Saving the data as a csv,
but also as a pickle file so that loading it again is faster and the dtypes of the columns are stored.

"""













dataset.to_csv("dataset.csv",index=False)
dataset.to_pickle("dataset.pkl")






Memory usage of dataframe is 140.64MB
Memory usage after optimization is: 114.28MB
Decreased by 18.7%
Memory usage after optimization is: 83.52MB
Decreased by 40.6%
Memory usage after optimization is: 57.15MB
Decreased by 59.4%
Memory usage after optimization is: 30.78MB
Decreased by 78.1%
Memory usage of dataframe is 191.46MB
Memory usage after optimization is: 155.58MB
Decreased by 18.7%
Memory usage after optimization is: 113.69MB
Decreased by 40.6%
Memory usage after optimization is: 77.79MB
Decreased by 59.4%
Memory usage after optimization is: 41.90MB
Decreased by 78.1%
Memory usage of dataframe is 223.69MB
Memory usage after optimization is: 181.76MB
Decreased by 18.7%
Memory usage after optimization is: 132.83MB
Decreased by 40.6%
Memory usage after optimization is: 90.89MB
Decreased by 59.4%
Memory usage after optimization is: 48.95MB
Decreased by 78.1%
Memory usage of dataframe is 89.22MB
Memory usage after optimization is: 69.71MB
Decreased by 21.9%
Memory usage after optimi

In [None]:

"""
also modifying test data for the features added.
"""





test=pd.read_csv("test_data.csv") 
test=reduce_mem_usage(test)




test['year'] = pd.DatetimeIndex(test['date']).year
test['month'] = pd.DatetimeIndex(test['date']).month
test['day'] = pd.DatetimeIndex(test['date']).day
test['date']= pd.to_datetime(test['date']) 
test['day_of_week'] = test.date.dt.dayofweek


del test['date']



test["is_weekend"]=test["day_of_week"].apply(getNumber)








test.to_pickle("test.pkl")






In [None]:
"""
While the dataset of 2015 and 2016 has been useful for getting insights like increase in footfall during certain
specific periods of the year and also when discount has been offered.

The data of 2017 is more likely to predict the sales of 2018 better than 2015 and 2016.

Another way would be to create 2 models one for data>=2017 and one for data<=2017 and 
assign weights to their prediction like

final_prediction=0.7*pred_2017+0.3*pred_2015_16


but that would consume a more time and memory

So after generating insights from all the data, working the recent data i.e. 2017 and above seemed best for creating
a baseline model.

"""




dataset=dataset[dataset['year']>=2017]


"""
Even after discarding the 2015 and 2016 data, the dataset was still very heavy and computation was taking a lot of time.
It was running out of memory on some models too. One solution to this was using dask instead of pandas and dask_ml
instead of sklearn to load and train models. It would solve the out of memory issue but would still take time.

another was to implement incremental learning,but that would still consume time while training.

The best solution given the time limit was to take a sample of the dataset and test the hypothesis on the sample.

I took 20% of the dataset as the sample and implemented few algorithms.

"""




dataset=dataset.sample(frac=0.2,random_state=1)

In [None]:
"""
I tried various model like decision tree,Xgboost,LightGBM, sklearn's implementation of GBM
and the RMSE error was varying from 70 to 108.

Finally after training random forest I got rmse score of 38 for n_estimators=10. 

I did manual Hyperparameter tuning for Random forest instead of using gridsearch because as the n_estimators increase
the training time was greatly increasing.


At last I found that at n_estimators=100, with no max_depth, 
the RF was performing good and giving an RMSE between 13 to 16.

"""

dataset_test=dataset['sales']
del dataset['sales']
dataset_train=dataset





from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


X_train, X_test, Y_train, Y_test = train_test_split(dataset_train, dataset_test, test_size = 0.25, random_state = 0)


from sklearn import ensemble


"""
n_jobs=3 parameter builds the trees Using backend ThreadingBackend with 3 concurrent workers.
This decreases the training time.

verbose=3 prints information while training like which tree is being built, so we can calculate how much time is left.

"""



reg=ensemble.RandomForestRegressor(n_jobs=3,verbose=3)

print('='*30)
reg.fit(X_train,Y_train)
name=reg.__class__.__name__
print('='*30)
print(name)






In [None]:
predictions=reg.predict(X_test)
rmse=np.sqrt(mean_squared_error(Y_test,predictions))
print("RMSError is: {}".format(rmse))


"""
Calculating the RMSE error on the test data in order to prevent leaderboard overfitting.
Normally calculating a CV score would be a better option but the algorithm was taking too long to train, so i 
decided to go with 25% test data.


"""




In [None]:
"""
reading the test pickle file which i had saved earlier after feature engineering.
Reading the sample submission file and adding the prediction, converting it into int.
saving the Dataframe.

"""






test=pd.read_pickle("test.pkl") 
pred=reg.predict(test)


submit=pd.read_csv("sample_submission.csv")
submit['sales']=pred






c=[]
for i in submit['sales']:
    c.append(int(i))
    
submit['sales']=c

submit.to_csv('rforest_100estimator_dummy_2017[0.2]data.csv',index=False)