In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

# Load the labelled sales history that the model is trained on.
sales_data = pd.read_csv("./data/sales.csv")

# Replace the state_holiday categories with integer codes; `le` keeps the
# fitted category-to-code mapping.
le = LabelEncoder()
sales_data["state_holiday"] = le.fit_transform(sales_data["state_holiday"])

# Swap the raw date for numeric month/year columns, then give the first
# (unnamed) column of the file the name "index".
sale_dates = pd.to_datetime(sales_data["date"])
sales_data = (
    sales_data
    .assign(month=sale_dates.dt.month, year=sale_dates.dt.year)
    .drop(columns=["date"])
    .pipe(lambda frame: frame.rename(columns={frame.columns[0]: 'index'}))
)

# Target is the sales column; every remaining column is used as a feature.
y = sales_data["sales"]
X = sales_data.drop(columns=["sales"])

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit a gradient-boosted tree regressor (XGBoost). The variable keeps the name
# rf_model, although the estimator is not a random forest.
xgb_params = dict(
    n_estimators=500,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=1.0,
    gamma=0,
    reg_alpha=0,
    reg_lambda=1,
    n_jobs=-1,
)
rf_model = XGBRegressor(**xgb_params)
rf_model.fit(X_train, y_train)

# Evaluate on the held-out rows with the R^2 score.
y_test_pred_rf = rf_model.predict(X_test)
r2_score_test_rf = r2_score(y_test, y_test_pred_rf)
print(r2_score_test_rf)

sales_data.head()



0.9448134303092957


Unnamed: 0,index,store_ID,day_of_week,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales,month,year
0,425390,366,4,517,1,0,0,0,4422,4,2013
1,291687,394,6,694,1,0,0,0,8297,4,2015
2,411278,807,4,970,1,1,0,0,9729,8,2013
3,664714,802,2,473,1,1,0,0,6513,5,2013
4,540835,726,4,1068,1,1,0,0,10882,10,2013


In [None]:
# Load the unlabelled data set for which sales have to be predicted.
real_data = pd.read_csv("./data/Real_Data.csv")

# Encode state_holiday with the mapping learned on the training data (`le` is
# the encoder fitted in the training cell). Fitting a fresh LabelEncoder here
# would derive the codes from whichever categories happen to occur in this
# file, so the same category could receive a different code than the one the
# model was trained on.
# Values are compared as strings so that a column pandas parsed as numbers
# still matches categories that were read as text during training.
holiday_codes = {str(label): code for code, label in enumerate(le.classes_)}
encoded_holiday = real_data["state_holiday"].astype(str).map(holiday_codes)
unseen_mask = encoded_holiday.isna()
if unseen_mask.any():
    # Fail loudly instead of feeding the model codes it has never seen.
    unseen_values = sorted(real_data.loc[unseen_mask, "state_holiday"].astype(str).unique())
    raise ValueError(
        f"state_holiday contains values not present in the training data: {unseen_values}"
    )
real_data["state_holiday"] = encoded_holiday.astype("int64")

# Convert the date column (day/month/year text) into numeric month and year
# columns, mirroring the features built for training.
real_data["date"] = pd.to_datetime(real_data["date"], format="%d/%m/%Y")
real_data["month"] = real_data["date"].dt.month
real_data["year"] = real_data["date"].dt.year
real_data = real_data.drop(columns=["date"])

# Name the first (unnamed) column "index", as done for the training frame.
real_data = real_data.rename(columns={real_data.columns[0]: 'index'})

# Predict sales with the model trained in the previous cell.
real_data_pred_rf = rf_model.predict(real_data)
print(real_data_pred_rf)

# Store the predictions as the sales column and export the result.
real_data["sales"] = real_data_pred_rf
real_data.to_csv("G2.csv")

# Last expression of the cell, so the preview is actually rendered.
real_data.head()


[ 127.04358    12.424799 6551.721    ... 5304.588    6905.9375
 7902.824   ]
