# Import packages

In [44]:
# Database accessibility 
import pyodbc
from dotenv import dotenv_values 

# Analysis libraries
import pandas as pd 
import numpy as np
from sklearn.impute import SimpleImputer

In [45]:
# Load environment variables from .env file into a dictionary
environment_variables = dotenv_values('.env')

# Fail fast when a credential key is absent from '.env'. Without this check,
# `.get` returns None and the text "None" would be interpolated into the
# connection string below.
required_keys = ("database", "server", "user", "password")
missing_keys = [key for key in required_keys if environment_variables.get(key) is None]
if missing_keys:
    raise ValueError(f"Missing key(s) in '.env': {', '.join(missing_keys)}")

# Get the values for the credentials you set in the '.env' file
database = environment_variables.get("database")
server = environment_variables.get("server")
username = environment_variables.get("user")
password = environment_variables.get("password")

# ODBC connection string; the doubled braces render as a literal {SQL Server}.
connection_string = f"DRIVER={{SQL Server}};SERVER={server};DATABASE={database};UID={username};PWD={password}"



In [46]:
# Open the database connection by passing the connection string to pyodbc.connect.
# Connecting to the server might take a few seconds to complete.
# Check your internet connection if it takes more time than necessary.

connection = pyodbc.connect(connection_string)

In [47]:
# Define SQL queries for each table
query1 = 'SELECT * FROM dbo.oil'
query2 = 'SELECT * FROM dbo.holidays_events'
query3 = 'SELECT * FROM dbo.stores'

# Read data from tables into pandas DataFrames.
# NOTE(review): pandas documents SQLAlchemy connectables (and sqlite3) for
# read_sql; passing a raw pyodbc connection may emit a UserWarning.
try:
    oil = pd.read_sql(query1, connection)
    holidays_events = pd.read_sql(query2, connection)
    stores = pd.read_sql(query3, connection)
finally:
    # Close the database connection even if one of the reads above fails,
    # so a failed query does not leave the connection open.
    connection.close()

  oil = pd.read_sql(query1, connection)
  holidays_events = pd.read_sql(query2, connection)
  stores = pd.read_sql(query3, connection)


In [48]:
# Preview the first rows of the oil table.
oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.139999
2,2013-01-03,92.970001
3,2013-01-04,93.120003
4,2013-01-07,93.199997


Rename the second column to daily oil prices.

In [49]:
# Rename the generic 'type' column to 'holiday_type' (the stores table also has
# a 'type' column). Reassigning instead of using inplace=True keeps the cell
# free of in-place mutation; the former leading `.head()` call was removed
# because its result was discarded (only the last expression is displayed).
holidays_events = holidays_events.rename(columns={
    'type': 'holiday_type'
})

# Print the resulting DataFrame
holidays_events

Unnamed: 0,date,holiday_type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False
...,...,...,...,...,...,...
345,2017-12-22,Additional,National,Ecuador,Navidad-3,False
346,2017-12-23,Additional,National,Ecuador,Navidad-2,False
347,2017-12-24,Additional,National,Ecuador,Navidad-1,False
348,2017-12-25,Holiday,National,Ecuador,Navidad,False


Rename the `type` column to `holiday_type` here, and concatenate with oil['dailyoilprices'].

In [7]:
# Preview the first rows of the stores table.
stores.head()

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [None]:
# Load the sample submission file and preview its first five rows.
sample_submission = pd.read_csv(filepath_or_buffer='data/sample_submission.csv')
sample_submission.head(n=5)

This dataset can be discarded, since its sales column has no values.

In [51]:
# Load the per-store daily transaction counts and preview the first five rows.
transactions = pd.read_csv(filepath_or_buffer='data/transactions.csv')
transactions.head(n=5)

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [52]:
# Load the training data and show five randomly chosen rows.
train = pd.read_csv(filepath_or_buffer='data/train.csv')
train.sample(n=5)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
489193,489193,2013-10-02,35,BABY CARE,0.0,0
1508750,1508750,2015-04-29,41,MAGAZINES,4.0,0
2600182,2600182,2017-01-03,16,GROCERY II,5.0,0
2708956,2708956,2017-03-05,18,LADIESWEAR,4.0,0
1005783,1005783,2014-07-20,3,DELI,362.34,0


In [53]:
# Rows of train where sales equals 770 for store 25.
# NOTE(review): presumably checks whether the transactions value 770 for
# store 25 shows up as a sales figure — confirm the intent.
train.loc[train['sales'].eq(770) & train['store_nbr'].eq(25)]

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
250053,250053,2013-05-21,25,GROCERY I,770.0,0
417552,417552,2013-08-23,25,BEVERAGES,770.0,0
631396,631396,2013-12-21,25,CLEANING,770.0,0


...

In [54]:
# Persist each table to the data/ folder as CSV, without the index column.
# NOTE(review): 'data/transactions.csv' is also the file `transactions` was
# read from, so this overwrites the source file — confirm that is intended.
for frame, csv_path in (
    (oil, 'data/oil.csv'),
    (transactions, 'data/transactions.csv'),
    (holidays_events, 'data/holidays_events.csv'),
    (stores, 'data/stores.csv'),
):
    frame.to_csv(csv_path, index=False)

## Join Tables

### Join to display data contained in both dataframes

In [55]:
# Combine transaction counts with the training rows. The join keys and join
# type are spelled out (they were previously implicit: pd.merge defaults to an
# inner join on every column name the two frames share, which here is 'date'
# and 'store_nbr'), so an extra shared column cannot silently change the join.
full_transaction = pd.merge(
    transactions,
    train,
    on=['date', 'store_nbr'],
    how='inner',
)
full_transaction.sample(5)

Unnamed: 0,date,store_nbr,transactions,id,family,sales,onpromotion
60781,2013-02-11,1,396,73090,POULTRY,44.441998,0
943105,2014-09-08,2,2009,1094542,SCHOOL AND OFFICE SUPPLIES,0.0,0
274026,2013-06-30,24,1948,321315,PLAYERS AND ELECTRONICS,0.0,0
1048672,2014-11-13,40,1296,1212913,SCHOOL AND OFFICE SUPPLIES,0.0,0
14292,2013-01-11,19,999,18153,BEVERAGES,1228.0,0


## Join the full transactions based on stores

In [56]:
# Attach the store attributes to every row via an inner join on the store number.
result = full_transaction.merge(stores, how='inner', on='store_nbr')
result.head()


Unnamed: 0,date,store_nbr,transactions,id,family,sales,onpromotion,city,state,type,cluster
0,2013-01-01,25,770,561,AUTOMOTIVE,0.0,0,Salinas,Santa Elena,D,1
1,2013-01-01,25,770,562,BABY CARE,0.0,0,Salinas,Santa Elena,D,1
2,2013-01-01,25,770,563,BEAUTY,2.0,0,Salinas,Santa Elena,D,1
3,2013-01-01,25,770,564,BEVERAGES,810.0,0,Salinas,Santa Elena,D,1
4,2013-01-01,25,770,565,BOOKS,0.0,0,Salinas,Santa Elena,D,1


## Join the full transactions based on oil data for each date

In [57]:
# Add the oil table's columns by date. This is an inner join, so only dates
# present in both frames are kept.
result1 = result.merge(oil, how='inner', on='date')
result1.sample(n=5)


Unnamed: 0,date,store_nbr,transactions,id,family,sales,onpromotion,city,state,type,cluster,dcoilwtico
128977,2013-04-30,50,2295,213556,GROCERY II,35.0,0,Ambato,Tungurahua,A,14,93.220001
175020,2013-06-11,41,712,288078,LINGERIE,11.0,0,Machala,El Oro,D,4,95.5
461155,2014-02-27,37,1516,751225,GROCERY II,53.0,0,Cuenca,Azuay,D,2,102.68
664580,2014-09-02,18,1682,1083779,PET SUPPLIES,3.0,0,Quito,Pichincha,B,16,92.919998
468949,2014-03-06,38,1747,763738,LADIESWEAR,24.0,0,Loja,Loja,D,4,101.82


## Join the full transactions based on holidays

In [58]:
# Add the holiday/event columns by date (inner join: only dates present in
# holidays_events are kept), then renumber the rows from zero.
salesdata = (
    result1
    .merge(holidays_events, how='inner', on='date')
    .reset_index(drop=True)
)
salesdata.head()


Unnamed: 0,date,store_nbr,transactions,id,family,sales,onpromotion,city,state,type,cluster,dcoilwtico,holiday_type,locale,locale_name,description,transferred
0,2013-01-01,25,770,561,AUTOMOTIVE,0.0,0,Salinas,Santa Elena,D,1,,Holiday,National,Ecuador,Primer dia del ano,False
1,2013-01-01,25,770,562,BABY CARE,0.0,0,Salinas,Santa Elena,D,1,,Holiday,National,Ecuador,Primer dia del ano,False
2,2013-01-01,25,770,563,BEAUTY,2.0,0,Salinas,Santa Elena,D,1,,Holiday,National,Ecuador,Primer dia del ano,False
3,2013-01-01,25,770,564,BEVERAGES,810.0,0,Salinas,Santa Elena,D,1,,Holiday,National,Ecuador,Primer dia del ano,False
4,2013-01-01,25,770,565,BOOKS,0.0,0,Salinas,Santa Elena,D,1,,Holiday,National,Ecuador,Primer dia del ano,False


## Drop some columns (id column)

In [59]:
# Remove the 'id' column. Reassigning with errors='ignore' makes the cell safe
# to re-run: the previous in-place drop raised KeyError on a second execution
# because 'id' was already gone.
salesdata = salesdata.drop(columns='id', errors='ignore')

## Rename columns

In [62]:
# Give the store-number and oil-price columns more readable names, then show
# five randomly chosen rows.
salesdata = salesdata.rename(
    columns={'store_nbr': 'store_number', 'dcoilwtico': 'oil_prices'}
)
salesdata.sample(n=5)

Unnamed: 0,date,store_number,transactions,family,sales,onpromotion,city,state,type,cluster,oil_prices,holiday_type,locale,locale_name,description,transferred
105218,2014-12-01,45,3846,HARDWARE,4.0,0,Quito,Pichincha,A,11,68.980003,Event,National,Ecuador,Cyber Monday,False
315414,2017-08-15,6,1589,AUTOMOTIVE,7.0,0,Quito,Pichincha,D,13,47.57,Holiday,Local,Riobamba,Fundacion de Riobamba,False
43817,2013-12-26,19,1427,PET SUPPLIES,0.0,0,Guaranda,Bolivar,C,15,99.18,Additional,National,Ecuador,Navidad+1,False
181548,2015-12-23,39,2383,HOME AND KITCHEN I,54.0,0,Cuenca,Azuay,B,6,36.759998,Additional,National,Ecuador,Navidad-2,False
119412,2014-12-31,43,2312,HOME CARE,312.0,0,Esmeraldas,Esmeraldas,E,10,53.450001,Additional,National,Ecuador,Primer dia del ano-1,False
