In [1]:
import pandas as pd
from pandas.io import sql

Pandas can be used to connect to most relational databases. In this demonstration, we will create and connect to a SQLite database. SQLite creates portable SQL databases saved in a single file. These databases are stored in a very efficient manner and allow fast querying, making them ideal for small databases or databases that need to be moved across machines.

If you are looking to start using a database without the setup of `mysql` or `postgres`, SQLite is a good start.

In [2]:
import sqlite3

In [3]:
conn = sqlite3.connect('dat-test.db')

Let's return to the Rossmann sales data and load that into the database.

Data is moved to the database through the `to_sql` command, similar to the `to_csv` command.

`to_sql` takes as arugments:
    - `name`, the table name to create
    - `con`, a connection to a database
    - `index`, whether to input the index column
    - `schema`, if we want to write a custom schema for the new table
    - `if_exists`, what to do if the table already exists. We can overwrite it, add to it, or fail

In [4]:
data = pd.read_csv('../../../lesson-15/code/dataset/rossmann.csv', low_memory=False)
data.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [5]:
data.to_sql('rossmann_sales',
            con=conn,
            if_exists='replace',
            index=False)

In [6]:
sql.read_sql('select * from rossmann_sales limit 10', con=conn)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1
5,6,5,2015-07-31,5651,589,1,1,0,1
6,7,5,2015-07-31,15344,1414,1,1,0,1
7,8,5,2015-07-31,8492,833,1,1,0,1
8,9,5,2015-07-31,8565,687,1,1,0,1
9,10,5,2015-07-31,7185,681,1,1,0,1


> #### CHECK: Load the Rossmann Store metadata in `rossmann-stores.csv` and create a table into the database from it

Four kinds of queries
    1. select
    2. update
    3. insert
    4. delete

> #### CHECK: (SELECT) Have the students write a query that returns the `Store`, `Date` and `Customers`

In [7]:
sql.read_sql('select store, customers, date from rossmann_sales', con=conn)
# the table will show with the columns in the same order as the query

Unnamed: 0,Store,Customers,Date
0,1,555,2015-07-31
1,2,625,2015-07-31
2,3,821,2015-07-31
3,4,1498,2015-07-31
4,5,559,2015-07-31
5,6,589,2015-07-31
6,7,1414,2015-07-31
7,8,833,2015-07-31
8,9,687,2015-07-31
9,10,681,2015-07-31


> #### CHECK: (WHERE) Have the students write a query that returns the `Store`, `Date` and `Customers` for when the stores were open and running a promotion

In [8]:
sql.read_sql('select store, customers, date from rossmann_sales where open = 1 and promo = 1', con=conn)

Unnamed: 0,Store,Customers,Date
0,1,555,2015-07-31
1,2,625,2015-07-31
2,3,821,2015-07-31
3,4,1498,2015-07-31
4,5,559,2015-07-31
5,6,589,2015-07-31
6,7,1414,2015-07-31
7,8,833,2015-07-31
8,9,687,2015-07-31
9,10,681,2015-07-31


> #### CHECK: (GROUP BY) Have the students write a query that returns the total sales on promotion days.

In [9]:
#groupby
sql.read_sql('select promo, sum(sales), avg(sales) from rossmann_sales where open = 1 group by promo', con=conn)

Unnamed: 0,Promo,sum(sales),avg(sales)
0,0,2771974337,5929.407603
1,1,3101206286,8228.281239


In [10]:
# ordering adding a new 
sql.read_sql('select store, sum(sales) as customers, date from rossmann_sales where open = 1 and promo = 1', con=conn)

Unnamed: 0,Store,customers,Date
0,1115,3101206286,2013-01-07


#### Exercises

1. Load the Walmart sales and store features data
1. Create a table for each of those datasets
1. Select the store, date and fuel price on days it was over 90 degrees
1. Select the store, date and weekly sales and temperature
1. What were average sales on holiday vs. non-holiday sales
1. What were average sales on holiday vs. non-holiday sales when the temperature was below 32 degrees

In [11]:
#Loading csv files into dataframes
dataWMsales = pd.read_csv('../../../lesson-17/code/dataset/walmart-sales.csv', low_memory=False)
dataWMfeatures = pd.read_csv('../../../lesson-17/code/dataset/features.csv', low_memory=False)

In [12]:
# Creating a new 'connection' or database
WMconn = sqlite3.connect('Walmart.db')

In [13]:
#Putting dataWMsales dataframe into db and calling it WMSales 
dataWMsales.to_sql('WMsales',
            con=WMconn,
            if_exists='replace',
            index=False)

In [14]:
#Putting dataWMfeatures dataframe into db and calling it WMfeatures 
dataWMfeatures.to_sql('WMfeatures',
            con=WMconn,
            if_exists='replace',
            index=False)

In [15]:
dataWMsales.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True


In [16]:
dataWMfeatures.head(2)

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True


In [17]:
#Select the store, date and fuel price on days it was over 90 degrees
sql.read_sql('select store, date, fuel_price from WMfeatures where temperature > 90', con=WMconn)


Unnamed: 0,Store,Date,Fuel_Price
0,1,2011-08-05,3.684
1,1,2011-08-12,3.638
2,2,2011-07-29,3.682
3,2,2011-08-05,3.684
4,2,2011-08-12,3.638
5,2,2011-08-26,3.523
6,2,2012-08-03,3.417
7,5,2011-08-05,3.684
8,5,2011-08-12,3.638
9,5,2011-09-02,3.533


SELECT a.Store, a.Sales, s.CompetitionDistance

FROM rossmann_sales a

JOIN rossmann_stores s

ON a.Store = s.Store

In [24]:
print dataWMsales.columns
print dataWMfeatures.columns

Index([u'Store', u'Dept', u'Date', u'Weekly_Sales', u'IsHoliday'], dtype='object')
Index([u'Store', u'Date', u'Temperature', u'Fuel_Price', u'MarkDown1',
       u'MarkDown2', u'MarkDown3', u'MarkDown4', u'MarkDown5', u'CPI',
       u'Unemployment', u'IsHoliday'],
      dtype='object')


In [25]:
#Select the store, date and weekly sales and temperature
sql.read_sql("""
select s.store, s.date, sum(s.weekly_sales), f.temperature 
    from WMsales s 
    join WMfeatures f 
    on s.store = f.store and s.date = f.date 
    group by s.store 
    LIMIT 10""", con=WMconn)

Unnamed: 0,Store,Date,sum(s.weekly_sales),Temperature
0,1,2012-10-12,222402800.0,62.99
1,2,2012-10-12,275382400.0,60.97
2,3,2012-08-03,57586740.0,86.55
3,4,2012-10-12,299544000.0,57.11
4,5,2012-08-10,45475690.0,86.96
5,6,2012-10-12,223756100.0,65.43
6,7,2012-05-11,81598280.0,48.54
7,8,2012-10-26,129951200.0,64.74
8,9,2012-05-18,77789220.0,69.52
9,10,2012-10-26,271617700.0,70.5


In [27]:
#What were average sales on holiday vs. non-holiday sales
sql.read_sql('select isholiday, avg(weekly_sales) from WMsales group by isholiday',con=WMconn)

Unnamed: 0,IsHoliday,avg(weekly_sales)
0,0,15901.445069
1,1,17035.823187


In [31]:
#    What were average sales on holiday vs. non-holiday sales when the temperature was below 32 degrees

sql.read_sql("""
SELECT avg(s.weekly_sales), s.isholiday 
from WMsales s
join wmfeatures f
on s.store = f.store and s.date = f.date
where f.temperature <32
group by s.isholiday""",
con=WMconn)

Unnamed: 0,avg(s.weekly_sales),IsHoliday
0,15275.770307,0
1,15111.471238,1
