# The sqlite3 package

In [1]:
import sqlite3
sqlite_db = 'test_db.sqlite'
conn = sqlite3.connect(sqlite_db)
c = conn.cursor()

In [2]:
c.execute('CREATE TABLE houses (field1 INTEGER PRIMARY KEY, sqft INTEGER, bdrms INTEGER, age INTEGER, price INTEGER);')

# Save (commit) the changes
conn.commit()

In [3]:
last_sale = (None, 4000, 5, 22, 619000)
c.execute('INSERT INTO houses VALUES (?,?,?,?,?)',last_sale)

# Remember to commit the changes
conn.commit()

In [4]:
recent_sales = [
  (None, 2390, 4, 34, 319000),
  (None, 1870, 3, 14, 289000),
  (None, 1505, 3, 90, 269000),
]

c.executemany('INSERT INTO houses VALUES (?, ?, ?, ?, ?)', recent_sales)

conn.commit()

In [5]:
from numpy import genfromtxt

#import into nparray of ints, then convert to list of lists
data = (genfromtxt('CSV/housing-data.csv', dtype='i8',
                    delimiter=',', skip_header=1)).tolist()

# append a None value to beginning of each sub-list
for d in data:
    d.insert(0, None)

In [6]:
data[0:3]

[[None, 2104, 3, 70, 399900],
 [None, 1600, 3, 28, 329900],
 [None, 2400, 3, 44, 369000]]

In [7]:
# loop through the data, running an INSERT on each record (i.e. sublist)
for d in data:
    c.execute('INSERT INTO houses VALUES (?,?,?,?,?)', d)

conn.commit()

In [8]:
results = c.execute("SELECT * FROM houses WHERE bdrms = 4")

results.fetchall()

[(2, 2390, 4, 34, 319000),
 (9, 3000, 4, 75, 539900),
 (10, 1985, 4, 61, 299900),
 (15, 1940, 4, 7, 239999),
 (20, 2300, 4, 77, 449900),
 (23, 2609, 4, 5, 499998),
 (24, 3031, 4, 21, 599000),
 (28, 1962, 4, 53, 259900),
 (37, 2040, 4, 75, 314900),
 (39, 1811, 4, 24, 285900),
 (42, 2132, 4, 28, 345000),
 (43, 4215, 4, 66, 549000),
 (44, 2162, 4, 43, 287000),
 (47, 2567, 4, 57, 314000),
 (50, 1852, 4, 64, 299900)]

# Pandas connector

In [9]:
import pandas as pd
from pandas.io import sql

In [10]:
data = pd.read_csv('CSV/housing-data.csv', low_memory=False)
data.head()

Unnamed: 0,sqft,bdrms,age,price
0,2104,3,70,399900
1,1600,3,28,329900
2,2400,3,44,369000
3,1416,2,49,232000
4,3000,4,75,539900


In [11]:
data.to_sql('houses_pandas',
            con=conn,
            if_exists='replace',
            index=False)

In [12]:
sql.read_sql('SELECT * FROM houses_pandas limit 10', con=conn)

Unnamed: 0,sqft,bdrms,age,price
0,2104,3,70,399900
1,1600,3,28,329900
2,2400,3,44,369000
3,1416,2,49,232000
4,3000,4,75,539900
5,1985,4,61,299900
6,1534,3,12,314900
7,1427,3,57,198999
8,1380,3,14,212000
9,1494,3,15,242500


Questions:

    What's the average price per room for 1 bedroom apartments?
    What's the average price per room for 2 bedrooms apartments?
    What's the most frequent apartment size (in terms of bedrooms)?
    How many are there of that apartment kind?
    What fraction of the total number are of that kind?
    How old is the oldest 3 bedrooms apartment?
    How old is the youngest apartment?
    What's the average age for the whole dataset?
    What's the average age for each bedroom size?

Try to answer all these in SQL.

In [14]:
sql.read_sql("SELECT AVG(price) FROM houses_pandas WHERE bdrms=1", con=conn)

Unnamed: 0,AVG(price)
0,169900.0


In [21]:
sql.read_sql("SELECT AVG(price)/2.0 FROM houses_pandas WHERE bdrms=2", con=conn)

Unnamed: 0,AVG(price)/2.0
0,140433.333333


In [22]:
sql.read_sql("SELECT COUNT(bdrms) FROM houses_pandas GROUP BY bdrms", con=conn)

Unnamed: 0,COUNT(bdrms)
0,1
1,6
2,25
3,14
4,1


In [23]:
sql.read_sql("SELECT COUNT(bdrms)/47.0 FROM houses_pandas GROUP BY bdrms", con=conn)

Unnamed: 0,COUNT(bdrms)/47.0
0,0.021277
1,0.12766
2,0.531915
3,0.297872
4,0.021277


In [24]:
sql.read_sql("SELECT MAX(age) FROM houses_pandas WHERE bdrms=3", con=conn)

Unnamed: 0,MAX(age)
0,78


In [25]:
sql.read_sql("SELECT MIN(age) FROM houses_pandas", con=conn)

Unnamed: 0,MIN(age)
0,5


In [26]:
sql.read_sql("SELECT AVG(age) FROM houses_pandas", con=conn)

Unnamed: 0,AVG(age)
0,42.744681


In [27]:
sql.read_sql("SELECT AVG(age) FROM houses_pandas GROUP BY bdrms", con=conn)

Unnamed: 0,AVG(age)
0,5.0
1,56.666667
2,38.36
3,46.857143
4,49.0
