# Intro to Databases and SQL

![](images/sqlite.gif)


**GOALS**:

- Access and navigate SQL databases with Python
- Use basic SQL commands to query a database

In [1]:
import sqlite3

In [2]:
conn = sqlite3.connect('data/example.db')

In [3]:
c = conn.cursor()

In [4]:
c.execute('''CREATE TABLE stocks
            (date text, trans text, symbol text, qty real, price real)''')

<sqlite3.Cursor at 0x10e942f10>

In [5]:
# Insert a row of data
c.execute("INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14)")

<sqlite3.Cursor at 0x10e942f10>

In [6]:
# Save (commit) the changes
conn.commit()

In [7]:
c.close()

In [8]:
# We can also close the connection if we are done with it.
# Just be sure any changes have been committed or they will be lost.
conn.close()

### Problem

Create a table that contains the following information:

| Author | Title | Year | Price |
| -----  | ----- | ----- | ----- |
| Donald Knuth | Concrete Mathematics | 1989 | 5.00 |
| Isaac Newton | System of the World  |  1687 | 8.50 |
| Michel Foucault | Death and the Labyrinth | 1964 | 3.00 |


In [9]:
conn = sqlite3.connect('data/books.db')

In [10]:
cur = conn.cursor()

In [11]:
books = '''
CREATE TABLE customers (
    id integer PRIMARY KEY,
    Author text NOT NULL,
    Title text NOT NULL,
    Year integer,
    Price real)'''

In [12]:
cur.execute(books)

<sqlite3.Cursor at 0x10e942f80>

In [13]:
cur.execute("SELECT name FROM sqlite_master WHERE type='table'")

<sqlite3.Cursor at 0x10e942f80>

In [14]:
print(cur.fetchall())

[('customers',)]


In [15]:
cur.close()
conn.close()

In [17]:
conn = sqlite3.connect('data/survey.db')

In [18]:
cur = conn.cursor()
cur.execute("SELECT Site.lat, Site.long FROM Site;")

<sqlite3.Cursor at 0x10e9c60a0>

In [19]:
results = cur.fetchall()

In [20]:
for r in results:
    print(r)

(-49.85, -128.57)
(-47.15, -126.72)
(-48.87, -123.4)


In [21]:
cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
print(cur.fetchall())

[('Person',), ('Site',), ('Visited',), ('Survey',)]


In [22]:
cur.close()
conn.close()

In [24]:
conn = sqlite3.connect('data/survey.db')
cur = conn.cursor()
cur.execute("SELECT family, personal FROM Person;")

<sqlite3.Cursor at 0x10e9c61f0>

In [25]:
cur.fetchall()

[('Dyer', 'William'),
 ('Pabodie', 'Frank'),
 ('Lake', 'Anderson'),
 ('Roerich', 'Valentina'),
 ('Danforth', 'Frank')]

In [26]:
cur.execute("SELECT  personal, family FROM Person;")
cur.fetchall()

[('William', 'Dyer'),
 ('Frank', 'Pabodie'),
 ('Anderson', 'Lake'),
 ('Valentina', 'Roerich'),
 ('Frank', 'Danforth')]

In [27]:
cur.execute("SELECT id, id, id FROM Person;")
cur.fetchall()

[('dyer', 'dyer', 'dyer'),
 ('pb', 'pb', 'pb'),
 ('lake', 'lake', 'lake'),
 ('roe', 'roe', 'roe'),
 ('danforth', 'danforth', 'danforth')]

In [31]:
import pandas as pd

conn = sqlite3.connect('data/survey.db')
cur = conn.cursor()

results = cur.fetchall()
df = pd.read_sql('SELECT * FROM person', con=conn)
cur.close()
conn.close()

In [32]:
df.head()

Unnamed: 0,id,personal,family
0,dyer,William,Dyer
1,pb,Frank,Pabodie
2,lake,Anderson,Lake
3,roe,Valentina,Roerich
4,danforth,Frank,Danforth


In [29]:
for r in results:
    print(r)

('dyer', 'William', 'Dyer')
('pb', 'Frank', 'Pabodie')
('lake', 'Anderson', 'Lake')
('roe', 'Valentina', 'Roerich')
('danforth', 'Frank', 'Danforth')


### Problem

1. Write a query that select only the `name` column from the `Site` table.

In [33]:
conn = sqlite3.connect('data/survey.db')

df2 = pd.read_sql('SELECT name FROM Site', con=conn)

conn.close()

In [34]:
df2.head()

Unnamed: 0,name
0,DR-1
1,DR-3
2,MSK-4


In [35]:
conn = sqlite3.connect('data/survey.db')
pd.read_sql('SELECT name FROM Site', con=conn)

Unnamed: 0,name
0,DR-1
1,DR-3
2,MSK-4


### Sort and Remove Duplicates

In [36]:
conn = sqlite3.connect('data/survey.db')
cur = conn.cursor()

In [37]:
cur.execute("SELECT quant FROM Survey;")

<sqlite3.Cursor at 0x1110ab9d0>

In [38]:
cur.fetchall()

[('rad',),
 ('sal',),
 ('rad',),
 ('sal',),
 ('rad',),
 ('sal',),
 ('temp',),
 ('rad',),
 ('sal',),
 ('temp',),
 ('rad',),
 ('temp',),
 ('sal',),
 ('rad',),
 ('sal',),
 ('temp',),
 ('sal',),
 ('rad',),
 ('sal',),
 ('sal',),
 ('rad',)]

In [39]:
cur.execute("SELECT DISTINCT quant FROM Survey;")

<sqlite3.Cursor at 0x1110ab9d0>

In [40]:
cur.fetchall()

[('rad',), ('sal',), ('temp',)]

In [41]:
cur.execute("SELECT DISTINCT taken, quant FROM Survey;")
cur.fetchall()

[(619, 'rad'),
 (619, 'sal'),
 (622, 'rad'),
 (622, 'sal'),
 (734, 'rad'),
 (734, 'sal'),
 (734, 'temp'),
 (735, 'rad'),
 (735, 'sal'),
 (735, 'temp'),
 (751, 'rad'),
 (751, 'temp'),
 (751, 'sal'),
 (752, 'rad'),
 (752, 'sal'),
 (752, 'temp'),
 (837, 'rad'),
 (837, 'sal'),
 (844, 'rad')]

In [42]:
cur.execute("SELECT * FROM Person ORDER BY id;")
cur.fetchall()

[('danforth', 'Frank', 'Danforth'),
 ('dyer', 'William', 'Dyer'),
 ('lake', 'Anderson', 'Lake'),
 ('pb', 'Frank', 'Pabodie'),
 ('roe', 'Valentina', 'Roerich')]

In [43]:
cur.execute("SELECT * FROM Person ORDER BY id DESC;")
cur.fetchall()

[('roe', 'Valentina', 'Roerich'),
 ('pb', 'Frank', 'Pabodie'),
 ('lake', 'Anderson', 'Lake'),
 ('dyer', 'William', 'Dyer'),
 ('danforth', 'Frank', 'Danforth')]

In [45]:
cur.execute("SELECT taken, person, quant FROM Survey ORDER BY taken ASC, person DESC LIMIT 1;")
cur.fetchall()

[(619, 'dyer', 'rad')]

1. Write a query that selects distinct dates from the `Visited` table.
2. Write a query that displays the full names of the scientists in the `Person` table, ordered by family name.

In [54]:
pd.read_sql("SELECT DISTINCT dated FROM Visited", con = conn)

Unnamed: 0,dated
0,1927-02-08
1,1927-02-10
2,1930-01-07
3,1930-01-12
4,1930-02-26
5,
6,1932-01-14
7,1932-03-22


In [56]:
pd.read_sql("SELECT personal, family FROM Person ORDER BY family", con = conn)

Unnamed: 0,personal,family
0,Frank,Danforth
1,William,Dyer
2,Anderson,Lake
3,Frank,Pabodie
4,Valentina,Roerich


In [58]:
pd.read_sql("SELECT family, personal FROM Person ORDER BY family", con = conn).columns

Index(['family', 'personal'], dtype='object')

In [None]:
pd.read_sql("SELECT *")

### Filtering

In [59]:
cur.execute("SELECT * FROM Visited WHERE site='DR-1';")
cur.fetchall()

[('619', 'DR-1', '1927-02-08'),
 ('622', 'DR-1', '1927-02-10'),
 ('844', 'DR-1', '1932-03-22')]

In [60]:
cur.execute("SELECT * FROM Visited WHERE site='DR-1' AND dated<'1930-01-01';")
cur.fetchall()

[('619', 'DR-1', '1927-02-08'), ('622', 'DR-1', '1927-02-10')]

In [61]:
cur.execute("SELECT * FROM Survey WHERE person='lake' OR person='roe';")
cur.fetchall()

[(734, 'lake', 'sal', 0.05),
 (751, 'lake', 'sal', 0.1),
 (752, 'lake', 'rad', 2.19),
 (752, 'lake', 'sal', 0.09),
 (752, 'lake', 'temp', -16.0),
 (752, 'roe', 'sal', 41.6),
 (837, 'lake', 'rad', 1.46),
 (837, 'lake', 'sal', 0.21),
 (837, 'roe', 'sal', 22.5),
 (844, 'roe', 'rad', 11.25)]

In [62]:
cur.execute("SELECT * FROM Survey WHERE person IN ('lake', 'roe');")
cur.fetchall()

[(734, 'lake', 'sal', 0.05),
 (751, 'lake', 'sal', 0.1),
 (752, 'lake', 'rad', 2.19),
 (752, 'lake', 'sal', 0.09),
 (752, 'lake', 'temp', -16.0),
 (752, 'roe', 'sal', 41.6),
 (837, 'lake', 'rad', 1.46),
 (837, 'lake', 'sal', 0.21),
 (837, 'roe', 'sal', 22.5),
 (844, 'roe', 'rad', 11.25)]

In [63]:
cur.execute("SELECT * FROM Visited WHERE site LIKE 'DR%';")
cur.fetchall()

[('619', 'DR-1', '1927-02-08'),
 ('622', 'DR-1', '1927-02-10'),
 ('734', 'DR-3', '1930-01-07'),
 ('735', 'DR-3', '1930-01-12'),
 ('751', 'DR-3', '1930-02-26'),
 ('752', 'DR-3', None),
 ('844', 'DR-1', '1932-03-22')]

1. Suppose we want to select all sites that lie more than 42 degrees from the poles. Our first query is:
```sql
SELECT * FROM Site WHERE (lat > -48) OR (lat < 48);
```
Explain why this is wrong, and rewrite the query so that it is correct.

2. 
Normalized salinity readings are supposed to be between 0.0 and 1.0. Write a query that selects all records from Survey with salinity values outside this range.

In [68]:
cur.execute("SELECT * FROM Site WHERE lat > 42 OR lat < -42;")

<sqlite3.Cursor at 0x1110ab9d0>

In [69]:
cur.fetchall()

[('DR-1', -49.85, -128.57),
 ('DR-3', -47.15, -126.72),
 ('MSK-4', -48.87, -123.4)]

In [67]:
pd.read_sql("SELECT * from Survey WHERE reading < 0 or reading > 1", con = conn)

Unnamed: 0,taken,person,quant,reading
0,619,dyer,rad,9.82
1,622,dyer,rad,7.8
2,734,pb,rad,8.41
3,734,pb,temp,-21.5
4,735,pb,rad,7.22
5,735,,temp,-26.0
6,751,pb,rad,4.35
7,751,pb,temp,-18.5
8,752,lake,rad,2.19
9,752,lake,temp,-16.0


![](images/exploits_of_a_mom.png)

In [27]:
# Never do this -- insecure!
symbol = 'RHAT'
conn = sqlite3.connect('data/example.db')
c = conn.cursor()
c.execute("SELECT * FROM stocks WHERE symbol = '%s'" % symbol)

<sqlite3.Cursor at 0x103dd3260>

In [28]:
conn.close()

In [70]:
conn = sqlite3.connect('data/example.db')
c = conn.cursor()
# Larger example that inserts many records at a time
purchases = [('2006-03-28', 'BUY', 'IBM', 1000, 45.00),
             ('2006-04-05', 'BUY', 'MSFT', 1000, 72.00),
             ('2006-04-06', 'SELL', 'IBM', 500, 53.00),
            ]
c.executemany('INSERT INTO stocks VALUES (?,?,?,?,?)', purchases)

<sqlite3.Cursor at 0x10cba3880>

In [71]:
pd.read_sql('PRAGMA table_info(Person)', con=conn)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk


In [30]:
conn.close()