In [1]:
import pandas as pd
import sqlite3

# 1.1 Movies Database

Take the movies dataset and turn it into a single `sqlite` database. It should have one table for each csv file in the movies dataset

In [3]:
conn = sqlite3.connect('data/movies.sqlite')

In [13]:
pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'", con=conn)

Unnamed: 0,name
0,credits
1,keywords
2,links
3,links_small
4,movies_metadata
5,ratings
6,ratings_small


# 1.2 Queries

#### **1.2.1** Use a single query to pull the original title of movies with a budget above $5m

In [35]:
df = pd.read_sql_query("""
SELECT original_title
FROM movies_metadata
WHERE budget > 5000000
""", conn)

df.head()

Unnamed: 0,original_title
0,Jumanji
1,Heat
2,Sabrina
3,GoldenEye
4,The American President


#### **1.2.2** Use a query to pull the english-language films with the word `war` in their title

In [33]:
df = pd.read_sql_query("""
SELECT original_title
FROM movies_metadata
WHERE original_language = 'en'
AND title LIKE '%war%'
""", conn)

df.head()

Unnamed: 0,original_title
0,Star Wars
1,Once Were Warriors
2,The War
3,The War Room
4,Snow White and the Seven Dwarfs


#### **1.2.3** Left join the average ratings from the `ratings` table onto the `movies_metadata` table, so you can have a relation between budget and rating. Hint: use a subquery.

In [31]:
df = pd.read_sql_query("""
SELECT 
    title,
    budget,
    ratings.rating
FROM movies_metadata
LEFT JOIN (
    SELECT 
        AVG(rating) as rating,
        movieId
    FROM ratings
    GROUP BY movieId
) as ratings
ON movies_metadata.id = ratings.movieId
""", conn)

df.head()

Unnamed: 0,title,budget,rating
0,Toy Story,30000000,3.59893
1,Jumanji,65000000,3.760163
2,Grumpier Old Men,0,
3,Waiting to Exhale,16000000,
4,Father of the Bride Part II,0,


# 2. Baseball Database

The [Baseball Database](http://www.seanlahman.com/baseball-archive/statistics/) has an sqlite version. Download it for these exercises.

In [2]:
conn = sqlite3.connect('data/lahmansbaseballdb.sqlite')
pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'", con=conn)

Unnamed: 0,name
0,allstarfull
1,appearances
2,awardsmanagers
3,awardsplayers
4,awardssharemanagers
5,awardsshareplayers
6,batting
7,battingpost
8,collegeplaying
9,divisions


##### **2.1** Which player has had the most homeruns?

In [85]:
df = pd.read_sql_query(""" SELECT playerID, SUM(HR) as HR FROM batting GROUP BY playerID """, conn)
df.loc[df.HR == df.HR.max()]

Unnamed: 0,playerID,HR
1621,bondsba01,762


#### **2.2** Is there a relation between how many homeruns a player has made in a year and his salary that year? Pull both colums together in a single query

In [54]:
df = pd.read_sql_query("""
SELECT 
    s.playerID,
    b.HR,
    salary,
    s.yearID
FROM salaries as s
LEFT JOIN batting as b
ON s.playerID = b.playerID
AND s.yearID = b.yearID
""", conn)

df

Unnamed: 0,playerID,HR,salary,yearID
0,barkele01,0.0,870000.0,1985
1,bedrost01,0.0,550000.0,1985
2,benedbr01,0.0,545000.0,1985
3,campri01,1.0,633333.0,1985
4,ceronri01,3.0,625000.0,1985
...,...,...,...,...
29114,strasst01,0.0,10400000.0,2016
29115,taylomi02,7.0,524000.0,2016
29116,treinbl01,0.0,524900.0,2016
29117,werthja01,21.0,21733615.0,2016
