Dylan Hastings

# 1.1 Movies Database

Take the movies dataset and turn it into a single `sqlite` database. It should have one table for each csv file in the movies dataset

In [2]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
import pandas as pd
import sqlite3
from os import listdir

In [2]:
conn = sqlite3.connect(f"data/movies.sqlite")

In [3]:
pd.read_sql("SELECT name FROM sqlite_master WHERE type = 'table'", con=conn)

Unnamed: 0,name
0,credits
1,keywords
2,links_small
3,links
4,movies_metadata
5,ratings_small
6,ratings


In [5]:
table_names = [
    'credits', 'keywords', 'links_small', 'links', 
    'movies_metadata', 'ratings_small', 'ratings',
]

In [None]:
for t in table_names:
    df = pd.read_csv(f'data/{t}.csv')
    print(f'data/{t}.csv')
    df.to_sql(t, con=conn)

In [6]:
pd.read_sql("SELECT name FROM sqlite_master WHERE type = 'table'", con=conn)

Unnamed: 0,name
0,credits
1,keywords
2,links_small
3,links
4,movies_metadata
5,ratings_small
6,ratings


# 1.2 Queries

**1.2.1** Use a single query to pull the original title of movies with a budget above $5m



In [7]:
pd.read_sql_query(
"""
SELECT  
    original_title
FROM movies_metadata
WHERE budget > 5e6
""", 
    con=conn
)

Unnamed: 0,original_title
0,Jumanji
1,Heat
2,Sabrina
3,GoldenEye
4,The American President
...,...
2182,The Emoji Movie
2183,Pattaya
2184,House of the Long Shadows
2185,Все и сразу


**1.2.2** Use a query to pull the english-language films with the word `war` in their title



In [8]:
pd.read_sql_query(
"""
SELECT *
FROM movies_metadata
WHERE original_language = 'en'
    and original_title LIKE '%war%'
""", 
    con=conn
)

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,256,False,"{'id': 10, 'name': 'Star Wars Collection', 'po...",11000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.starwars.com/films/star-wars-episod...,11,tt0076759,en,Star Wars,...,1977-05-25,775398007.0,121.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"A long time ago in a galaxy far, far away...",Star Wars,0,8.1,6778.0
1,286,False,"{'id': 300546, 'name': 'Once were Warriors Col...",0,"[{'id': 18, 'name': 'Drama'}]",,527,tt0110729,en,Once Were Warriors,...,1994-09-02,2201126.0,99.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,"A family in crisis, a life in chaos... Nothing...",Once Were Warriors,0,7.6,106.0
2,335,False,,0,"[{'id': 18, 'name': 'Drama'}]",,19855,tt0111667,en,The War,...,1994-11-04,0.0,126.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,The War,0,6.3,39.0
3,551,False,,0,"[{'id': 99, 'name': 'Documentary'}, {'id': 36,...",,26408,tt0108515,en,The War Room,...,1993-12-05,0.0,96.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,They Changed The Way Campaigns Are Won,The War Room,0,7.3,19.0
4,587,False,,1488423,"[{'id': 14, 'name': 'Fantasy'}, {'id': 16, 'na...",http://movies.disney.com/snow-white-and-the-se...,408,tt0029583,en,Snow White and the Seven Dwarfs,...,1937-12-20,184925486.0,83.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"The Happiest, Dopiest, Grumpiest, Sneeziest mo...",Snow White and the Seven Dwarfs,0,6.9,1973.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,44303,False,,60000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",https://www.netflix.com/title/80068327,354287,tt4758646,en,War Machine,...,2017-05-26,0.0,122.0,"[{'iso_639_1': 'ps', 'name': 'پښتو'}, {'iso_63...",Released,We're gonna liberate the sh** out of you.,War Machine,0,5.8,243.0
319,44350,False,,0,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 99,...",http://www.hbo.com/documentaries/warning-this-...,452372,tt6710212,en,Warning: This Drug May Kill You,...,2017-05-01,0.0,60.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,America's prescription opiods kill more than p...,Warning: This Drug May Kill You,0,7.0,3.0
320,44765,False,,0,"[{'id': 99, 'name': 'Documentary'}]",http://metanoia-films.org/psywar.php,59406,tt1900958,en,PsyWar: The real battlefield is your mind,...,2010-10-08,0.0,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A Lucid and insightful study of the manipulati...,PsyWar: The real battlefield is your mind,0,6.6,4.0
321,44973,False,,0,"[{'id': 18, 'name': 'Drama'}]",,46661,tt0423510,en,Warm Springs,...,2005-04-30,0.0,121.0,[],Released,,Warm Springs,0,5.8,6.0


**1.2.3** Left join the average ratings from the `ratings` table onto the `movies_metadata` table, so you can have a relation between budget and rating. Hint: use a subquery.

In [9]:
pd.read_sql("""
SELECT *
FROM movies_metadata md
LEFT JOIN (
    SELECT movieId, AVG(rating)
    FROM ratings
    group by movieId
    LIMIT 10
)  r
    ON r.movieId = md.id
""", con=conn)

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,movieId,AVG(rating)
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,0.0,7.7,5415.0,,
1,1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,0.0,6.9,2413.0,,
2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,0.0,6.5,92.0,,
3,3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,0.0,6.1,34.0,,
4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,0.0,5.7,173.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,45461,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,...,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,0.0,4.0,1.0,,
45462,45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,...,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,0.0,9.0,3.0,,
45463,45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,...,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,0.0,3.8,6.0,,
45464,45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,...,87.0,[],Released,,Satan Triumphant,0.0,0.0,0.0,,


# 2. Baseball Database

The [Baseball Database](http://www.seanlahman.com/baseball-archive/statistics/) has an sqlite version. Download it for these exercises.

**2.1** Which player has had the most homeruns?



In [3]:
conn2 = sqlite3.connect(f"data/lahmansbaseballdb.sqlite")

In [16]:
pd.read_sql("SELECT name FROM sqlite_master WHERE type = 'table'", con=conn2)

Unnamed: 0,name
0,batting
1,salaries


In [12]:
table_names = [
    'batting', 'salaries'
]

In [None]:
for t in table_names:
    df = pd.read_csv(f'data/{t}.csv')
    print(f'data/{t}.csv')
    df.to_sql(t, con=conn2)

In [18]:
pd.read_sql_query(
"""
SELECT playerID, max(HR)
FROM 
(SELECT *
FROM batting
)
""", con=conn2
)

Unnamed: 0,playerID,max(HR)
0,bondsba01,73


**2.2** Is there a relation between how many homeruns a player has made in a year and his salary that year? Pull both colums together in a single query

In [25]:
pd.read_sql("""
SELECT *
FROM Batting md
LEFT JOIN (
    SELECT playerID, salary
    FROM Salaries
    group by playerID
    LIMIT 10
)  r
""", con=conn2)

Unnamed: 0,index,playerID,yearID,stint,teamID,lgID,G,AB,R,H,...,CS,BB,SO,IBB,HBP,SH,SF,GIDP,playerID.1,salary
0,0,abercda01,1871,1,TRO,,1,4,0,0,...,0.0,0,0.0,,,,,0.0,aardsda01,300000
1,0,abercda01,1871,1,TRO,,1,4,0,0,...,0.0,0,0.0,,,,,0.0,aasedo01,600000
2,0,abercda01,1871,1,TRO,,1,4,0,0,...,0.0,0,0.0,,,,,0.0,abadan01,327000
3,0,abercda01,1871,1,TRO,,1,4,0,0,...,0.0,0,0.0,,,,,0.0,abadfe01,418000
4,0,abercda01,1871,1,TRO,,1,4,0,0,...,0.0,0,0.0,,,,,0.0,abbotje01,175000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1087885,108788,zuninmi01,2020,1,TBA,AL,28,75,8,11,...,0.0,6,37.0,0.0,3.0,0.0,0.0,0.0,abbotji01,68000
1087886,108788,zuninmi01,2020,1,TBA,AL,28,75,8,11,...,0.0,6,37.0,0.0,3.0,0.0,0.0,0.0,abbotku01,109000
1087887,108788,zuninmi01,2020,1,TBA,AL,28,75,8,11,...,0.0,6,37.0,0.0,3.0,0.0,0.0,0.0,abbotky01,109000
1087888,108788,zuninmi01,2020,1,TBA,AL,28,75,8,11,...,0.0,6,37.0,0.0,3.0,0.0,0.0,0.0,abbotpa01,100000
