# 1.1 Movies Database

Take the movies dataset and turn it into a single `sqlite` database. It should have one table for each csv file in the movies dataset

In [1]:
import sqlite3
import pandas as pd

In [2]:
metadata = pd.read_csv('archive/movies_metadata.csv')
credits = pd.read_csv('archive/credits.csv')
keywords = pd.read_csv('archive/keywords.csv')
links = pd.read_csv('archive/links.csv')
ratings_small = pd.read_csv('archive/ratings_small.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
conn = sqlite3.connect('archive/movies.sqlite')

metadata.to_sql(name='metadata', con=conn, schema=None, if_exists='replace', index=True)
credits.to_sql(name='credits', con=conn, schema=None, if_exists='replace', index=True)
keywords.to_sql(name='keywords', con=conn, schema=None, if_exists='replace', index=True)
links.to_sql(name='links', con=conn, schema=None, if_exists='replace', index=True)
ratings_small.to_sql(name='ratings_small', con=conn, schema=None, if_exists='replace', index=True)

# 1.2 Queries

**1.2.1** Use a single query to pull the original title of movies with a budget above $5m

**1.2.2** Use a query to pull the english-language films with the word `war` in their title

**1.2.3** Left join the average ratings from the `ratings` table onto the `movies_metadata` table, so you can have a relation between budget and rating. Hint: use a subquery.

In [13]:
#1.2.1) 

five_million = pd.read_sql('select original_title from metadata WHERE budget > 5000000', conn)
five_million

Unnamed: 0,original_title
0,Jumanji
1,Heat
2,Sabrina
3,GoldenEye
4,The American President
...,...
2182,The Emoji Movie
2183,Pattaya
2184,House of the Long Shadows
2185,Все и сразу


In [18]:
# 1.2.2)

war = pd.read_sql("SELECT original_title FROM metadata WHERE original_title LIKE '% war %'", conn)
war

Unnamed: 0,original_title
0,The War Room
1,The War at Home
2,The War of the Worlds
3,The War Zone
4,The War of the Roses
5,At War with the Army
6,I Was a Male War Bride
7,The War Wagon
8,Outfoxed: Rupert Murdoch's War on Journalism
9,The War Within


In [4]:
#1.2.3) 

left_join = pd.read_sql("""SELECT ratings_small.movieId, metadata.original_title, ratings_small.rating,budget
                           FROM metadata 
                           LEFT JOIN ratings_small 
                           ON metadata.id = ratings_small.movieId 
                           GROUP BY ratings_small.movieId""", con=conn)

left_join

Unnamed: 0,movieId,original_title,rating,budget
0,,Toy Story,,30000000
1,2.0,Ariel,1.5,0
2,3.0,Varjoja paratiisissa,0.5,0
3,5.0,Four Rooms,1.0,4000000
4,6.0,Judgment Night,1.0,0
...,...,...,...,...
2826,140174.0,Rise of the Zombies,3.5,0
2827,142507.0,Exit,3.0,0
2828,148652.0,The Eleventh Victim,2.5,0
2829,158238.0,Stolen Seas,3.5,0


# 2. Baseball Database

The [Baseball Database](http://www.seanlahman.com/baseball-archive/statistics/) has an sqlite version. Download it for these exercises.

**2.1** Which player has had the most homeruns?

**2.2** Is there a relation between how many homeruns a player has made in a year and his salary that year? Pull both colums together in a single query



In [21]:
conn = sqlite3.connect('data/baseball.db')
c = conn.cursor()

In [22]:
batting = pd.read_csv('data/baseball/core/Batting.csv')
salary = pd.read_csv('data/baseball/core/Salaries.csv')

In [23]:
batting.to_sql('Batting', conn, if_exists='replace', index = False)
salary.to_sql('Salaries', conn, if_exists='replace', index = False)

In [24]:
homerun = pd.read_sql("""SELECT SUM(HR) as HR, playerID
                       FROM batting
                       GROUP BY playerID
                       ORDER BY HR DESC""", conn)

homerun

Unnamed: 0,HR,playerID
0,762,bondsba01
1,755,aaronha01
2,714,ruthba01
3,696,rodrial01
4,662,pujolal01
...,...,...
19893,0,abadijo01
19894,0,abadfe01
19895,0,abadan01
19896,0,aasedo01


In [25]:
homerun_salary = pd.read_sql("""SELECT salary, HR, s.yearID, b.playerID
                                FROM salaries s
                                JOIN batting b
                                ON b.playerID = s.playerID
                                AND b.yearID = s.yearID
                                ORDER BY salary dESC
                                LIMIT 10""", conn)

homerun_salary

Unnamed: 0,salary,HR,yearID,playerID
0,33000000,30,2009,rodrial01
1,33000000,30,2010,rodrial01
2,33000000,0,2016,kershcl01
3,32571000,0,2015,kershcl01
4,32000000,16,2011,rodrial01
5,31799030,0,2016,greinza01
6,30000000,18,2012,rodrial01
7,30000000,0,2016,priceda01
8,29000000,7,2013,rodrial01
9,28000000,35,2008,rodrial01


In [26]:
from scipy.stats import linregress

linregress(homerun_salary['HR'], homerun_salary['salary'])

#p-value of 0.696 indicates that it is not statistically significant

LinregressResult(slope=-18417.428508091336, intercept=31487480.027710043, rvalue=-0.1415555366381403, pvalue=0.696477832635459, stderr=45536.72186282755)