# 1.1 Movies Database

Take the movies dataset and turn it into a single `sqlite` database. It should have one table for each csv file in the movies dataset

In [9]:
import sqlite3
import pandas as pd

meta = pd.read_csv('data/movies_metadata.csv')
ratings = pd.read_csv('data/ratings_small.csv')
credits = pd.read_csv('data/credits.csv')
keywords = pd.read_csv('data/keywords.csv')

# conn = sqlite3.connect('data/movies.sqlite')

meta.to_sql(
    name="meta",
    con=conn, 
    schema=None, 
    if_exists='replace', 
    index=True, 
)

ratings.to_sql(
    name="ratings",
    con=conn, 
    schema=None, 
    if_exists='replace', 
    index=True, 
)

credits.to_sql(
    name="credits",
    con=conn, 
    schema=None, 
    if_exists='replace', 
    index=True, 
)

keywords.to_sql(
    name="keywords",
    con=conn, 
    schema=None, 
    if_exists='replace', 
    index=True, 
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# 1.2 Queries

**1.2.1** Use a single query to pull the original title of movies with a budget above $5m

**1.2.2** Use a query to pull the english-language films with the word `war` in their title

**1.2.3** Left join the average ratings from the `ratings` table onto the `movies_metadata` table, so you can have a relation between budget and rating. Hint: use a subquery.

In [10]:
# 1.2.1 
pd.read_sql("""select original_title, budget from meta 
                where budget >= 5000000""", con=conn)

Unnamed: 0,original_title,budget
0,Jumanji,65000000
1,Heat,60000000
2,Sabrina,58000000
3,GoldenEye,58000000
4,The American President,62000000
...,...,...
2468,Pattaya,5402000
2469,House of the Long Shadows,7500000
2470,Все и сразу,750000
2471,Dikari,800000


In [11]:
# 1.2.2

pd.read_sql("""select original_title, original_language from meta 
                where original_language = 'en'
                and original_title like '%war%'""", con=conn)

Unnamed: 0,original_title,original_language
0,Star Wars,en
1,Once Were Warriors,en
2,The War,en
3,The War Room,en
4,Snow White and the Seven Dwarfs,en
...,...,...
318,War Machine,en
319,Warning: This Drug May Kill You,en
320,PsyWar: The real battlefield is your mind,en
321,Warm Springs,en


In [12]:
# 1.2.3

pd.read_sql("""select m.original_title, r.avg_rating  
                from (select movieId, avg(rating) as avg_rating from ratings group by movieId) as r left join meta as m on m.id = r.movieId""", con=conn)

Unnamed: 0,original_title,avg_rating
0,,3.872470
1,Ariel,3.401869
2,Varjoja paratiisissa,3.161017
3,,2.384615
4,Four Rooms,3.267857
...,...,...
9062,,5.000000
9063,,4.500000
9064,,5.000000
9065,,3.000000


In [16]:
pd.read_sql("""select m.original_title, r.avg_rating  
                from meta as m left join (select movieId, avg(rating) as avg_rating from ratings group by movieId) as r on m.id = r.movieId WHERE r.avg_rating > 0""", con=conn)

Unnamed: 0,original_title,avg_rating
0,Heat,3.593750
1,GoldenEye,1.500000
2,Cutthroat Island,3.616279
3,Casino,3.555556
4,Sense and Sensibility,5.000000
...,...,...
2826,Sang Pemimpi,3.250000
2827,Frankenstein Created Woman,3.965517
2828,Puteshestvie s domashnimi zhivotnymi,4.166667
2829,Zolushka,4.000000


# 2. Baseball Database

The [Baseball Database](http://www.seanlahman.com/baseball-archive/statistics/) has an sqlite version. Download it for these exercises.

**2.1** Which player has had the most homeruns?

**2.2** Is there a relation between how many homeruns a player has made in a year and his salary that year? Pull both colums together in a single query



In [17]:
conn = sqlite3.connect('data/lahmansbaseballdb.sqlite')
pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'", con=conn)

Unnamed: 0,name
0,allstarfull
1,appearances
2,awardsmanagers
3,awardsplayers
4,awardssharemanagers
5,awardsshareplayers
6,batting
7,battingpost
8,collegeplaying
9,divisions


In [18]:
pd.read_sql("""SELECT max(sum_HR), playerID FROM (SELECT playerID, sum(HR) as sum_HR FROM batting group by playerID)""", con=conn)

Unnamed: 0,max(sum_HR),playerID
0,762,bondsba01


In [20]:
# player, sum of homeruns, salary

pd.read_sql("""SELECT * FROM salaries""", con=conn)

Unnamed: 0,ID,yearID,teamID,team_ID,lgID,playerID,salary
0,1,1985,ATL,1918,NL,barkele01,870000.0
1,2,1985,ATL,1918,NL,bedrost01,550000.0
2,3,1985,ATL,1918,NL,benedbr01,545000.0
3,4,1985,ATL,1918,NL,campri01,633333.0
4,5,1985,ATL,1918,NL,ceronri01,625000.0
...,...,...,...,...,...,...,...
26423,26424,2016,WAS,2835,NL,strasst01,10400000.0
26424,26425,2016,WAS,2835,NL,taylomi02,524000.0
26425,26426,2016,WAS,2835,NL,treinbl01,524900.0
26426,26427,2016,WAS,2835,NL,werthja01,21733615.0


In [21]:
pd.read_sql("""SELECT yearID, playerID, sum(HR) as sum_HR FROM batting group by playerID""", con=conn)

Unnamed: 0,yearID,playerID,sum_HR
0,2004,aardsda01,0
1,1954,aaronha01,755
2,1962,aaronto01,13
3,1977,aasedo01,0
4,2001,abadan01,0
...,...,...,...
19684,1957,zupofr01,0
19685,1982,zuvelpa01,2
19686,1951,zuverge01,0
19687,1910,zwilldu01,30


In [24]:
df = pd.read_sql("""select b.playerID, b.sum_HR, s.salary, s.yearID
                from salaries as s join (SELECT yearID, playerID, sum(HR) as sum_HR FROM batting group by playerID) as b on s.playerID = b.playerID AND s.yearID = b.yearID """, con=conn)

In [25]:
df.corr()

Unnamed: 0,sum_HR,salary,yearID
sum_HR,1.0,-0.010798,-0.099632
salary,-0.010798,1.0,0.343008
yearID,-0.099632,0.343008,1.0


In [None]:
# there is not statistically significant relation between Homeruns in a year and the salary earned.