In [3]:
import pandas as pd
import sqlite3

# 1.1 Movies Database

Take the movies dataset and turn it into a single `sqlite` database. It should have one table for each csv file in the movies dataset

In [2]:
conn = sqlite3.connect('data/movies.sqlite')
c = conn.cursor()

movies = [
"movies_metadata",
"keywords",
"ratings",
"credits",
"ratings_small",
"links_small",
"links",
]

In [None]:
for i in range(0, len(movies)):
    name = movies[i]
    path = 'data/'+name+'.csv'
    df = pd.read_csv(path)
    df.to_sql(name, conn, if_exists='append', index=False

In [5]:
### testing to see if the database was created

for i in range(0, len(movies)):
    name = movies[i]
    cursor = c.execute(f"SELECT * FROM {name}")
    column_names = list(map(lambda x: x[0], cursor.description))
    print(column_names)

['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count']
['id', 'keywords']
['userId', 'movieId', 'rating', 'timestamp']
['cast', 'crew', 'id']
['userId', 'movieId', 'rating', 'timestamp']
['movieId', 'imdbId', 'tmdbId']
['movieId', 'imdbId', 'tmdbId']


# 1.2 Queries

**1.2.1** Use a single query to pull the original title of movies with a budget above $5m

**1.2.2** Use a query to pull the english-language films with the word `war` in their title

**1.2.3** Left join the average ratings from the `ratings` table onto the `movies_metadata` table, so you can have a relation between budget and rating. Hint: use a subquery.

In [9]:
budget = 5000000
cursor = c.execute(f"SELECT * FROM movies_metadata WHERE budget > {budget}")

column_names = list(map(lambda x: x[0], cursor.description))

rows = list(map(lambda x: dict(zip(column_names, x)), cursor.fetchall()))

for row in rows:
    print("{original_title}:\t{budget}".format(**row))

Jumanji:	65000000
Heat:	60000000
Sabrina:	58000000
GoldenEye:	58000000
The American President:	62000000
Cutthroat Island:	98000000
Casino:	52000000
Money Train:	60000000
Assassins:	50000000
Pocahontas:	55000000
The Usual Suspects:	6000000
Fair Game:	50000000
Things to Do in Denver When You're Dead:	8000000
Antonia:	900000
Nick of Time:	8169363
Broken Arrow:	50000000
Bottle Rocket:	7000000
Braveheart:	72000000
紅番區:	7500000
Flirting with Disaster:	7000000
Jade:	50000000
Up Close & Personal:	60000000
Apollo 13:	52000000
Casper:	50000000
Congo:	50000000
Crimson Tide:	53000000
Desperado:	7000000
Die Hard: With a Vengeance:	90000000
First Knight:	55000000
Judge Dredd:	90000000
Mallrats:	6000000
The Prophecy:	8
The Scarlet Letter:	50000000
Smoke:	7000000
Under Siege 2: Dark Territory:	60000000
White Man's Burden:	8000000
Disclosure:	55000000
Hoop Dreams:	700000
Interview with the Vampire:	60000000
Junior:	60000000
Love Affair:	60000000
Miami Rhapsody:	6
My Family:	5500000
Outbreak:	50000000
P

तलाश:	8510000
Captain Phillips:	55000000
Baggage Claim:	8500000
Escape Plan:	50000000
How to Meet Girls from a Distance:	80000
Escape from Tomorrow:	650
ゴジラ×メガギラス G消滅作戦:	8300000
Gojira tai Megaro:	760000
All Is Lost:	9000000
Such Hawks Such Hounds: Scenes from the American Hard Rock Underground:	75000
Free Birds:	55000000
ゴジラ×メカゴジラ:	8500000
Bailey's Billion$:	9000000
실미도:	8000000
कृष ३:	6218100
Mr. Morgan's Last Love:	8200000
Fright Night 2: New Blood:	8000000
Sweetwater:	7000000
The Secret Life of Walter Mitty:	90000000
I-See-You.Com:	6200000
All Things To All Men:	6000000
The Nutcracker: The Untold Story:	90000000
Anchorman 2: The Legend Continues:	50000000
Fase 7:	600000
Mio min Mio:	7200000
The Human Race:	567000
Only Lovers Left Alive:	7000000
Jack Ryan: Shadow Recruit:	60000000
Divergent:	85000000
Reasonable Doubt:	8000000
Sur la Piste du Marsupilami:	50000000
Fear No Evil:	840000
Devil's Due:	7000000
L'Antisémite:	90000
Yeh Jawaani Hai Deewani:	7700000
Banshee Chapter:	950000
I,

In [13]:
cursor = c.execute(f"SELECT * FROM movies_metadata WHERE original_title LIKE '%war%'")
                   
column_names = list(map(lambda x: x[0], cursor.description))

rows = list(map(lambda x: dict(zip(column_names, x)), cursor.fetchall()))

for row in rows:
    print("{original_title}".format(**row))        

Star Wars
Once Were Warriors
The War
The War Room
Snow White and the Seven Dwarfs
Homeward Bound II: Lost in San Francisco
The Haunted World of Edward D. Wood, Jr.
War Stories Our Mother Never Told Us
I Shot Andy Warhol
Homeward Bound: The Incredible Journey
In Love and War
Warriors of Virtue
The War at Home
A Nightmare on Elm Street 3: Dream Warriors
Squanto: A Warrior's Tale
Edward Scissorhands
Howard the Duck
Star Wars: Episode I - The Phantom Menace
The War of the Worlds
The 13th Warrior
Melvin and Howard
How I Won the War
The War Zone
Howards End
The Art of War
Pay It Forward
Morgan Stewart's Coming Home
Death Warrant
The Beast of War
The Milagro Beanfield War
Casualties of War
The War of the Roses
Warlock
Warlock: The Armageddon
The Dogs of War
Hart's War
Star Wars: Episode II - Attack of the Clones
V.I. Warshawski
The Swarm
Urgh! A Music War
War and Peace
Murphy's War
War Photographer
War and Peace
At War with the Army
WarGames
Spring Forward
The Fog of War: Eleven Lessons from 

In [8]:
df = pd.read_sql(
"""
SELECT 
    *
FROM movies_metadata 
LEFT JOIN ratings
    ON movies_metadata.id = ratings.userid 
WHERE
    budget > 0
""", con=conn)
df

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,userId,movieId,rating,timestamp
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,0,7.7,5415.0,862.0,50.0,4.0,9.650839e+08
1,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,0,7.7,5415.0,862.0,318.0,5.0,9.650839e+08
2,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,0,7.7,5415.0,862.0,593.0,5.0,9.650839e+08
3,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,0,7.7,5415.0,862.0,858.0,4.0,9.650839e+08
4,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,0,7.7,5415.0,862.0,912.0,4.0,9.650837e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770314,False,,5000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",,63898,tt1110037,ru,Антидурь,Failing to complete an important assignment wi...,...,Released,,Antidur,0,1.0,1.0,63898.0,3108.0,4.0,1.378225e+09
770315,False,,5000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",,63898,tt1110037,ru,Антидурь,Failing to complete an important assignment wi...,...,Released,,Antidur,0,1.0,1.0,63898.0,3157.0,3.5,1.378225e+09
770316,False,,5000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",,63898,tt1110037,ru,Антидурь,Failing to complete an important assignment wi...,...,Released,,Antidur,0,1.0,1.0,63898.0,3174.0,4.0,1.378225e+09
770317,False,,5000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",,63898,tt1110037,ru,Антидурь,Failing to complete an important assignment wi...,...,Released,,Antidur,0,1.0,1.0,63898.0,3639.0,2.0,1.378225e+09


# 2. Baseball Database

The [Baseball Database](http://www.seanlahman.com/baseball-archive/statistics/) has an sqlite version. Download it for these exercises.

**2.1** Which player has had the most homeruns?

**2.2** Is there a relation between how many homeruns a player has made in a year and his salary that year? Pull both colums together in a single query



In [4]:
conn = sqlite3.connect('data/baseball.sqlite')
c = conn.cursor()
res = conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
for name in res:
    print (name[0])
    cursor = c.execute(f"SELECT * FROM {name[0]}")
    column_names = list(map(lambda x: x[0], cursor.description)) 
    print(column_names,"\n") 

allstarfull
['ID', 'playerID', 'yearID', 'gameNum', 'gameID', 'teamID', 'team_ID', 'lgID', 'GP', 'startingPos'] 

appearances
['ID', 'yearID', 'teamID', 'team_ID', 'lgID', 'playerID', 'G_all', 'GS', 'G_batting', 'G_defense', 'G_p', 'G_c', 'G_1b', 'G_2b', 'G_3b', 'G_ss', 'G_lf', 'G_cf', 'G_rf', 'G_of', 'G_dh', 'G_ph', 'G_pr'] 

awardsmanagers
['ID', 'playerID', 'awardID', 'yearID', 'lgID', 'tie', 'notes'] 

awardsplayers
['ID', 'playerID', 'awardID', 'yearID', 'lgID', 'tie', 'notes'] 

awardssharemanagers
['ID', 'awardID', 'yearID', 'lgID', 'playerID', 'pointsWon', 'pointsMax', 'votesFirst'] 

awardsshareplayers
['ID', 'awardID', 'yearID', 'lgID', 'playerID', 'pointsWon', 'pointsMax', 'votesFirst'] 

batting
['ID', 'playerID', 'yearID', 'stint', 'teamID', 'team_ID', 'lgID', 'G', 'G_batting', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP'] 

battingpost
['ID', 'yearID', 'round', 'playerID', 'teamID', 'team_ID', 'lgID', 'G', 'AB', 'R', 'H

In [34]:
df = pd.read_sql(
"""
SELECT 
    nameGiven, nameLast, MAX(HR)
FROM people 
INNER JOIN batting
    ON people.playerID = batting.playerID
""", con=conn)
df

Unnamed: 0,nameGiven,nameLast,MAX(HR)
0,Barry Lamar,Bonds,73


In [25]:
df = pd.read_sql(
"""
SELECT
    nameGIVEN, nameLast, HR, salary
FROM people
    INNER JOIN batting
        ON people.playerID = batting.playerID
    INNER JOIN salaries
        ON people.playerID = salaries.playerID
ORDER BY salary
""", con=conn
)
print("There's no distinct correlation between the number of home runs and salary")
df

There's no distinct correlation between the number of home runs and salary


Unnamed: 0,nameGiven,nameLast,HR,salary
0,Dion,James,0,0.0
1,Dion,James,1,0.0
2,Dion,James,0,0.0
3,Dion,James,10,0.0
4,Dion,James,3,0.0
...,...,...,...,...
318823,Alexander Enmanuel,Rodriguez,9,33000000.0
318824,Alexander Enmanuel,Rodriguez,9,33000000.0
318825,Clayton Edward,Kershaw,0,33000000.0
318826,Clayton Edward,Kershaw,0,33000000.0
