In [2]:
## necessary imports

import sqlite3

import pandas as pd


# SQL

SQL (Structured Query Language) is a domain-specific language for querying and managing data in relational databases.

**COMMANDS:**

* **SELECT**: Specifies which columns to retrieve.
* **FROM**: Specifies the tables from which to retrieve the data.
* **WHERE**: Filters the result set based on a condition.
* **GROUP BY**:  Groups rows that have the same values in specified columns.
* **HAVING**: Filters the result set of a GROUP BY. This is like a WHERE clause but for grouped records.

### GROUP BY

GROUP BY collapses rows that share the same values in the specified columns into one row per group, so that aggregate functions such as SUM, AVG or COUNT can be computed for each group. The result table can include only the columns listed in the GROUP BY clause and columns that have aggregate functions applied to them.

Consider the below tables:

**Table: students**

| student_id | student_name | major             |
|------------|--------------|-------------------|
| 1          | Alice        | Computer Science  |
| 2          | Bob          | Physics           |
| 3          | Charlie      | Math              |
| 4          | David        | Computer Science  |


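**Table: number of students per major** (this is the grouped result that the query below produces, not a courses table)
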
| major            | COUNT(student_id) |
|------------------|-------------------|
| Computer Science | 2                 |
| Physics          | 1                 |
| Math             | 1                 |


---



**SQL Query:**

```sql
SELECT major, COUNT(student_id)
FROM students
GROUP BY major;
```

---


**Table result:**

| major            | COUNT(student_id) |
|------------------|-------------------|
| Computer Science | 2                 |
| Physics          | 1                 |
| Math             | 1                 |



In [11]:
# Students table: one row per student, identified by student_id.

students_data = dict(
    student_id=[1, 2, 3, 4, 5],
    student_name=['Alice', 'Bob', 'Charlie', 'David', 'Carl'],
    major=['Computer Science', 'Physics', 'Math', 'Biology', 'Biology'],
)

# Each dict key becomes a column; each list holds that column's values.
students_df = pd.DataFrame(data=students_data)
print(students_df)


   student_id student_name             major
0           1        Alice  Computer Science
1           2          Bob           Physics
2           3      Charlie              Math
3           4        David           Biology
4           5         Carl           Biology


In [12]:
# Courses table: one row per course, identified by course_id.

courses_data = dict(
    course_id=[101, 102, 103, 104],
    course_name=['Programming', 'Quantum Mechanics', 'Algebra', 'Genetics'],
    department=['Computer Science', 'Physics', 'Math', 'Biology'],
)

# Each dict key becomes a column; each list holds that column's values.
courses_df = pd.DataFrame(data=courses_data)
print(courses_df)


   course_id        course_name        department
0        101        Programming  Computer Science
1        102  Quantum Mechanics           Physics
2        103            Algebra              Math
3        104           Genetics           Biology


In [14]:
# Enrollments table: links students to courses, one row per (student_id, course_id) pair.

enrollments_data = dict(
    student_id=[1, 1, 2, 3, 3, 4, 5],
    course_id=[101, 103, 102, 103, 101, 104, 104],
)

# Each dict key becomes a column; each list holds that column's values.
enrollments_df = pd.DataFrame(data=enrollments_data)
print(enrollments_df)


   student_id  course_id
0           1        101
1           1        103
2           2        102
3           3        103
4           3        101
5           4        104
6           5        104


## SQL COMMANDS

In [15]:
# SQL equivalent: SELECT student_name, major FROM Students
# Indexing with a list of labels keeps only those columns, in the listed order.
projected_columns = ['student_name', 'major']
selected_data = students_df[projected_columns]
print(selected_data)

  student_name             major
0        Alice  Computer Science
1          Bob           Physics
2      Charlie              Math
3        David           Biology
4         Carl           Biology


In [16]:
# SQL equivalent: SELECT student_name FROM Students WHERE major = 'Computer Science'
# The WHERE condition becomes a boolean row mask; the SELECT list becomes a list of column labels.
is_cs_major = students_df['major'] == 'Computer Science'
cs_students = students_df.loc[is_cs_major, ['student_name']]
print(cs_students)

  student_name
0        Alice


In [17]:
# SQL equivalent:
#   SELECT student_name, course_name
#   FROM Students
#   JOIN Enrollments ON Students.student_id = Enrollments.student_id
#   JOIN Courses ON Enrollments.course_id = Courses.course_id

# Each merge below corresponds to one JOIN (merge performs an inner join by default);
# the final column selection plays the role of the SELECT list.
students_with_enrollments = students_df.merge(enrollments_df, on='student_id')
students_with_courses = students_with_enrollments.merge(courses_df, on='course_id')
joined_data = students_with_courses[['student_name', 'course_name']]
print(joined_data)

  student_name        course_name
0        Alice        Programming
1      Charlie        Programming
2        Alice            Algebra
3      Charlie            Algebra
4          Bob  Quantum Mechanics
5        David           Genetics
6         Carl           Genetics


In [18]:
# SQL equivalent:
#   SELECT major, COUNT(student_id)
#   FROM Students
#   GROUP BY major

# size() gives the number of rows in each 'major' group; reset_index turns the
# resulting Series into a two-column DataFrame with the counts named 'number_of_students'.
students_per_major = students_df.groupby('major').size()
grouped_data = students_per_major.reset_index(name='number_of_students')
print(grouped_data)


              major  number_of_students
0           Biology                   2
1  Computer Science                   1
2              Math                   1
3           Physics                   1


In [19]:
# SQL equivalent:
#   SELECT major, COUNT(student_id)   -- each major together with its number of students
#   FROM Students
#   GROUP BY major                    -- one group per distinct value in the major column
#   HAVING COUNT(student_id) > 1      -- keep only the groups that contain more than one student

# HAVING filters the *aggregated* rows, so aggregate once and then filter on the counts
# (instead of filtering the raw rows and grouping a second time).
major_counts = students_df.groupby('major').size().reset_index(name='number_of_students')

# reset_index(drop=True) renumbers the surviving rows from 0.
grouped_having_data = major_counts[major_counts['number_of_students'] > 1].reset_index(drop=True)
print(grouped_having_data)

     major  number_of_students
0  Biology                   2


# BUILDING A SQLITE DB FROM CSV

## Read the file into df

In [20]:
# Load the Netflix titles CSV (path is relative to the notebook's working directory).
# NOTE(review): the preview shows an "Unnamed: 0" column - presumably an index that was
# saved into the CSV; confirm before relying on it.
netflix_csv_path = 'netflix_titles.csv'
netflix_df = pd.read_csv(netflix_csv_path)

# Preview the first rows of the loaded frame.
netflix_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


## Convert into SQLite DB 

In [21]:
# Open the SQLite database file (it is created if it does not exist yet).
# sqlite3 is imported in the imports cell at the top of the notebook.
conn = sqlite3.connect('netflix_db.sqlite')

# Shared cursor, reused by the raw-SQL cells further down.
cursor = conn.cursor()

# Store the DataFrame as a table named 'netflix'.
# if_exists='replace' drops and recreates the table, so re-running this cell neither
# fails nor duplicates rows; index=False keeps the DataFrame index out of the table.
netflix_df.to_sql('netflix', conn, if_exists='replace', index=False)


8807

## Basic SQL Commands for Exploratory Data Analysis

In [28]:
# Run "SELECT * FROM netflix LIMIT 10" and show the result as a pandas DataFrame.
pd.read_sql(sql='SELECT * FROM netflix LIMIT 10', con=conn)




Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",,"September 24, 2021",2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...
6,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,"September 24, 2021",2021,PG,91 min,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...


In [24]:
# How many Movies and how many TV Shows does the table contain?
query = """
SELECT type, COUNT(type) AS count
FROM netflix
GROUP BY type
"""

# execute() returns the cursor itself, so running and fetching can be chained.
# Each fetched row is a (type, count) tuple.
results = cursor.execute(query).fetchall()
results

[('Movie', 6131), ('TV Show', 2676)]

In [25]:
# Titles whose release_year is strictly greater than 2020.

query = """
SELECT title, release_year
FROM netflix
WHERE release_year > 2020
"""

# execute() returns the cursor itself, so running and fetching can be chained.
# Each fetched row is a (title, release_year) tuple.
results = cursor.execute(query).fetchall()

# Show the fetched rows as the cell output.
results

[('Blood & Water', 2021),
 ('Ganglands', 2021),
 ('Jailbirds New Orleans', 2021),
 ('Kota Factory', 2021),
 ('Midnight Mass', 2021),
 ('My Little Pony: A New Generation', 2021),
 ('The Great British Baking Show', 2021),
 ('The Starling', 2021),
 ('Vendetta: Truth, Lies and The Mafia', 2021),
 ('Bangkok Breaking', 2021),
 ('Je Suis Karl', 2021),
 ('Confessions of an Invisible Girl', 2021),
 ('Crime Stories: India Detectives', 2021),
 ('Dear White People', 2021),
 ('Intrusion', 2021),
 ('Jaguar', 2021),
 ('Monsters Inside: The 24 Faces of Billy Milligan', 2021),
 ('Go! Go! Cory Carson: Chrissy Takes the Wheel', 2021),
 ('Love on the Spectrum', 2021),
 ('Ankahi Kahaniya', 2021),
 ('Chicago Party Aunt', 2021),
 ('Squid Game', 2021),
 ('The Father Who Moves Mountains', 2021),
 ('The Stronghold', 2021),
 ('Chhota Bheem', 2021),
 ('He-Man and the Masters of the Universe', 2021),
 ('My Heroes Were Cowboys', 2021),
 ('Castle and Castle', 2021),
 ('Nailed It', 2021),
 ('Nightbooks', 2021),
 ('Nu

## More Advanced SQL Commands

In [26]:
# Number of titles per distinct value of the country column, most frequent first.
# The column is grouped as-is: a multi-country string such as 'Argentina, France'
# forms its own group, and rows with no country are counted under NULL (shown as None).

query = """
SELECT country, COUNT(*) as count
FROM netflix
GROUP BY country
ORDER BY count DESC;
"""
result = pd.read_sql(sql=query, con=conn)
print(result)


                                               country  count
0                                        United States   2818
1                                                India    972
2                                                 None    831
3                                       United Kingdom    419
4                                                Japan    245
..                                                 ...    ...
744                                  Argentina, France      1
745                             Argentina, Chile, Peru      1
746  Argentina, Brazil, France, Poland, Germany, De...      1
747                                      , South Korea      1
748                                  , France, Algeria      1

[749 rows x 2 columns]


In [29]:
# Find the average duration of movies (in minutes).
#
# The duration column holds text such as '90 min', so the numeric part is extracted
# explicitly: CAST(... AS INTEGER) takes the leading integer prefix of the text.
# This makes the conversion deliberate instead of relying on AVG coercing text implicitly.
# Rows with a NULL duration stay NULL after the cast and are ignored by AVG.

query = '''
SELECT AVG(CAST(duration AS INTEGER)) AS avg_duration_minutes
FROM netflix
WHERE type = 'Movie';
'''

result = pd.read_sql(query, conn)
print(result)

   AVG(duration)
0      99.577187


In [30]:
# Directors credited on more than one title, most prolific first.
# Rows with a NULL director form a group of their own, which is why the output
# also contains a None row.

query = """
SELECT director, COUNT(*) as count
FROM netflix
GROUP BY director
HAVING count > 1
ORDER BY count DESC;
"""

result = pd.read_sql(sql=query, con=conn)
print(result)

                   director  count
0                      None   2634
1             Rajiv Chilaka     19
2    Raúl Campos, Jan Suter     18
3               Suhas Kadav     16
4              Marcus Raboy     16
..                      ...    ...
863           Abhijit Panse      2
864          Abba T. Makama      2
865            Aaron Sorkin      2
866         A.R. Murugadoss      2
867             A. L. Vijay      2

[868 rows x 2 columns]


In [31]:
# Directors credited on more than one title, this time excluding titles that have
# no director: the WHERE clause removes NULL directors before the rows are grouped.
# (This query still uses GROUP BY; only the NULL handling differs from the previous one.)

query = """
SELECT director, COUNT(*) as count
FROM netflix
WHERE director IS NOT NULL
GROUP BY director
HAVING count > 1;
"""
result = pd.read_sql(sql=query, con=conn)
print(result)


                                              director  count
0                                          A. L. Vijay      2
1                                      A.R. Murugadoss      2
2                                         Aaron Sorkin      2
3                                       Abba T. Makama      2
4    Abbas Alibhai Burmawalla, Mastan Alibhai Burma...      4
..                                                 ...    ...
862                                        Zoya Akhtar      3
863                          Àlex Pastor, David Pastor      2
864                                 Álex de la Iglesia      2
865                                   Ömer Faruk Sorak      2
866                                       Şenol Sönmez      2

[867 rows x 2 columns]


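**Why the difference?**

The first query returns 868 rows and the second 867. The extra row comes from the handling of NULL values: without `WHERE director IS NOT NULL`, every title with a missing director falls into a single `NULL` group (shown as `None`, count 2634), and that group also passes `HAVING count > 1`.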

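## Same result without GROUP BY (correlated subquery instead of an aggregate)

**This is highly inefficient!** The subquery is conceptually re-evaluated for every row of the table, instead of aggregating the table once.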

In [40]:
# Same question (directors with more than one title) answered without GROUP BY:
# a director qualifies if another row exists with the same director but a different show_id.
# Because `n1.director = n2.director` is never true when a director is NULL,
# titles without a director drop out of the result.
# The original author flags this formulation as inefficient compared with GROUP BY / HAVING.

query = """
SELECT DISTINCT director 
FROM netflix n1
WHERE exists(
   select director
   from netflix n2
   where n1.director = n2.director AND n1.show_id !=  n2.show_id
)
"""


result = pd.read_sql(sql=query, con=conn)
print(result)


                    director
0            Julien Leclercq
1              Mike Flanagan
2              Bruno Garotti
3            Olivier Megaton
4    Alex Woo, Stanley Moore
..                       ...
862              John Duigan
863          Hadi El Bagoury
864              S.S. Wilson
865            Jason Reitman
866  Saratswadee Wongsomphet

[867 rows x 1 columns]
