# IMDB DataSet Retriever

## Download IMDB Data and load into Pandas

In [2]:
!curl -o title.basics.tsv.gz https://datasets.imdbws.com/title.basics.tsv.gz
!curl -o title.ratings.tsv.gz https://datasets.imdbws.com/title.ratings.tsv.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  175M  100  175M    0     0  41.9M      0  0:00:04  0:00:04 --:--:-- 41.9MM      0  0:00:04  0:00:02  0:00:02 40.1M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 6854k  100 6854k    0     0  24.9M      0 --:--:-- --:--:-- --:--:-- 24.9M


In [13]:
import gzip, shutil
import pandas as pd

with gzip.open('title.basics.tsv.gz', 'rb') as f_in:
    with open('title.basics.tsv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
        
with gzip.open('title.ratings.tsv.gz', 'rb') as f_in:
    with open('title.ratings.tsv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
        
basics = pd.read_csv('title.basics.tsv', sep='\t',
    low_memory=False, na_values=['\\N'])
ratings = pd.read_csv('title.ratings.tsv', sep='\t',
    low_memory=False, na_values=['\\N'])
full_data = pd.merge(basics, ratings, on="tconst")
samples = full_data.sample(n=100,random_state=42)
samples.head()
        

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
779540,tt1683657,tvEpisode,Jersey Shore,Jersey Shore,0.0,2010.0,,22.0,Reality-TV,8.4,48
170640,tt0281379,tvEpisode,Dance of the Scorpions,Dance of the Scorpions,0.0,1997.0,,80.0,"Crime,Drama,Mystery",8.2,75
879731,tt21265550,movie,A Cut Above,A Cut Above,0.0,2022.0,,90.0,Comedy,4.7,199
1265981,tt6829484,tvEpisode,Bloods Rising,Bloods Rising,0.0,2017.0,,42.0,Reality-TV,7.2,40
1278399,tt7082530,tvEpisode,Morning,Morning,0.0,2019.0,,,"Crime,Drama,Mystery",9.4,7


In [7]:
print(samples.shape[0])

100


## Load data from Pandas to SQLLite

In [8]:
import pandas as pd
import sqlite3

# SQLite 데이터베이스 연결 및 커서 생성
conn = sqlite3.connect('example.db')
cursor = conn.cursor()

# 데이터프레임을 SQLite 테이블로 저장
samples.to_sql('my_table', conn, index=False, if_exists='replace')

# 테이블에 저장된 데이터 출력
query = "SELECT * FROM my_table"
result = pd.read_sql_query(query, conn)
print(result)

# 연결 종료
conn.close()


        tconst  titleType                       primaryTitle  \
0    tt1683657  tvEpisode                       Jersey Shore   
1    tt0281379  tvEpisode             Dance of the Scorpions   
2   tt21265550      movie                        A Cut Above   
3    tt6829484  tvEpisode                      Bloods Rising   
4    tt7082530  tvEpisode                            Morning   
..         ...        ...                                ...   
95   tt0862974  tvEpisode                    Hayaku ikitee'!   
96   tt5911540      movie                             Dangal   
97   tt1838593    tvMovie                       Madison High   
98   tt0034389      movie              Whistling in the Dark   
99   tt1346874      video  A Brief History of Flying Saucers   

                        originalTitle  isAdult  startYear  endYear  \
0                        Jersey Shore      0.0     2010.0      NaN   
1              Dance of the Scorpions      0.0     1997.0      NaN   
2                    

## Generate the SQL query by using LLM

In [19]:
from langchain.llms import OpenAI
from langchain.chains import create_sql_query_chain
from langchain_community.utilities import SQLDatabase
from langchain.schema import StrOutputParser

OPEN_AI_APIKEY="{YOUR_OPENAI_KEY}"
model = OpenAI(openai_api_key=OPEN_AI_APIKEY)

db = SQLDatabase.from_uri("sqlite:///example.db")
chain = create_sql_query_chain(model, db,k=20) |  StrOutputParser()
result = chain.invoke({"question": 
                         """Please provide a list of  movies that have an averageRating of 8.0 or higher 
                         and have been commercially available since 2008."""})
print(result)

SELECT "primaryTitle" FROM my_table WHERE "titleType" = 'movie' AND "averageRating" >= 8.0 AND "startYear" >= 2008 ORDER BY "averageRating" DESC LIMIT 20;
