In [1]:
%pip install kaggle psycopg2-binary sqlalchemy pandas requests beautifulsoup4 lxml python-dotenv

Collecting kaggle
  Downloading kaggle-1.7.4.2-py3-none-any.whl.metadata (16 kB)
Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting sqlalchemy
  Downloading sqlalchemy-2.0.40-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting lxml
  Downloading lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.5 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting protobuf (from kaggle)
  Downloading protobuf-6.30.2-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting python-slugify (from kaggle)
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting text-unidecode (from kaggle)
  Downloading text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Collecting tqdm (from kaggle)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting greenlet>=

In [14]:
import os
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

In [3]:
load_dotenv()

os.environ["KAGGLE_USERNAME"] = os.getenv("KAGGLE_USERNAME")
os.environ["KAGGLE_KEY"] = os.getenv("KAGGLE_KEY")

api = KaggleApi()
api.authenticate()

dataset_slug = "alanjo/graphics-card-full-specs"
api.dataset_download_files(dataset_slug, path='./data/kaggle', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/alanjo/graphics-card-full-specs


In [4]:
for root, dirs, files in os.walk("./data/kaggle"):
    for file in files:
        print(file)

gpu_specs_v6.csv
gpu_specs_v7.csv


In [5]:
df = pd.read_csv("./data/kaggle/gpu_specs_v7.csv")

df.reset_index(drop=True, inplace=True)
print(df.head())

  manufacturer              productName  releaseYear  memSize  memBusWidth  \
0       NVIDIA         GeForce RTX 5090       2025.0     28.0        448.0   
1       NVIDIA         GeForce RTX 5080       2025.0     16.0        256.0   
2       NVIDIA         GeForce RTX 5070       2025.0     12.0        192.0   
3       NVIDIA  GeForce RTX 5060 Mobile       2025.0      8.0        128.0   
4       NVIDIA         GeForce RTX 5060       2025.0      8.0        128.0   

   gpuClock  memClock  unifiedShader  tmu  rop  pixelShader  vertexShader igp  \
0       900    1200.0         8192.0  256  128          NaN           NaN  No   
1       900    1215.0         6912.0  432  192          NaN           NaN  No   
2      1825    2000.0         5120.0  320  128          NaN           NaN  No   
3      2235    2500.0         4608.0  144   48          NaN           NaN  No   
4      1825    2250.0         4608.0  288  192          NaN           NaN  No   

            bus memType       gpuChip  
0  P

In [6]:
print(df.columns)
print(df.info)

Index(['manufacturer', 'productName', 'releaseYear', 'memSize', 'memBusWidth',
       'gpuClock', 'memClock', 'unifiedShader', 'tmu', 'rop', 'pixelShader',
       'vertexShader', 'igp', 'bus', 'memType', 'gpuChip'],
      dtype='object')
<bound method DataFrame.info of      manufacturer              productName  releaseYear  memSize  memBusWidth  \
0          NVIDIA         GeForce RTX 5090       2025.0   28.000        448.0   
1          NVIDIA         GeForce RTX 5080       2025.0   16.000        256.0   
2          NVIDIA         GeForce RTX 5070       2025.0   12.000        192.0   
3          NVIDIA  GeForce RTX 5060 Mobile       2025.0    8.000        128.0   
4          NVIDIA         GeForce RTX 5060       2025.0    8.000        128.0   
...           ...                      ...          ...      ...          ...   
3051          ATI     Radeon X1600 XT Dual          NaN    0.256          NaN   
3052          AMD        Radeon Pro V5300X          NaN    4.000          NaN   
3

In [11]:
load_dotenv()

pg_user = os.getenv('PG_USER')
pg_password = os.getenv('PG_PASSWORD')
pg_host = os.getenv('PG_HOST')
pg_port = os.getenv('PG_PORT')
pg_db = os.getenv('PG_GPU_DB')

pg_conn_str = f"postgresql+psycopg2://{pg_user}:{pg_password}@{pg_host}:{pg_port}/{pg_db}"
pg_engine = create_engine(pg_conn_str)

In [12]:
print("Shape of DataFrame:", df.shape)
print(df.head())

Shape of DataFrame: (3056, 16)
  manufacturer              productName  releaseYear  memSize  memBusWidth  \
0       NVIDIA         GeForce RTX 5090       2025.0     28.0        448.0   
1       NVIDIA         GeForce RTX 5080       2025.0     16.0        256.0   
2       NVIDIA         GeForce RTX 5070       2025.0     12.0        192.0   
3       NVIDIA  GeForce RTX 5060 Mobile       2025.0      8.0        128.0   
4       NVIDIA         GeForce RTX 5060       2025.0      8.0        128.0   

   gpuClock  memClock  unifiedShader  tmu  rop  pixelShader  vertexShader igp  \
0       900    1200.0         8192.0  256  128          NaN           NaN  No   
1       900    1215.0         6912.0  432  192          NaN           NaN  No   
2      1825    2000.0         5120.0  320  128          NaN           NaN  No   
3      2235    2500.0         4608.0  144   48          NaN           NaN  No   
4      1825    2250.0         4608.0  288  192          NaN           NaN  No   

            b

In [15]:
with pg_engine.connect() as conn:
    result = conn.execute(text("SELECT schema_name FROM information_schema.schemata WHERE schema_name = 'sql_project';"))
    for row in result:
        print(row)

('sql_project',)


In [16]:
df.to_sql(name="gpu_specs_raw", con=pg_engine, schema="sql_project", if_exists="replace", index=False) 

56