# Download Netflix Descriptions

Adapted from: https://www.kaggle.com/code/nulldata/fine-tuning-gpt-2-to-generate-netlfix-descriptions/notebook

Which in turn was adapted from: https://medium.com/geekculture/fine-tune-eleutherai-gpt-neo-to-generate-netflix-movie-descriptions-in-only-47-lines-of-code-40c9b4c32475

## Setup

In [1]:
# Detect whether the notebook is running on Google Colab: the `google.colab`
# package is only importable there. Catch ImportError specifically so that
# unrelated failures (e.g. KeyboardInterrupt) are not silently swallowed.
try:
    import google.colab  # noqa: F401  (imported only to probe availability)
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

In [2]:
%%capture
if IN_COLAB:
    
    # Remove the unneeded Python 2.7 directories to free disk space.
    !rm -rf "/usr/local/lib/python2.7"
    !rm -rf "/usr/lib/python2.7"

    # Clone the repo.
    # !git clone ""

    # Change the working directory to the repo root.
    # %cd

    # Add the repo root to the Python path.
    # import sys, os
    # sys.path.append(os.getcwd())
    
    # Install packages that are not native to Colab.
    !pip install python-dotenv
    # !pip install transformers
    !pip install wandb --upgrade
    !pip install pandas-profiling --upgrade

    # Mount Google Drive to access the env file.
    from google.colab import drive
    drive.mount('/content/gdrive')

    # Load environment variables from the env file stored on the mounted drive.
    # NOTE: Google Drive won't allow you to mount dotfiles.
    # NOTE(review): the path below is relative while the mount point above is
    # absolute; presumably this relies on the working directory being /content — confirm.
    from dotenv import load_dotenv
    load_dotenv("./gdrive/MyDrive/my_env_file")

In [3]:
import wandb
# Log in to Weights & Biases before any run is started in this notebook.
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33ma-sh0ts[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# !nvidia-smi

In [5]:
# torch.manual_seed(42)

## Download Data from Kaggle into Wandb Artifact with Automated Analysis

In [6]:
from kaggle.api.kaggle_api_extended import KaggleApi
import os
from zipfile import ZipFile
import pandas as pd
from pandas_profiling import ProfileReport

In [7]:
# Kaggle dataset coordinates: the uploader's user name and the dataset name.
kaggle_user_uploader, kaggle_dataset_name = "shivamb", "netflix-shows"

In [8]:
def add_convert_for_wandb(artifact, path, profile=True, sample=10000):
    """Add a data directory to a W&B artifact, plus one table per CSV file.

    The whole directory is added to the artifact under the name "data".
    Every ``.csv`` file directly inside ``path`` is additionally read with
    pandas and logged as a ``wandb.Table`` named after the file (extension
    removed). Optionally a pandas-profiling HTML report is generated for the
    full dataframe and attached as ``"<name>_profile"``.

    Args:
        artifact: W&B artifact to populate; must provide ``add_dir`` and
            ``add``.
        path: Directory containing the raw data files.
        profile: When true, build a minimal ``ProfileReport`` per CSV and
            attach it to the artifact as HTML. The report file is written to
            the current working directory as ``"<name>.html"``.
        sample: Row threshold for the logged table. Dataframes with fewer
            rows are logged whole; otherwise ``sample`` rows are drawn at
            random with ``DataFrame.sample``.

    Returns:
        None. The artifact is modified in place.
    """
    artifact.add_dir(path, name="data")

    for file_name in os.listdir(path):
        # Split off only the real extension; str.replace(".csv", "") would
        # also strip ".csv" occurrences in the middle of a file name.
        tab_name, extension = os.path.splitext(file_name)
        if extension != ".csv":
            continue

        path_to_file = os.path.join(path, file_name)
        print(f"adding {tab_name}")
        df = pd.read_csv(path_to_file)
        print(f"{tab_name}:{df.shape}")

        # Only the table is down-sampled; the profile below uses the full df.
        sampled_df = df if df.shape[0] < sample else df.sample(sample)

        table = wandb.Table(dataframe=sampled_df)
        artifact.add(table, name=tab_name)

        if profile:
            # The profile report is rendered to an HTML file, which is then
            # logged to W&B under the same artifact.
            data_profile = ProfileReport(
                df, dark_mode=True, title=tab_name, minimal=True)
            profile_path = f"{tab_name}.html"
            data_profile.to_file(profile_path)
            data_table_profile = wandb.Html(profile_path)
            artifact.add(data_table_profile, f"{tab_name}_profile")

In [9]:
def download_kaggle_data(user_uploader: str = "shivamb",
                         dataset_name: str = "netflix-shows",
                         project_name: str = "gpt2-netflix", **kwargs):
    """Download a Kaggle dataset and log it to W&B as a ``raw_data`` artifact.

    Starts a W&B run with job type "download", downloads the dataset archive
    through the Kaggle API, extracts it into ``./data/raw``, deletes the
    archive, and logs the extracted directory (with per-CSV tables and
    profiles, via ``add_convert_for_wandb``) as an artifact named after the
    dataset.

    Args:
        user_uploader: Kaggle user name that owns the dataset.
        dataset_name: Kaggle dataset name.
        project_name: W&B project the run is logged to.
        **kwargs: Accepted for call compatibility; currently unused.

    Returns:
        None.
    """
    print(f"starting new run for {project_name}")
    _conf = {
        "kaggle_user_uploader": user_uploader,
        "kaggle_dataset_name": dataset_name
    }

    # TODO: Remove hack to add data description
    # Default to no description so datasets other than "netflix-shows" do not
    # hit an unbound local variable when the artifact is created.
    data_description = None
    if dataset_name == "netflix-shows":
        data_description = """
            About this Dataset: Netflix is one of the most popular media and video streaming platforms. 
            They have over 8000 movies or tv shows available on their platform, as of mid-2021, they have over 200M Subscribers globally. 
            This tabular dataset consists of listings of all the movies and tv shows available on Netflix, along with details such as - cast, directors, ratings, release year, duration, etc.
        """

    # Use the run as a context manager so it is always finished, including
    # when the download or extraction raises.
    with wandb.init(
            project=project_name, job_type="download",
            name=f"log-{dataset_name}", config=_conf) as run:

        # Read the values back from the run config so everything below uses
        # the same source of truth as what is recorded in W&B.
        conf = run.config
        kaggle_user_uploader = conf["kaggle_user_uploader"]
        kaggle_dataset_name = conf["kaggle_dataset_name"]
        dataset_stub = f"{kaggle_user_uploader}/{kaggle_dataset_name}"

        api = KaggleApi()
        api.authenticate()
        api.dataset_download_files(dataset_stub)

        # Build the archive name from the same dataset name that was used
        # for the download request.
        zip_path = f"{kaggle_dataset_name}.zip"
        path_to_raw = os.path.join(".", "data", "raw")
        with ZipFile(zip_path) as archive:
            archive.extractall(path=path_to_raw)
        os.remove(zip_path)

        raw_data_artifact = wandb.Artifact(
            name=kaggle_dataset_name, type="raw_data",
            description=data_description)
        add_convert_for_wandb(raw_data_artifact, path_to_raw)

        run.log_artifact(raw_data_artifact)

In [10]:
# Run the download with the default arguments: the "shivamb/netflix-shows"
# dataset, logged to the "gpt2-netflix" W&B project.
download_kaggle_data()

starting new run for gpt2-netflix


[34m[1mwandb[0m: Adding directory to artifact (./data/raw)... Done. 0.1s


adding netflix_titles
netflix_titles:(8807, 12)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Label(value='7.201 MB of 7.201 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…