In [1]:
# Step 1: Import necessary libraries
import os
import zipfile
import requests
from google.colab import drive

# Step 2: Mount Google Drive at /content/drive
drive.mount('/content/drive')

# Step 3: Define the target directory on Google Drive
drive_path = '/content/drive/My Drive/Colab Notebooks/nlp_pro_babu'
corpus_name = "data"
corpus_path = os.path.join(drive_path, corpus_name)

# Step 4: Create the target directory (and any missing parents).
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of testing os.path.exists() first.
os.makedirs(corpus_path, exist_ok=True)

# Step 5: Download the Cornell Movie Dialogs dataset and save it to the specified path
dataset_url = "https://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip"
dataset_zip_path = os.path.join(corpus_path, "cornell_movie_dialogs_corpus.zip")

# Downloading the dataset
print("Downloading dataset...")
# stream=True writes the archive to disk chunk by chunk instead of holding the
# whole file in memory; the (connect, read) timeout keeps a stalled connection
# from hanging the cell indefinitely.
with requests.get(dataset_url, stream=True, timeout=(10, 60)) as response:
    # Fail here on an HTTP error rather than saving an error page as a ".zip"
    # that would only blow up later during extraction.
    response.raise_for_status()
    with open(dataset_zip_path, "wb") as file:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            file.write(chunk)
print("Dataset downloaded successfully!")

# Step 6: Unpack the downloaded archive inside the corpus directory
print("Extracting dataset...")
with zipfile.ZipFile(dataset_zip_path) as archive:  # read mode is the default
    # The archive's contents end up in a 'movie-corpus' folder under corpus_path
    archive.extractall(path=corpus_path)
print("Dataset extracted successfully into 'movie-corpus' folder!")

# Step 7: Show what the corpus directory holds at this point.
# NOTE: the listing is taken before the archive is deleted below, so the
# zip file still appears in it.
print("\nContents of 'movie-corpus' directory on Google Drive:")
directory_entries = os.listdir(corpus_path)
print(directory_entries)

# Clean up: delete the downloaded archive now that it has been extracted
os.unlink(dataset_zip_path)
print("Cleaned up the zip file.")


Mounted at /content/drive
Downloading dataset...
Dataset downloaded successfully!
Extracting dataset...
Dataset extracted successfully into 'movie-corpus' folder!

Contents of 'movie-corpus' directory on Google Drive:
['cornell_movie_dialogs_corpus.zip', 'movie-corpus']
Cleaned up the zip file.
