In [0]:
%load_ext ai_code_assistant

In [0]:
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import os

In [0]:
# Build one DSS API client and reuse it to resolve the default project,
# instead of instantiating a second client just for the project lookup.
client = dataiku.api_client()
project = client.get_default_project()


In [0]:
# Expose the Kaggle credentials stored as DSS user secrets through environment variables.
# The Kaggle clients document KAGGLE_USERNAME and KAGGLE_KEY as the variables they read,
# so the "KAGGLE_API_KEY" secret is exported under KAGGLE_KEY as well. The original
# KAGGLE_API_KEY variable is still set so anything relying on it keeps working.
auth_info = client.get_auth_info(with_secrets=True)
secret_to_env_vars = {
    "KAGGLE_API_KEY": ("KAGGLE_API_KEY", "KAGGLE_KEY"),
    "KAGGLE_USERNAME": ("KAGGLE_USERNAME",),
}
found_secrets = set()
for secret in auth_info["secrets"]:
    env_vars = secret_to_env_vars.get(secret["key"])
    if env_vars is None:
        # Unrelated user secret: leave it alone.
        continue
    for env_var in env_vars:
        os.environ[env_var] = secret["value"]
    found_secrets.add(secret["key"])

# Report missing credentials instead of silently continuing without them.
missing_secrets = sorted(set(secret_to_env_vars) - found_secrets)
if missing_secrets:
    print("Missing DSS user secret(s): " + ", ".join(missing_secrets))


In [0]:
# Look up the id of the project's managed folder named "data",
# which is where the dataset will be stored.
folder_id = next(
    (folder["id"] for folder in project.list_managed_folders() if folder["name"] == "data"),
    None,
)
if folder_id is None:
    # Stop here with an explicit message: the rest of the notebook needs a valid folder id.
    raise ValueError("Folder 'data' not found!")


In [0]:
# Open the managed folder identified by `folder_id` and resolve its path;
# `folder_path` is used below as the download and read location.
# NOTE(review): get_path() presumably requires a folder hosted on the local
# filesystem of the DSS server -- confirm for this project's "data" folder.
folder = dataiku.Folder(folder_id)
folder_path = folder.get_path()
# Print the resolved path so it can be checked in the cell output
print(folder_path)


In [0]:
import kaggle

In [0]:
# Fetch the Kaggle dataset archive into the managed folder and extract it there.
# `dataset_slug` is the "<owner>/<dataset>" identifier on Kaggle.
dataset_slug = "kazanova/sentiment140"
kaggle.api.dataset_download_files(
    dataset_slug,
    path=folder_path,
    unzip=True,
)


In [0]:
# Create the dataframe from the downloaded CSV file.
# The extracted file name is not guaranteed to match the dataset title, so the
# CSV is located by scanning the download folder instead of guessing its name.
csv_file_names = sorted(
    file_name
    for file_name in os.listdir(folder_path)
    if file_name.lower().endswith(".csv")
)
if not csv_file_names:
    raise FileNotFoundError(f"No CSV file found in {folder_path!r}")
if len(csv_file_names) > 1:
    # Refuse to pick one arbitrarily when the folder holds several CSV files.
    raise ValueError(f"Several CSV files found in {folder_path!r}: {csv_file_names}")
dataset_name = csv_file_names[0]

# NOTE(review): the Sentiment140 CSV is commonly reported to be Latin-1 encoded
# (the pandas default of UTF-8 fails on it) and to ship without a header row.
# The encoding is set explicitly here; header handling is left unchanged so the
# output schema stays as before -- confirm whether `header=None` is needed.
annotated_tweets_df = pd.read_csv(
    os.path.join(folder_path, dataset_name),
    encoding="latin-1",
)


In [0]:
# Write recipe outputs: store the dataframe in the DSS dataset "annotated_tweets".
# write_with_schema() is given the dataframe so the schema is written along with the rows.
annotated_tweets = dataiku.Dataset("annotated_tweets")
annotated_tweets.write_with_schema(annotated_tweets_df)


In [0]:
import kagglehub

# Alternative download route through kagglehub: fetch the latest version of the
# dataset and keep the location it returns.
path = kagglehub.dataset_download("kazanova/sentiment140")

# Report where the files ended up
print(f"Path to dataset files: {path}")

In [0]:
import mlcroissant as mlc


In [0]:

# Load the dataset description (Croissant JSON-LD) exposed by Kaggle
croissant_url = 'https://www.kaggle.com/datasets/kazanova/sentiment140/croissant/download'
croissant_dataset = mlc.Dataset(croissant_url)

# List the record sets declared in the dataset metadata
record_sets = croissant_dataset.metadata.record_sets
print(record_sets)

# Read the records of the first record set into a DataFrame and preview it
first_record_set_uuid = record_sets[0].uuid
record_set_df = pd.DataFrame(croissant_dataset.records(record_set=first_record_set_uuid))
record_set_df.head()


In [0]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
# Load the DSS dataset "tweets" as a pandas dataframe; `df` is the frame
# explored in the cells below.
tweets = dataiku.Dataset("tweets")
df = tweets.get_dataframe()

In [0]:
# Preview the first rows of the dataframe
df.head()

In [0]:
# Summarize the columns, dtypes and non-null counts
df.info()

In [0]:
# List the distinct values found in the 'flag' column
df['flag'].unique()

In [0]:
# Count the distinct values in the 'user' column
df['user'].nunique()

In [0]:
# Add two length features derived from the tweet text:
#   tweet_length_chars -- number of characters in the text
#   tweet_length_words -- number of whitespace-separated tokens
# `.str.len()` is applied to the split result instead of `.apply(len)` so a
# missing text yields NaN rather than raising TypeError on a float.
df['tweet_length_chars'] = df['text'].str.len()
df['tweet_length_words'] = df['text'].str.split().str.len()
df.head()

In [0]:
# Histogram of tweet lengths measured in characters.
# Missing lengths (tweets without text) are dropped first, since NaN values
# cannot be binned.
plt.figure(figsize=(10,5))
plt.hist(df['tweet_length_chars'].dropna(), bins=20, color='blue', alpha=0.7)
plt.xlabel("Tweet length (in characters)")  # fixed the "lenght" typo in the label
plt.ylabel("Number of tweets")
plt.title("Distribution of tweets length")
plt.show()