In [1]:
# Charger l'extension
%load_ext webhdfsmagic

The webhdfsmagic extension is already loaded. To reload it, use:
  %reload_ext webhdfsmagic


In [2]:
# Afficher l'aide
%hdfs help

Command,Description
%hdfs help,Display this help
"%hdfs setconfig {""knox_url"": ""..."", ""webhdfs_api"": ""..."",  ""username"": ""..."", ""password"": ""..."", ""verify_ssl"": false}",Set configuration and credentials directly in the notebook
%hdfs ls [path],List files on HDFS
%hdfs mkdir <path>,Create a directory on HDFS
%hdfs rm <path or pattern> [-r],Delete a file/directory. Supports wildcards.  Example: %hdfs rm /user/files* [-r]
%hdfs put <local_file_or_pattern> <hdfs_destination>,"Upload one or more local files (wildcards allowed) to HDFS.  If the HDFS path ends with '/' or '.', the original file name is preserved."
%hdfs get <hdfs_file_or_pattern> <local_destination>,"Download one or more files from HDFS.  If the local destination is a directory (or "".""/~),  the original file name is appended."
%hdfs cat <file> [-n <number_of_lines>],"Display file content. Default is 100 lines.  Use ""-n -1"" to display the full file."
%hdfs chmod [-R] <permission> <path>,"Set permissions (SETPERMISSION).  The ""-R"" option applies recursively."
%hdfs chown [-R] <user:group> <path>,"Set owner and group (SETOWNER).  The ""-R"" option applies recursively."


In [3]:
# V√©rifier la configuration
import json
import os

config_path = os.path.expanduser('~/.webhdfsmagic/config.json')
with open(config_path) as f:
    config = json.load(f)
    
print("Configuration actuelle:")
print(f"  URL: {config['knox_url']}{config['webhdfs_api']}")
print(f"  User: {config['username']}")
print(f"  SSL: {config['verify_ssl']}")

Configuration actuelle:
  URL: http://localhost:8080/gateway/default/webhdfs/v1
  User: testuser
  SSL: False


## 1Ô∏è‚É£ Test de Listing

In [4]:
# Lister le r√©pertoire racine
%hdfs ls /

Unnamed: 0,name,type,size,owner,group,permissions,block_size,modified,replication
0,data,DIR,0,testuser,supergroup,rwxr-xr-x,0,2025-12-04 12:10:49.489,0
1,demo,DIR,0,root,supergroup,rwxr-xr-x,0,2025-12-04 12:17:39.846,0
2,test_mkdir_direct,DIR,0,testuser,supergroup,rwxr-xr-x,0,2025-12-04 10:57:06.101,0
3,test_via_magic,DIR,0,testuser,supergroup,rwxr-xr-x,0,2025-12-04 10:57:16.778,0
4,test_webhdfs,DIR,0,root,supergroup,rwxr-xr-x,0,2025-12-04 10:49:59.125,0


## 2Ô∏è‚É£ Cr√©ation de R√©pertoires

In [5]:
# Cr√©er un r√©pertoire de test
%hdfs mkdir /demo

{'boolean': True}

In [6]:
# Cr√©er une structure de r√©pertoires imbriqu√©s
%hdfs mkdir /demo/data/2024/12

{'boolean': True}

In [7]:
# V√©rifier la cr√©ation
%hdfs ls /

Unnamed: 0,name,type,size,owner,group,permissions,block_size,modified,replication
0,data,DIR,0,testuser,supergroup,rwxr-xr-x,0,2025-12-04 12:10:49.489,0
1,demo,DIR,0,root,supergroup,rwxr-xr-x,0,2025-12-04 12:17:39.846,0
2,test_mkdir_direct,DIR,0,testuser,supergroup,rwxr-xr-x,0,2025-12-04 10:57:06.101,0
3,test_via_magic,DIR,0,testuser,supergroup,rwxr-xr-x,0,2025-12-04 10:57:16.778,0
4,test_webhdfs,DIR,0,root,supergroup,rwxr-xr-x,0,2025-12-04 10:49:59.125,0


In [8]:
# Lister le contenu du r√©pertoire demo
%hdfs ls /demo/data/

Unnamed: 0,name,type,size,owner,group,permissions,block_size,modified,replication
0,2024,DIR,0,root,supergroup,rwxr-xr-x,0,2025-12-04 10:56:15.400,0
1,clients.csv,FILE,178,testuser,supergroup,rw-r--r--,134217728,2025-12-04 12:17:39.376,3


## 3Ô∏è‚É£ Upload de Fichiers

In [9]:
# Cr√©er un fichier de test local
import pandas as pd

# Cr√©er des donn√©es de test
df = pd.DataFrame({
    'id': range(1, 11),
    'nom': [f'Client{i}' for i in range(1, 11)],
    'montant': [100.5 * i for i in range(1, 11)]
})

# Sauvegarder localement
df.to_csv('test_data.csv', index=False)
print("Fichier test_data.csv cr√©√©:")
print(df.head())

Fichier test_data.csv cr√©√©:
   id      nom  montant
0   1  Client1    100.5
1   2  Client2    201.0
2   3  Client3    301.5
3   4  Client4    402.0
4   5  Client5    502.5


In [10]:
# ‚ö†Ô∏è IMPORTANT: Recharger l'extension pour appliquer les corrections de redirection
import sys
import importlib

# Forcer le rechargement complet du module
if 'webhdfsmagic' in sys.modules:
    del sys.modules['webhdfsmagic']
if 'webhdfsmagic.magics' in sys.modules:
    del sys.modules['webhdfsmagic.magics']

# Recharger l'extension
%reload_ext webhdfsmagic
print("‚úì Extension recharg√©e avec les corrections")

‚úì Extension recharg√©e avec les corrections


In [11]:
# Upload vers HDFS
%hdfs put test_data.csv /demo/data/clients.csv

'/workspaces/webhdfsmagic/examples/test_data.csv uploaded successfully to /demo/data/clients.csv'

In [12]:
# Test manuel de l'upload avec debug
import requests
from urllib.parse import urlparse, urlunparse

config_path = os.path.expanduser('~/.webhdfsmagic/config.json')
with open(config_path) as f:
    config = json.load(f)

knox_url = config['knox_url']
webhdfs_api = config['webhdfs_api']
username = config['username']
password = config['password']

# Step 1: Initier la cr√©ation
init_url = f"{knox_url}{webhdfs_api}/demo/data/test_manual.csv"
init_params = {"op": "CREATE", "overwrite": "true"}

print("Step 1: Initiation de la cr√©ation...")
init_response = requests.put(
    init_url,
    params=init_params,
    auth=(username, password),
    verify=False,
    allow_redirects=False,
)
print(f"  Status: {init_response.status_code}")
print(f"  Headers: {dict(init_response.headers)}")

if init_response.status_code == 307:
    redirect_url = init_response.headers.get("Location")
    print(f"\n  Redirect original: {redirect_url}")
    
    # Reconstruire l'URL
    parsed = urlparse(redirect_url)
    gateway_parsed = urlparse(knox_url)
    fixed_url = urlunparse((
        gateway_parsed.scheme,
        gateway_parsed.netloc,
        parsed.path,
        parsed.params,
        parsed.query,
        parsed.fragment
    ))
    print(f"  URL corrig√©e: {fixed_url}")
    
    # Step 2: Upload
    print("\nStep 2: Upload du fichier...")
    with open('test_data.csv', 'rb') as f:
        upload_response = requests.put(
            fixed_url,
            data=f,
            auth=(username, password),
            verify=False,
            allow_redirects=False,
        )
    print(f"  Status: {upload_response.status_code}")
    print(f"  Headers: {dict(upload_response.headers)}")
    
    if upload_response.status_code == 307:
        print(f"  ‚ö†Ô∏è Encore une redirection vers: {upload_response.headers.get('Location')}")
    elif upload_response.status_code in [200, 201]:
        print("  ‚úÖ Upload r√©ussi!")
    else:
        print(f"  ‚ùå Erreur: {upload_response.text}")

Step 1: Initiation de la cr√©ation...
  Status: 307
  Headers: {'Server': 'nginx/1.29.3', 'Date': 'Thu, 04 Dec 2025 12:19:46 GMT', 'Content-Type': 'application/octet-stream', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Cache-Control': 'no-cache', 'Expires': 'Thu, 04 Dec 2025 12:19:46 GMT', 'Pragma': 'no-cache', 'X-Content-Type-Options': 'nosniff', 'X-FRAME-OPTIONS': 'SAMEORIGIN', 'X-XSS-Protection': '1; mode=block', 'Location': 'http://2045fadf12d5:9864/webhdfs/v1/demo/data/test_manual.csv?op=CREATE&namenoderpcaddress=namenode:9000&createflag=&createparent=true&overwrite=true'}

  Redirect original: http://2045fadf12d5:9864/webhdfs/v1/demo/data/test_manual.csv?op=CREATE&namenoderpcaddress=namenode:9000&createflag=&createparent=true&overwrite=true
  URL corrig√©e: http://localhost:8080/webhdfs/v1/demo/data/test_manual.csv?op=CREATE&namenoderpcaddress=namenode:9000&createflag=&createparent=true&overwrite=true

Step 2: Upload du fichier...
  Status: 405
  Headers: {'Serve

In [13]:
# V√©rifier que le fichier existe
%hdfs ls /demo/data

Unnamed: 0,name,type,size,owner,group,permissions,block_size,modified,replication
0,2024,DIR,0,root,supergroup,rwxr-xr-x,0,2025-12-04 10:56:15.400,0
1,clients.csv,FILE,178,testuser,supergroup,rw-r--r--,134217728,2025-12-04 12:19:46.787,3


## 4Ô∏è‚É£ Lecture de Fichiers

In [14]:
# Lire le contenu du fichier
%hdfs cat /demo/data/clients.csv

'id,nom,montant\n1,Client1,100.5\n2,Client2,201.0\n3,Client3,301.5\n4,Client4,402.0\n5,Client5,502.5\n6,Client6,603.0\n7,Client7,703.5\n8,Client8,804.0\n9,Client9,904.5\n10,Client10,1005.0'

In [15]:
# Lire seulement les premi√®res lignes
%hdfs cat -n 5 /demo/data/clients.csv

'id,nom,montant\n1,Client1,100.5\n2,Client2,201.0\n3,Client3,301.5\n4,Client4,402.0'

## 5Ô∏è‚É£ Download de Fichiers

In [16]:
# T√©l√©charger depuis HDFS
%hdfs get /demo/data/clients.csv ./downloaded_clients.csv

'/demo/data/clients.csv downloaded to ./downloaded_clients.csv'

In [17]:
# V√©rifier le fichier t√©l√©charg√©
df_downloaded = pd.read_csv('downloaded_clients.csv')
print("Fichier t√©l√©charg√© depuis HDFS:")
print(df_downloaded)

Fichier t√©l√©charg√© depuis HDFS:
   id       nom  montant
0   1   Client1    100.5
1   2   Client2    201.0
2   3   Client3    301.5
3   4   Client4    402.0
4   5   Client5    502.5
5   6   Client6    603.0
6   7   Client7    703.5
7   8   Client8    804.0
8   9   Client9    904.5
9  10  Client10   1005.0


## 6Ô∏è‚É£ Statistiques et M√©tadonn√©es

In [18]:
# Obtenir les stats du fichier
%hdfs stat /demo/data/clients.csv

'Unknown command: stat'

In [19]:
# Disk usage
%hdfs du /demo

'Unknown command: du'

In [20]:
# Disk usage en format lisible
%hdfs du -h /demo

'Unknown command: du'

## 7Ô∏è‚É£ Copie et D√©placement

In [21]:
# Cr√©er une copie via rename
%hdfs mv /demo/data/clients.csv /demo/data/clients_backup.csv

'Unknown command: mv'

In [22]:
# V√©rifier
%hdfs ls /demo/data

Unnamed: 0,name,type,size,owner,group,permissions,block_size,modified,replication
0,2024,DIR,0,root,supergroup,rwxr-xr-x,0,2025-12-04 10:56:15.400,0
1,clients.csv,FILE,178,testuser,supergroup,rw-r--r--,134217728,2025-12-04 12:19:46.787,3


## 8Ô∏è‚É£ Workflow Complet

In [23]:
# Cr√©er plusieurs fichiers de donn√©es
from datetime import datetime, timedelta

print("üìä G√©n√©ration de donn√©es de ventes...")

for i in range(3):
    date = datetime.now() - timedelta(days=i)
    date_str = date.strftime('%Y%m%d')
    
    # G√©n√©rer des donn√©es
    df_sales = pd.DataFrame({
        'date': [date.strftime('%Y-%m-%d')] * 10,
        'product_id': range(1, 11),
        'quantity': [10 + i*5 + j for j in range(10)],
        'price': [50.0 + j*10 for j in range(10)]
    })
    
    filename = f'sales_{date_str}.csv'
    df_sales.to_csv(filename, index=False)
    
    print(f"  Cr√©√©: {filename} ({len(df_sales)} lignes)")

print("\n‚úì Donn√©es g√©n√©r√©es")

üìä G√©n√©ration de donn√©es de ventes...
  Cr√©√©: sales_20251204.csv (10 lignes)
  Cr√©√©: sales_20251203.csv (10 lignes)
  Cr√©√©: sales_20251202.csv (10 lignes)

‚úì Donn√©es g√©n√©r√©es


In [24]:
# Cr√©er le r√©pertoire de destination
%hdfs mkdir -p /demo/sales/raw

'HTTP Error 405: <html>\r\n<head><title>405 Not Allowed</title></head>\r\n<body>\r\n<center><h1>405 Not Allowed</h1></center>\r\n<hr><center>nginx/1.29.3</center>\r\n</body>\r\n</html>\r\n'

In [25]:
# Upload de tous les fichiers
import glob

print("üì§ Upload des fichiers vers HDFS...\n")

for file in glob.glob('sales_*.csv'):
    remote_path = f'/demo/sales/raw/{file}'
    print(f"Uploading {file}...")
    %hdfs put {file} {remote_path}
    
print("\n‚úì Tous les fichiers upload√©s")

üì§ Upload des fichiers vers HDFS...

Uploading sales_20251203.csv...
Uploading sales_20251202.csv...
Uploading sales_20251204.csv...

‚úì Tous les fichiers upload√©s


In [26]:
# V√©rifier les fichiers upload√©s
print("üìÅ Fichiers dans HDFS:\n")
%hdfs ls /demo/sales/raw

üìÅ Fichiers dans HDFS:



Unnamed: 0,name,type,size,owner,group,permissions,block_size,modified,replication
0,sales_20251202.csv,FILE,247,testuser,supergroup,rw-r--r--,134217728,2025-12-04 12:19:47.324,3
1,sales_20251203.csv,FILE,247,testuser,supergroup,rw-r--r--,134217728,2025-12-04 12:19:47.285,3
2,sales_20251204.csv,FILE,247,testuser,supergroup,rw-r--r--,134217728,2025-12-04 12:19:47.352,3


In [27]:
# Calculer l'espace utilis√©
print("üíæ Espace disque utilis√©:\n")
%hdfs du -s -h /demo/sales

üíæ Espace disque utilis√©:



'Unknown command: du'

## 9Ô∏è‚É£ Nettoyage

In [28]:
# Supprimer un fichier
%hdfs rm /demo/data/clients_backup.csv

{'boolean': False}

In [29]:
# Supprimer un r√©pertoire (attention!)
# %hdfs rm -r /demo/sales

## ‚úÖ R√©sum√© des Tests

Si toutes les cellules ci-dessus se sont ex√©cut√©es avec succ√®s, webhdfsmagic fonctionne correctement avec votre cluster HDFS local !

### Fonctionnalit√©s test√©es:

- ‚úÖ Configuration et connexion
- ‚úÖ Listing de r√©pertoires (`ls`)
- ‚úÖ Cr√©ation de r√©pertoires (`mkdir`)
- ‚úÖ Upload de fichiers (`put`)
- ‚úÖ Lecture de fichiers (`cat`)
- ‚úÖ Download de fichiers (`get`)
- ‚úÖ Statistiques (`stat`, `du`)
- ‚úÖ D√©placement (`mv`)
- ‚úÖ Suppression (`rm`)
- ‚úÖ Workflow complet

### URLs utiles:

- **Interface HDFS NameNode**: http://localhost:9870
- **Gateway WebHDFS**: http://localhost:8080/gateway/default/webhdfs/v1/

### Pour arr√™ter l'environnement:

```bash
docker-compose down
# ou pour supprimer aussi les donn√©es:
docker-compose down -v
```