# 📊 Analyse du Dataset Anthropic/AnthropicInterviewer

Dataset contenant 1,250 transcriptions d'entretiens sur l'utilisation de l'IA au travail.

**Source**: https://huggingface.co/datasets/Anthropic/AnthropicInterviewer

## 📦 Installation des dépendances

In [None]:
!pip install -q datasets pandas matplotlib seaborn wordcloud

## 🔍 Chargement du dataset

In [None]:
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
# Silence every warning category for the rest of the session so that library
# notices do not clutter the cell outputs.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn grid theme plus a wide default figure size.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Load every split of the dataset from the Hugging Face Hub.
print("üì• Chargement du dataset...")
dataset = load_dataset(path="Anthropic/AnthropicInterviewer")
print("‚úÖ Dataset charg√© avec succ√®s!")

## 📋 Exploration de la structure

In [None]:
# Summarise the loaded splits: their names, then each split's row count and
# column names, and finally the total number of rows across all splits.
available_splits = list(dataset.keys())
print("üìÇ Splits disponibles:", available_splits)
print("\nüìä Statistiques par split:")

grand_total = 0
for split_name, split_data in dataset.items():
    row_count = len(split_data)
    grand_total += row_count
    print(f"  - {split_name}: {row_count} entretiens")
    print(f"    Colonnes: {split_data.column_names}")

print(f"\nüìà Total: {grand_total} entretiens")

## 👀 Exemples d'entretiens

In [None]:
# Print the first record of every split, one field per paragraph.
banner = '=' * 80

for split_name, split_data in dataset.items():
    print(f"\n{banner}")
    print(f"üìÑ EXEMPLE D'INTERVIEW ({split_name.upper()})")
    print(f"{banner}\n")

    example = split_data[0]

    # Show every available field; string values longer than 500 characters
    # are cut to their first 500 characters and suffixed with "...".
    for key, value in example.items():
        is_long_text = isinstance(value, str) and len(value) > 500
        shown = f"{value[:500]}..." if is_long_text else value
        print(f"**{key}**: {shown}\n")

## 🔢 Liste des IDs

In [None]:
# List the first ten identifiers of every split.
for split_name, split_data in dataset.items():
    print(f"\nüîπ {split_name.upper()} - 10 premiers IDs:")

    # Heuristic: the ID column is the first column whose name contains "id"
    # (case-insensitive); None when no column matches.
    id_col = next(
        (col for col in split_data.column_names if 'id' in col.lower()),
        None,
    )

    if not id_col:
        print("  (Pas de colonne ID trouv√©e)")
        continue

    # A single slice returns the first rows as a dict of column -> list of
    # values, instead of materialising up to ten complete records one by one
    # just to read a single field from each.
    for identifier in split_data[:10][id_col]:
        print(f"  - {identifier}")

## 📊 Conversion en DataFrames

In [None]:
# Convert each split to a pandas DataFrame (stored in `dfs`, keyed by split
# name) and print a short preview of it together with its shape.
dfs = {}
separator = '=' * 80

for split_name, split_data in dataset.items():
    frame = split_data.to_pandas()
    dfs[split_name] = frame

    print(f"\n{separator}")
    print(f"üìã {split_name.upper()} - Aper√ßu")
    print(f"{separator}")
    print(frame.head())
    print(f"\nShape: {frame.shape}")

## 📈 Statistiques détaillées

In [None]:
# Global statistics: total number of interviews and each split's share of it.
total_interviews = sum(len(df) for df in dfs.values())

print("üìä STATISTIQUES GLOBALES")
print(f"{'='*80}\n")
print(f"Nombre total d'interviews: {total_interviews}")
print("\nDistribution par groupe:")

for split_name, df in dfs.items():
    # Report 0% rather than raising ZeroDivisionError when there are no rows.
    percentage = (len(df) / total_interviews) * 100 if total_interviews else 0.0
    print(f"  - {split_name}: {len(df)} ({percentage:.1f}%)")

# Locate the transcript column using the first split's columns. Candidates are
# columns whose name contains "text" or "transcript" (case-insensitive).
# A name such as "transcript_id" also matches that test, so candidates that do
# not end in "id" are preferred; if every candidate looks like an identifier,
# the first one is used. `text_col` stays None when nothing matches or when
# `dfs` is empty.
first_df = next(iter(dfs.values()), None)
candidates = [] if first_df is None else [
    col for col in first_df.columns
    if 'text' in col.lower() or 'transcript' in col.lower()
]
text_col = next(
    (col for col in candidates if not col.lower().endswith('id')),
    candidates[0] if candidates else None,
)

if text_col:
    print(f"\nüìù Statistiques sur les transcriptions (colonne: {text_col}):")
    for split_name, df in dfs.items():
        # Character count per transcript. Missing values (None or NaN) are
        # replaced by an empty string first, so they count as 0 characters
        # (NaN is truthy and would otherwise be measured as the text "nan").
        lengths = df[text_col].fillna('').astype(str).str.len()
        print(f"\n  {split_name}:")
        print(f"    - Longueur moyenne: {lengths.mean():.0f} caract√®res")
        print(f"    - M√©diane: {lengths.median():.0f} caract√®res")
        print(f"    - Min: {lengths.min():.0f} caract√®res")
        print(f"    - Max: {lengths.max():.0f} caract√®res")

## 📊 Visualisations

In [None]:
# Two side-by-side views of how the interviews are spread across groups:
# a bar chart of absolute counts (left) and a pie chart of shares (right).
group_counts = {name: len(df) for name, df in dfs.items()}
palette = ['#1f77b4', '#ff7f0e', '#2ca02c']

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
bar_ax, pie_ax = axes

# Left panel: number of interviews per group.
bar_ax.bar(group_counts.keys(), group_counts.values(), color=palette)
bar_ax.set_title('Distribution des Entretiens par Groupe', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Groupe')
bar_ax.set_ylabel('Nombre d\'entretiens')

# Right panel: share of each group, labelled with one-decimal percentages.
pie_ax.pie(
    group_counts.values(),
    labels=group_counts.keys(),
    autopct='%1.1f%%',
    colors=palette,
    startangle=90,
)
pie_ax.set_title('R√©partition en %', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Overlaid histograms of transcript length (in characters), one per split.
# Skipped entirely when no transcript column was detected.
if text_col:
    fig, ax = plt.subplots(figsize=(15, 6))

    # Character count per transcript for every split. Missing values (None or
    # NaN) are replaced by an empty string first, so they count as 0.
    lengths_by_split = {
        split_name: df[text_col].fillna('').astype(str).str.len()
        for split_name, df in dfs.items()
    }

    # Use one value range for all splits so that every histogram is drawn on
    # the same 30 bin edges. Without it each `hist` call derives its bins from
    # its own data and the overlaid bars cannot be compared.
    populated = [series for series in lengths_by_split.values() if not series.empty]
    shared_range = (
        (min(series.min() for series in populated),
         max(series.max() for series in populated))
        if populated else None
    )

    for split_name, lengths in lengths_by_split.items():
        ax.hist(lengths, alpha=0.6, label=split_name, bins=30, range=shared_range)

    ax.set_title('Distribution des Longueurs de Transcriptions', fontsize=14, fontweight='bold')
    ax.set_xlabel('Longueur (caract√®res)')
    ax.set_ylabel('Fr√©quence')
    ax.legend()
    plt.tight_layout()
    plt.show()

## 🔍 Analyse de contenu (mots-clés)

In [None]:
# Keyword presence per split: for each keyword, how many transcripts contain
# it and what share of the split that represents; the ten most frequent
# keywords are printed for every split.
# NOTE(review): `str.contains(..., case=False)` matches substrings, so a short
# keyword such as 'AI' also matches inside longer words, and the figure is the
# number of transcripts containing the keyword rather than the number of
# occurrences — confirm this is the intended measure.
if text_col:
    keywords = [
        'AI', 'automation', 'creative', 'research', 'job', 'tool', 'help',
        'workflow', 'efficiency', 'concern', 'future', 'learn', 'ChatGPT',
        'Claude', 'GPT', 'assistant', 'code', 'writing', 'image',
    ]

    print("üîç ANALYSE DES MOTS-CL√âS\n")
    print("="*80)

    for split_name, df in dfs.items():
        print(f"\nüìä {split_name.upper()}:")

        transcripts = df[text_col]
        n_rows = len(df)

        # Transcripts containing each keyword (missing transcripts never match).
        hits = {
            kw: transcripts.str.contains(kw, case=False, na=False).sum()
            for kw in keywords
        }
        # Keep only keywords that occur at least once, with their share in %.
        keyword_counts = {
            kw: (n_hits, (n_hits / n_rows) * 100)
            for kw, n_hits in hits.items()
            if n_hits > 0
        }

        # Most frequent first; ties keep the order of the `keywords` list.
        ranked = sorted(keyword_counts.items(), key=lambda item: item[1][0], reverse=True)

        for keyword, (count, percentage) in ranked[:10]:
            print(f"  - '{keyword}': {count} mentions ({percentage:.1f}% des entretiens)")

## ☁️ Word Clouds par groupe

In [None]:
# Render one word cloud per split, side by side.
# Only the import is guarded: keeping the rendering out of the `try` body means
# an ImportError raised while plotting is not misreported as "wordcloud is not
# installed".
try:
    from wordcloud import WordCloud
except ImportError:
    print("‚ö†Ô∏è wordcloud non install√©. Ex√©cutez: !pip install wordcloud")
else:
    # Nothing to draw without a transcript column or without any split
    # (a subplot grid with zero columns is rejected by matplotlib).
    if text_col and dfs:
        # squeeze=False always returns a 2-D array of axes, so the single-split
        # case needs no special handling; the first (only) row is kept.
        fig, axes_grid = plt.subplots(1, len(dfs), figsize=(18, 5), squeeze=False)
        axes = axes_grid[0]

        for ax, (split_name, df) in zip(axes, dfs.items()):
            # Concatenate all transcripts of the split. Missing values are
            # dropped so they do not appear as the literal word "None"/"nan".
            all_text = ' '.join(df[text_col].dropna().astype(str))

            # Build the word cloud image from the combined text.
            wordcloud = WordCloud(width=800, height=400, background_color='white',
                                  colormap='viridis', max_words=100).generate(all_text)

            ax.imshow(wordcloud, interpolation='bilinear')
            ax.set_title(f'Word Cloud - {split_name.upper()}', fontsize=12, fontweight='bold')
            ax.axis('off')

        plt.tight_layout()
        plt.show()

## 💾 Export des données

In [None]:
# Write each split to its own CSV file (without the index column).
for split_name, frame in dfs.items():
    filename = f"anthropic_interviewer_{split_name}.csv"
    frame.to_csv(filename, index=False)
    print(f"‚úÖ Export√©: {filename} ({len(frame)} lignes)")

# Build one combined table in which a `group` column records the split each
# row came from, then export it as well.
tagged_frames = []
for name, frame in dfs.items():
    tagged_frames.append(frame.assign(group=name))

combined_df = pd.concat(tagged_frames, ignore_index=True)
combined_df.to_csv("anthropic_interviewer_combined.csv", index=False)
print(f"‚úÖ Dataset combin√© export√©: anthropic_interviewer_combined.csv ({len(combined_df)} lignes)")

## 📋 Rapport récapitulatif

In [None]:
# Assemble a Markdown summary of the analysis, save it to rapport_analyse.md
# (UTF-8) and echo it. The pieces are collected in a list and joined once at
# the end instead of growing a string with repeated `+=`.
sections = [f"""
# üìä Rapport d'Analyse - Dataset Anthropic/AnthropicInterviewer

## R√©sum√©

**Total d'entretiens**: {total_interviews}

### Distribution par groupe

"""]

# One bullet per split with its row count and share of the total.
for split_name, df in dfs.items():
    # Report 0% rather than raising ZeroDivisionError when there are no rows.
    percentage = (len(df) / total_interviews) * 100 if total_interviews else 0.0
    sections.append(f"- **{split_name}**: {len(df)} entretiens ({percentage:.1f}%)\n")

# Transcript length statistics, only when a transcript column was detected.
if text_col:
    sections.append("\n### Statistiques sur les transcriptions\n\n")
    for split_name, df in dfs.items():
        # Character count per transcript. Missing values (None or NaN) are
        # replaced by an empty string first, so they count as 0 characters
        # (NaN is truthy and would otherwise be measured as the text "nan").
        lengths = df[text_col].fillna('').astype(str).str.len()
        sections.append(f"**{split_name}**:\n")
        sections.append(f"- Longueur moyenne: {lengths.mean():.0f} caract√®res\n")
        sections.append(f"- M√©diane: {lengths.median():.0f} caract√®res\n\n")

# List of the CSV file names associated with the analysis.
sections.append("""
## Fichiers g√©n√©r√©s

- `anthropic_interviewer_combined.csv`: Dataset complet combin√©
""")

for split_name in dfs:
    sections.append(f"- `anthropic_interviewer_{split_name}.csv`: Groupe {split_name}\n")

sections.append("""
## Source

Dataset: https://huggingface.co/datasets/Anthropic/AnthropicInterviewer
""")

report = "".join(sections)

with open("rapport_analyse.md", "w", encoding="utf-8") as f:
    f.write(report)

print("‚úÖ Rapport sauvegard√©: rapport_analyse.md")
print("\n" + report)

## 🎯 Analyses personnalisées

Utilisez les cellules ci-dessous pour vos propres analyses :

In [None]:
# Votre code ici
