# Grounded Theory Analysis: Professional Identity & AI

## Research Question
**"How do professionals negotiate maintaining their identity and legitimacy in the face of a tool that threatens to replace them?"**

### Method: Gioia Methodology
1. First-order codes (from informant language)
2. Second-order themes (researcher concepts)
3. Aggregate dimensions (theoretical constructs)
4. Theoretical model construction

---

## 1. Setup

In [None]:
# Installation
!pip install -q google-generativeai datasets pandas tqdm

In [None]:
import google.generativeai as genai
from datasets import load_dataset
import json
import re
from typing import List, Dict, Any
import pandas as pd
from tqdm.notebook import tqdm
import time
import random
from IPython.display import display, HTML, Markdown

print("‚úÖ Imports OK")

## 2. Configuration

**Important:** Ajoute ta clé API dans les Secrets Colab (🔑 à gauche) :
- Nom: `GOOGLE_API_KEY`
- Valeur: ta clé API Gemini

In [None]:
# ============================================
# CONFIGURATION - edit the values below if needed
# ============================================

# API key read from the Colab Secrets panel (secret name: GOOGLE_API_KEY)
from google.colab import userdata
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

# Gemini model used for every generation call in this notebook
MODEL_NAME = "gemini-2.5-flash"

# Research question interpolated into every analysis prompt
RESEARCH_QUESTION = """How do professionals negotiate maintaining their identity and legitimacy 
in the face of a tool that threatens to replace them?"""

# Number of interviews to analyse (None = all of them)
SAMPLE_SIZE = 50

# Register the API key with the SDK before any model is created
genai.configure(api_key=GOOGLE_API_KEY)

print(f"‚úÖ Configuration OK")
print(f"   Mod√®le: {MODEL_NAME}")
print(f"   Interviews: {SAMPLE_SIZE or 'Toutes'}")

## 3. Chargement du Dataset

In [None]:
# Load the "workforce" split of the Anthropic/AnthropicInterviewer dataset
# and report how many records and which columns it contains.
print("üì• Chargement du dataset...")
dataset = load_dataset("Anthropic/AnthropicInterviewer", split="workforce")
print(f"‚úÖ {len(dataset)} interviews charg√©es")
print(f"üìã Colonnes: {dataset.column_names}")

In [None]:
# Pr√©parer les interviews
def _extract_text(item):
    """Return the transcript text held by one dataset record.

    For a dict record, the first populated key among ``text``, ``content``,
    ``transcript``, ``conversation`` and ``messages`` is used: a string is
    returned as-is, a list is rendered one entry per line (``role: content``
    for dict entries, ``str(entry)`` otherwise). When no usable text is
    found the whole record is serialised to JSON. Non-dict records are
    converted with ``str``.
    """
    if not isinstance(item, dict):
        return str(item)

    for key in ('text', 'content', 'transcript', 'conversation', 'messages'):
        val = item.get(key)
        if not val:
            continue
        if isinstance(val, str):
            return val
        if isinstance(val, list):
            text = "\n".join(
                f"{m.get('role', '')}: {m.get('content', str(m))}"
                if isinstance(m, dict) else str(m)
                for m in val
            )
            if text:
                return text
        # Only the first populated key is considered; anything unusable
        # falls through to the JSON dump below.
        break

    return json.dumps(item, ensure_ascii=False)


def prepare_interviews(dataset, sample_size=None, seed=None):
    """Turn dataset records into ``{'id', 'content'}`` dicts ready for coding.

    Args:
        dataset: Indexable collection of records (supports ``len`` and
            integer indexing).
        sample_size: When truthy and smaller than ``len(dataset)``, that many
            records are drawn at random without replacement; otherwise every
            record is used in dataset order.
        seed: Optional seed making the random draw reproducible. With the
            default ``None`` the module-level ``random`` generator is used,
            as before.

    Returns:
        A list of dicts with ``'id'`` (index of the record in ``dataset``)
        and ``'content'`` (transcript text with control characters removed).
    """
    indices = list(range(len(dataset)))

    if sample_size and sample_size < len(dataset):
        rng = random if seed is None else random.Random(seed)
        indices = rng.sample(indices, sample_size)

    # Compiled once: strips C0/C1 control characters but keeps \t, \n and \r.
    control_chars = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]')

    return [
        {'id': idx, 'content': control_chars.sub('', _extract_text(dataset[idx]))}
        for idx in indices
    ]

# Build the working sample and report its size.
interviews = prepare_interviews(dataset, SAMPLE_SIZE)
print(f"\n‚úÖ {len(interviews)} interviews pr√©par√©es")
# The average is only defined for a non-empty sample; skipping it avoids a
# ZeroDivisionError when the dataset yields no interviews.
if interviews:
    print(f"üìè Longueur moyenne: {sum(len(i['content']) for i in interviews) // len(interviews)} chars")

## 4. Service Gemini

In [None]:
class GeminiService:
    """Small wrapper around a Gemini model adding retries and JSON parsing."""

    def __init__(self, model_name=MODEL_NAME):
        """Create the underlying ``genai.GenerativeModel`` for ``model_name``."""
        self.model = genai.GenerativeModel(model_name)
        # Number of generate() calls so far; drives the periodic pause below.
        self.request_count = 0

    def generate(self, prompt: str, temperature: float = 0.7, max_retries: int = 3) -> str:
        """Send ``prompt`` to the model and return the response text.

        Every 10th call pauses for one second first. A failed attempt is
        retried after a linearly growing wait (5s, 10s, ...).

        Args:
            prompt: Full prompt text.
            temperature: Sampling temperature passed to the generation config.
            max_retries: Maximum number of attempts.

        Returns:
            The response text, or ``""`` when every attempt failed.
        """
        self.request_count += 1
        if self.request_count % 10 == 0:
            time.sleep(1)

        config = genai.types.GenerationConfig(
            temperature=temperature,
            max_output_tokens=8192,
        )

        for attempt in range(max_retries):
            try:
                response = self.model.generate_content(prompt, generation_config=config)
                return response.text
            except Exception as e:
                # Deliberately broad: this is the retry boundary and any
                # failure of the call or of reading the text is retried.
                if attempt + 1 >= max_retries:
                    # Nothing is left to retry, so do not sleep before giving up.
                    print(f"Gemini request failed after {max_retries} attempts: {str(e)[:80]}")
                    break
                wait = (attempt + 1) * 5
                print(f"‚ö†Ô∏è Retry {attempt+1}/{max_retries} dans {wait}s: {str(e)[:80]}")
                time.sleep(wait)
        return ""

    def generate_json(self, prompt: str) -> Any:
        """Ask the model for JSON and return the decoded value.

        The prompt is suffixed with an instruction to answer only with JSON
        and sent with temperature 0.3.

        Returns:
            The decoded JSON value, or ``None`` when the model returned
            nothing or no JSON could be decoded from its answer.
        """
        response = self.generate(prompt + "\n\nRespond ONLY with valid JSON.", temperature=0.3)
        if not response:
            return None

        try:
            return self._parse_json(response)
        except ValueError:
            print(f"‚ö†Ô∏è JSON parse error")
            return None

    @staticmethod
    def _parse_json(text: str) -> Any:
        """Decode JSON from ``text``, tolerating code fences and extra prose.

        Raises:
            ValueError: (``json.JSONDecodeError``) when neither the cleaned
                text nor its outermost bracketed span is valid JSON.
        """
        cleaned = text.strip()
        # Drop a leading ```json / ``` fence and a trailing ``` fence.
        for prefix in ('```json', '```'):
            if cleaned.startswith(prefix):
                cleaned = cleaned[len(prefix):]
        if cleaned.endswith('```'):
            cleaned = cleaned[:-3]
        cleaned = cleaned.strip()

        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            # Fallback: take the span from the first opening bracket to the
            # last closing bracket, in case the JSON is wrapped in prose.
            match = re.search(r'[\[\{].*[\]\}]', cleaned, re.DOTALL)
            if match is None:
                raise
            return json.loads(match.group())

# Module-level service instance; the analysis cells below call it as `gemini`.
gemini = GeminiService()
print("‚úÖ GeminiService OK")

## 5. Analyse Gioia - Étape 1: First-Order Coding

In [None]:
def code_interview(interview: Dict) -> List[str]:
    """Extract first-order codes from one complete interview transcript.

    The whole transcript (``interview['content']``) is sent to the model
    without truncation, together with the research question.

    Args:
        interview: Record with a ``'content'`` key holding the transcript.

    Returns:
        The codes proposed by the model as stripped, non-empty strings, in
        the order returned and without duplicates. An empty list when the
        response is missing or is not a JSON array.
    """
    content = interview['content']

    prompt = f"""You are an expert qualitative researcher using Gioia methodology.

RESEARCH QUESTION: {RESEARCH_QUESTION}

INTERVIEW (complete transcript):
{content}

Analyze the ENTIRE interview and identify 5-15 FIRST-ORDER CODES related to:
- Professional identity and self-definition
- Perceived threats from AI
- Coping and adaptation strategies  
- Legitimacy claims
- Identity negotiation

Use language close to the informant's words.

Respond with JSON array: ["code 1", "code 2", "code 3"]"""

    result = gemini.generate_json(prompt)
    if not isinstance(result, list):
        return []

    # The model may return nested objects or blanks inside the array; only
    # real strings are valid codes (and only hashable items can be
    # de-duplicated by callers).
    stripped = (code.strip() for code in result if isinstance(code, str))
    return list(dict.fromkeys(code for code in stripped if code))

# Run first-order coding over every prepared interview.
print("üìù STEP 1: First-Order Coding")
print(f"   {len(interviews)} interviews √† coder (texte complet)...\n")

# Transcript length statistics; skipped for an empty sample because
# min()/max() and the average are undefined on an empty list.
lengths = [len(i['content']) for i in interviews]
if lengths:
    print(f"   üìè Longueur min: {min(lengths):,} chars")
    print(f"   üìè Longueur max: {max(lengths):,} chars")
    print(f"   üìè Longueur moyenne: {sum(lengths)//len(lengths):,} chars\n")

all_codes = []
for interview in tqdm(interviews, desc="Coding"):
    codes = code_interview(interview)
    interview['codes'] = codes
    all_codes.extend(codes)
    time.sleep(0.3)  # Rate limiting

# dict.fromkeys de-duplicates while keeping first-seen order, so the list
# (and any later slice of it) is the same on every run; set() ordering of
# strings varies between interpreter sessions. Non-string items are dropped
# because they are not valid codes and may be unhashable.
unique_codes = list(dict.fromkeys(c for c in all_codes if isinstance(c, str)))
print(f"\n‚úÖ {len(unique_codes)} codes uniques identifi√©s")

## 6. Analyse Gioia - Étape 2: Second-Order Themes

In [None]:
print("üìù STEP 2: Second-Order Coding")
print(f"   Regroupement de {len(unique_codes)} codes...\n")

# Cap the number of codes sent in one prompt, and say so when codes are
# left out instead of dropping them silently.
codes_sample = unique_codes[:150]
if len(unique_codes) > len(codes_sample):
    print(f"   Note: only the first {len(codes_sample)} of {len(unique_codes)} codes are sent to the model\n")

# Always bound to a dict so later cells can rely on it.
second_order_themes = {}

if codes_sample:
    prompt = f"""You are an expert qualitative researcher using Gioia methodology.

RESEARCH QUESTION: {RESEARCH_QUESTION}

FIRST-ORDER CODES ({len(codes_sample)}):
{json.dumps(codes_sample, ensure_ascii=False, indent=2)}

Group these into 6-10 SECOND-ORDER THEMES.
Themes should capture:
- Identity maintenance strategies
- Threat perception patterns
- Legitimacy construction
- Adaptation behaviors
- Professional boundary work

Respond with JSON: {{"Theme Name": ["code1", "code2"], "Another Theme": ["code3"]}}"""

    parsed = gemini.generate_json(prompt)
    # generate_json returns whatever JSON the model produced; only an object
    # mapping theme names to lists of codes is usable downstream.
    if isinstance(parsed, dict):
        second_order_themes = {
            theme: codes for theme, codes in parsed.items() if isinstance(codes, list)
        }

if second_order_themes:
    print(f"‚úÖ {len(second_order_themes)} themes cr√©√©s:\n")
    for theme, codes in second_order_themes.items():
        print(f"   üè∑Ô∏è {theme} ({len(codes)} codes)")
elif not codes_sample:
    print("No first-order codes to group - run step 1 first")
else:
    print("‚ùå Erreur - r√©ex√©cute cette cellule")

## 7. Analyse Gioia - Étape 3: Aggregate Dimensions

In [None]:
print("üìù STEP 3: Aggregate Dimensions")

# Bound on every path: the summary, model and export cells read
# `aggregate_dimensions` and would raise NameError if this cell bailed out
# before assigning it.
aggregate_dimensions = {}

if not second_order_themes:
    print("‚ùå Pas de themes - ex√©cute d'abord l'√©tape 2")
else:
    prompt = f"""You are an expert qualitative researcher.

RESEARCH QUESTION: {RESEARCH_QUESTION}

SECOND-ORDER THEMES:
{json.dumps(list(second_order_themes.keys()), indent=2)}

Aggregate into 3-5 AGGREGATE DIMENSIONS.
These are high-level theoretical constructs that answer the research question.

Respond with JSON: {{"Dimension Name": ["Theme A", "Theme B"]}}"""

    parsed = gemini.generate_json(prompt)
    # Only an object mapping dimension names to lists of themes is usable;
    # any other JSON shape is treated as a failed response.
    if isinstance(parsed, dict):
        aggregate_dimensions = {
            dim: themes for dim, themes in parsed.items() if isinstance(themes, list)
        }

    if aggregate_dimensions:
        print(f"\n‚úÖ {len(aggregate_dimensions)} dimensions cr√©√©es:\n")
        for dim, themes in aggregate_dimensions.items():
            print(f"   üì¶ {dim}")
            for t in themes:
                print(f"      ‚îî‚îÄ {t}")
    else:
        print("‚ùå Erreur - r√©ex√©cute cette cellule")

## 8. Résumé de l'Analyse Gioia

In [None]:
# Gather the outputs of the three Gioia steps into a single mapping.
results = dict(
    interviews=interviews,
    first_order_codes=unique_codes,
    second_order_themes=second_order_themes,
    aggregate_dimensions=aggregate_dimensions,
)

# Banner, then one count per level of the data structure.
print("=" * 60, "üìä GIOIA ANALYSIS SUMMARY", "=" * 60, sep="\n")
print(
    f"\nüìã Interviews: {len(results['interviews'])}",
    f"üìù First-order codes: {len(results['first_order_codes'])}",
    f"üè∑Ô∏è Second-order themes: {len(results['second_order_themes'])}",
    f"üì¶ Aggregate dimensions: {len(results['aggregate_dimensions'])}",
    sep="\n",
)

In [None]:
# Visualisation tableau Gioia
def build_gioia_html(results):
    """Build the HTML for the Gioia data-structure table.

    One table row is produced per second-order theme listed under an
    aggregate dimension; the dimension cell spans all of its themes' rows.
    At most five first-order codes are listed per theme, followed by a
    "+N more" marker. All model-generated text is HTML-escaped so that
    characters such as ``<`` or ``&`` are shown literally instead of
    altering the table markup.

    Args:
        results: Mapping with optional ``'aggregate_dimensions'``
            (dimension -> list of themes) and ``'second_order_themes'``
            (theme -> list of codes).

    Returns:
        The complete HTML fragment (style block, heading and table).
    """
    from html import escape

    dimensions = results.get('aggregate_dimensions') or {}
    themes = results.get('second_order_themes') or {}

    page_head = """<style>
    .gioia {border-collapse: collapse; width: 100%; font-family: Arial;}
    .gioia th, .gioia td {border: 1px solid #ddd; padding: 10px; text-align: left; vertical-align: top;}
    .gioia th {background: #4a90d9; color: white;}
    .dim {background: #e8f4e8; font-weight: bold;}
    .theme {background: #fff8e8;}
    .codes {font-size: 11px; color: #555;}
    </style>
    <h3>üìä Gioia Data Structure</h3>
    <table class='gioia'>
    <tr><th>1st Order Codes</th><th>2nd Order Themes</th><th>Aggregate Dimensions</th></tr>"""

    rows = []
    for dim_name, dim_themes in dimensions.items():
        if not isinstance(dim_themes, list):
            continue
        for position, theme in enumerate(dim_themes):
            # Only string themes can be looked up; anything else has no codes.
            codes = themes.get(theme, []) if isinstance(theme, str) else []
            codes_html = "<br>".join(f"‚Ä¢ {escape(str(c))}" for c in codes[:5])
            if len(codes) > 5:
                codes_html += f"<br><i>+{len(codes)-5} more</i>"

            row = f"<tr><td class='codes'>{codes_html}</td><td class='theme'>{escape(str(theme))}</td>"
            if position == 0:
                # The dimension cell is emitted once and spans all its themes.
                row += f"<td class='dim' rowspan='{len(dim_themes)}'>{escape(str(dim_name))}</td>"
            rows.append(row + "</tr>")

    return page_head + "".join(rows) + "</table>"


def display_gioia_table(results):
    """Render the Gioia data-structure table in the notebook output."""
    display(HTML(build_gioia_html(results)))

# Render the table from the results compiled in the summary cell.
display_gioia_table(results)

## 9. Construction du Modèle Théorique

In [None]:
print("üèóÔ∏è Building Theoretical Model...\n")

# Always bound to a dict so the rendering and export cells can rely on it.
theoretical_model = {}

# `second_order_themes` is tested first on purpose: step 3 only assigns
# `aggregate_dimensions` when themes exist, so the short-circuit avoids
# touching a name that may not be defined yet.
if not second_order_themes or not aggregate_dimensions:
    print("Nothing to model yet - run steps 2 and 3 first")
else:
    prompt = f"""You are a qualitative research expert building grounded theory.

RESEARCH QUESTION: {RESEARCH_QUESTION}

GIOIA ANALYSIS RESULTS:

Aggregate Dimensions: {json.dumps(aggregate_dimensions, indent=2)}

Second-Order Themes: {json.dumps(list(second_order_themes.keys()), indent=2)}

Construct a THEORETICAL MODEL with:
1. Model name
2. Core argument (2-3 sentences)
3. Key constructs with definitions
4. Testable propositions (P1, P2, etc.)
5. Theoretical contributions
6. Practical implications

Connect to: identity theory, legitimacy theory, technology acceptance, boundary work.

Respond with JSON:
{{
  "model_name": "...",
  "core_argument": "...",
  "constructs": [{{"name": "...", "definition": "...", "type": "independent/dependent/mediator"}}],
  "propositions": ["P1: ...", "P2: ..."],
  "theoretical_contributions": ["..."],
  "practical_implications": ["..."]
}}"""

    parsed = gemini.generate_json(prompt)
    # The following cells call .get() on the model, so only a JSON object
    # is accepted; a list or scalar counts as a failed response.
    if isinstance(parsed, dict) and parsed:
        theoretical_model = parsed
        print("‚úÖ Model constructed!")
    else:
        print("‚ùå Error - retry this cell")

In [None]:
# Render the theoretical model as a Markdown report.
if not theoretical_model:
    print("‚ùå No model - run previous cell first")
else:
    # Collect the report piece by piece, then join once at the end.
    sections = [f"""# üß† {theoretical_model.get('model_name', 'Theoretical Model')}

## Core Argument
{theoretical_model.get('core_argument', 'N/A')}

---

## Theoretical Constructs
"""]

    # One sub-heading (name and type) plus a definition line per construct.
    for construct in theoretical_model.get('constructs', []):
        sections.append(f"\n### {construct.get('name')} *({construct.get('type', 'construct')})*\n")
        sections.append(f"{construct.get('definition', '')}\n")

    sections.append("\n---\n\n## Propositions\n")
    sections.extend(
        f"\n**{proposition}**\n"
        for proposition in theoretical_model.get('propositions', [])
    )

    sections.append("\n---\n\n## Theoretical Contributions\n")
    sections.extend(
        f"- {contribution}\n"
        for contribution in theoretical_model.get('theoretical_contributions', [])
    )

    sections.append("\n## Practical Implications\n")
    sections.extend(
        f"- {implication}\n"
        for implication in theoretical_model.get('practical_implications', [])
    )

    md = "".join(sections)
    display(Markdown(md))

## 10. Visualisation (Mermaid Diagram)

In [None]:
# Ask the model for a Mermaid flowchart of the theoretical model and render it.
if theoretical_model:
    from html import escape

    print("üìä Generating diagram...")

    prompt = f"""Create a Mermaid flowchart for this model:
{json.dumps(theoretical_model, indent=2)}

Use graph LR. Show constructs as nodes, propositions as arrows.
Return ONLY Mermaid code, no markdown."""

    mermaid = gemini.generate(prompt, temperature=0.3)
    # Drop any Markdown code fences the model added despite the instruction.
    mermaid = mermaid.replace('```mermaid', '').replace('```', '').strip()

    if not mermaid:
        # generate() returns "" when every attempt failed; there is nothing
        # to render in that case.
        print("No diagram returned by the model - re-run this cell")
    else:
        print("\nüìà Mermaid Code:")
        print(mermaid)

        # The diagram source is model-generated text placed inside HTML, so
        # it is escaped; the browser turns the entities back into the
        # original characters in the element's text.
        html = f"""
    <script src="https://cdn.jsdelivr.net/npm/mermaid/dist/mermaid.min.js"></script>
    <script>mermaid.initialize({{startOnLoad:true}});</script>
    <div class="mermaid">{escape(mermaid, quote=False)}</div>
    """
        display(HTML(html))

## 11. Export des Résultats

In [None]:
from datetime import datetime

# Bundle run metadata, the Gioia data structure, the theoretical model and
# the per-interview codes into one JSON-serialisable mapping.
final_results = {
    'metadata': {
        'research_question': RESEARCH_QUESTION,
        'dataset': 'Anthropic/AnthropicInterviewer',
        'sample_size': len(interviews),
        'model': MODEL_NAME,
        'date': datetime.now().isoformat()
    },
    'gioia_analysis': {
        'first_order_codes': unique_codes,
        'second_order_themes': second_order_themes,
        'aggregate_dimensions': aggregate_dimensions
    },
    'theoretical_model': theoretical_model,
    'coded_interviews': [
        {'id': i['id'], 'codes': i.get('codes', [])}
        for i in interviews
    ]
}

# Write a timestamped JSON file in the working directory.
filename = f"gioia_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(final_results, f, ensure_ascii=False, indent=2)
print(f"‚úÖ Saved: {filename}")

# Browser download is best-effort: it is skipped outside Colab, and a failed
# download is reported rather than silently ignored. The file stays on disk
# either way.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab - download skipped")
else:
    try:
        files.download(filename)
    except Exception as err:
        print(f"Download skipped: {err}")

In [None]:
# CSV export: one row per first-order code with its theme and dimension.
# Imported here as well so this cell also runs when the JSON export cell
# was skipped.
from datetime import datetime

# Reverse lookup theme -> dimension, built once. setdefault keeps the first
# dimension that lists a theme, matching a first-match scan in dict order.
theme_to_dimension = {}
for d, ts in aggregate_dimensions.items():
    if not isinstance(ts, list):
        continue
    for t in ts:
        if isinstance(t, str):
            theme_to_dimension.setdefault(t, d)

codes_data = [
    {
        'first_order_code': code,
        'second_order_theme': theme,
        # None when the theme was not assigned to any dimension.
        'aggregate_dimension': theme_to_dimension.get(theme),
    }
    for theme, codes in second_order_themes.items()
    for code in codes
]

# Explicit columns keep the CSV header stable even when there are no rows.
df = pd.DataFrame(
    codes_data,
    columns=['first_order_code', 'second_order_theme', 'aggregate_dimension'],
)
csv_file = f"gioia_codes_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
df.to_csv(csv_file, index=False)
print(f"‚úÖ Saved: {csv_file}")
display(df.head(20))

# Browser download is best-effort: skipped outside Colab, and failures are
# reported instead of being silently ignored.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab - download skipped")
else:
    try:
        files.download(csv_file)
    except Exception as err:
        print(f"Download skipped: {err}")

---
## Notes

### Méthode Gioia
- **1st Order Codes**: Langage proche des participants
- **2nd Order Themes**: Concepts abstraits du chercheur
- **Aggregate Dimensions**: Construits théoriques

### Référence
Gioia, D. A., Corley, K. G., & Hamilton, A. L. (2013). Seeking qualitative rigor in inductive research: Notes on the Gioia methodology. *Organizational Research Methods, 16*(1), 15–31.