# Survey Data Transformation

Transform unstructured survey responses into structured, analysis-ready datasets using openaivec.

In [None]:
from typing import List, Optional

import pandas as pd
from openaivec import pandas_ext
from pydantic import BaseModel, Field

# Select the OpenAI model name passed to openaivec for this notebook.
# NOTE(review): presumably this sets the default model used by later
# `.ai.responses(...)` calls — confirm against the installed openaivec version.
pandas_ext.responses_model("gpt-4o-mini")

## Sample Survey Data

Realistic free-form survey responses from various demographic groups.

In [None]:
# Ten free-text answers; each one mentions an age, an occupation, a location
# and a handful of interests in no fixed order.
survey_responses = [
    "I'm a 28-year-old software engineer from San Francisco. I love hiking, coding, and coffee. Currently working on AI projects.",
    "45 years old, marketing manager in NYC. Interests include yoga, reading business books, and traveling to Europe.",
    "College student, 20, studying biology in Boston. Enjoys gaming, anime, and volunteer work at animal shelters.",
    "Retired teacher, 62, living in Austin Texas. Passionate about gardening, cooking, and spending time with grandchildren.",
    "35-year-old doctor from Chicago, specializing in pediatrics. Hobbies are running marathons and playing piano.",
    "Freelance graphic designer, 29, based in Portland. Into rock climbing, photography, and sustainable living.",
    "High school student, 17, from Miami. Loves basketball, music production, and dreams of becoming a filmmaker.",
    "Small business owner, 52, runs a bakery in Denver. Enjoys baking (obviously), hiking, and local community events.",
    "Data scientist, 31, working remotely from Vancouver. Interested in machine learning, skiing, and craft beer.",
    "Stay-at-home parent, 38, from Phoenix. Passionate about child development, crafting, and organizing community activities.",
]

# One row per answer, keyed by a 1-based, zero-padded id (RESP_001, RESP_002, ...).
survey_df = pd.DataFrame(
    {
        "response_id": [
            f"RESP_{position:03d}"
            for position, _ in enumerate(survey_responses, start=1)
        ],
        "response": survey_responses,
    }
)

survey_df.head()

## Define Structured Output Schema

Create comprehensive demographic and interest profiles.

In [None]:
class Demographics(BaseModel):
    """Demographic attributes extracted or inferred from one survey response."""

    # The guidance below is carried in Field descriptions rather than `#`
    # comments so that it is part of the model's JSON schema; plain comments
    # are not. `...` keeps every field required, as in the original schema.
    age: Optional[int] = Field(
        ...,
        description="Age in years if stated or clearly implied, otherwise null",
    )
    # "under-18" is included because the sample data contains a 17-year-old,
    # who fits none of the brackets from 18 upward.
    age_group: str = Field(
        ...,
        description='Age bracket: "under-18", "18-25", "26-35", "36-45", "46-55", or "56+"',
    )
    occupation: str = Field(
        ...,
        description="Occupation as described by the respondent",
    )
    occupation_category: str = Field(
        ...,
        description='Broad occupation category such as "technology", "healthcare", "education"',
    )
    location: str = Field(
        ...,
        description="City or region where the respondent lives or works",
    )
    location_type: str = Field(
        ...,
        description='Type of area: "urban", "suburban", or "rural"',
    )
    life_stage: str = Field(
        ...,
        description='Life stage: "student", "professional", "parent", or "retired"',
    )

class Interests(BaseModel):
    """Interests and lifestyle signals extracted from one survey response."""

    # Example categories are carried in Field descriptions rather than `#`
    # comments so that they are part of the model's JSON schema.
    primary_interests: List[str] = Field(
        ...,
        description="Specific interests or hobbies mentioned in the response",
    )
    hobby_categories: List[str] = Field(
        ...,
        description='Broad hobby categories such as "sports", "arts", "technology"',
    )
    lifestyle_indicators: List[str] = Field(
        ...,
        description='Lifestyle indicators such as "active", "creative", "social"',
    )

class PersonProfile(BaseModel):
    """Top-level structured profile produced for each survey response."""

    # Nested models are left as bare annotations; their own fields carry the
    # per-field guidance.
    demographics: Demographics
    interests: Interests
    personality_traits: List[str] = Field(
        ...,
        description="Personality traits suggested by the response",
    )
    # Moved from a `#` comment into the schema so the model receives it.
    potential_products: List[str] = Field(
        ...,
        description="Products or services the respondent might be interested in",
    )

## Transform Unstructured to Structured

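Extract comprehensive profiles from free-text responses. The nested result is then expanded with `.ai.extract("profile")` into flat columns such as `profile_demographics_age_group` and `profile_interests_hobby_categories`, which the analysis cells below rely on.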

In [None]:
# Step 1: ask the model for one PersonProfile per free-text response and keep
# the result in a "profile" column next to the original text.
structured_df = survey_df.assign(
    profile=survey_df["response"].ai.responses(
        response_format=PersonProfile,
        instructions="""
        Extract comprehensive demographic and interest information from the survey response.
        Infer missing information based on context clues when reasonable.
        Categorize interests and suggest relevant product categories.
        """,
    )
)

# Step 2: expand the "profile" column; later cells read the resulting
# profile_demographics_* / profile_interests_* / profile_potential_products columns.
structured_df = structured_df.ai.extract("profile")

structured_df.head()

## Demographic Analysis

Extract demographic insights from the structured data.

In [None]:
# How many respondents fall into each age bracket
print("AGE GROUP DISTRIBUTION:")
age_dist = structured_df["profile_demographics_age_group"].value_counts()
print(age_dist)

# Blank line, 50-character rule, blank line
print("", "=" * 50, "", sep="\n")

# How many respondents fall into each occupation category
print("OCCUPATION CATEGORIES:")
occ_dist = structured_df["profile_demographics_occupation_category"].value_counts()
print(occ_dist)

print("", "=" * 50, "", sep="\n")

# How many respondents are in each life stage
print("LIFE STAGE DISTRIBUTION:")
life_dist = structured_df["profile_demographics_life_stage"].value_counts()
print(life_dist)

## Interest Pattern Analysis

Analyze hobby and interest patterns across demographics.

In [None]:
# One row per (respondent, hobby category) so that categories can be counted
interests_expanded = structured_df.explode("profile_interests_hobby_categories")

print("TOP HOBBY CATEGORIES:")
hobby_counts = interests_expanded["profile_interests_hobby_categories"].value_counts()
print(hobby_counts.iloc[:10])

# Blank line, 50-character rule, blank line
print("", "=" * 50, "", sep="\n")

# Same treatment for lifestyle indicators: one row per (respondent, indicator)
lifestyle_expanded = structured_df.explode("profile_interests_lifestyle_indicators")
print("LIFESTYLE INDICATORS:")
lifestyle_counts = lifestyle_expanded["profile_interests_lifestyle_indicators"].value_counts()
print(lifestyle_counts.iloc[:10])

## Market Segmentation

Create customer segments based on extracted profiles.

In [None]:
# Generate market segments
# Imported once for the cell (it used to be re-executed inside the loop below).
from collections import Counter

# Label every respondent "<age_group>_<occupation_category>". Building the
# labels from the two columns directly avoids a row-wise DataFrame.apply.
segments_df = structured_df.assign(
    segment=[
        f"{age_group}_{occupation_category}"
        for age_group, occupation_category in zip(
            structured_df.profile_demographics_age_group,
            structured_df.profile_demographics_occupation_category,
        )
    ]
)

print("MARKET SEGMENTS:")
segment_counts = segments_df.segment.value_counts()
print(segment_counts)

print("\n" + "="*50 + "\n")

# Product recommendations by segment
print("PRODUCT OPPORTUNITIES BY SEGMENT:")
for segment in segment_counts.index[:5]:  # Top 5 segments
    segment_data = segments_df[segments_df.segment == segment]
    print(f"\n📊 {segment.upper()}:")

    # Tally every suggested product across the respondents in this segment.
    # Counter.update counts the elements of each list in place, so no
    # intermediate flattened list is needed.
    product_counter = Counter()
    for products_list in segment_data.profile_potential_products:
        product_counter.update(products_list)

    # Show the three most frequently suggested products
    for product, count in product_counter.most_common(3):
        print(f"   • {product} ({count} mentions)")

## Export for Analysis

Prepare clean datasets for business intelligence tools.

In [None]:
# Keep the response id plus the flat demographic columns; .copy() gives an
# independent frame rather than a selection tied to structured_df.
demographics_clean = structured_df.loc[
    :,
    [
        'response_id',
        'profile_demographics_age',
        'profile_demographics_age_group',
        'profile_demographics_occupation',
        'profile_demographics_occupation_category',
        'profile_demographics_location',
        'profile_demographics_location_type',
        'profile_demographics_life_stage',
    ],
].copy()

# Heading followed by a preview of the first five rows
print("📊 CLEAN DEMOGRAPHICS TABLE:", demographics_clean.iloc[:5], sep="\n")

# Save to CSV for external analysis
# demographics_clean.to_csv('demographics_analysis.csv', index=False)
# print("\n💾 Data exported to demographics_analysis.csv")

## Conclusion

This notebook demonstrates how openaivec transforms unstructured survey data into:

- **Structured Demographics**: Age, occupation, location, life stage
- **Interest Profiles**: Hobbies, lifestyle indicators, personality traits  
- **Market Segments**: Actionable customer groupings
- **Product Opportunities**: Data-driven recommendation insights
- **Analysis-Ready Data**: Clean datasets for BI tools

Scale this approach to thousands of survey responses for comprehensive market research.