# Multi-Modal Data Analysis Workflow
**ASIST Study 3 Dataset**  


## Objective
Analyze team performance data across four modalities:
1. JSON behavior logs
2. SPSS survey responses
3. Video recordings
4. Chat transcripts

Identify correlations between AI interventions and team outcomes.


# Installing dependencies

%pip install opencv-python pandas scipy scikit-learn seaborn matplotlib "openai<1.0"

In [None]:
import cv2
from pathlib import Path
import json
import pandas as pd
import openai
from scipy.stats import spearmanr
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path


## 1. JSON Logs Processing
### Objective
Extract structured data from nested JSON logs containing:
- Team actions
- AI intervention timestamps
- Mission outcomes


In [None]:


def parse_json_logs(input_path: Path, output_path: Path) -> pd.DataFrame:
    """Flatten newline-delimited JSON logs into a table and save it as CSV.

    Every non-blank line of ``input_path`` is parsed as one JSON document.
    Nested objects are flattened by ``pd.json_normalize``, joining nested key
    names with ``_``.

    Args:
        input_path: JSON-lines file with one JSON document per line.
        output_path: Destination CSV file, written without the index column.
            Missing parent directories are created.

    Returns:
        The flattened records as a DataFrame.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so parsing does not depend on the platform default encoding.
    with open(input_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at end of file) carry no record
        # and would make json.loads raise, so they are skipped.
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.json_normalize(data, sep='_')
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    return df

# Parse every trial-message log and write one flattened CSV per trial
input_files = list(map(Path, (
    "data/json_logs/HSRData_TrialMessages_Trial-T000603_...",
    "data/json_logs/HSRData_TrialMessages_Trial-T000639_...",
    "data/json_logs/HSRData_TrialMessages_Trial-T000671_...",
)))

# Destination folder for the parsed CSVs; created up front if it is missing
output_dir = Path("data/processed/json_parsed/")
output_dir.mkdir(parents=True, exist_ok=True)

for file in input_files:
    # Output name is the input's stem with a "_parsed.csv" suffix
    output_file = output_dir.joinpath(file.stem + "_parsed.csv")
    df = parse_json_logs(input_path=file, output_path=output_file)
    print(f"Processed {len(df)} records from {file.name}")



## 2. Survey Data Processing
### Objective
Convert the tab-delimited export of the SPSS survey data to an analyzable CSV format with:
- Cleaned column names
- Consistent formatting


In [None]:

def process_surveys(input_path: Path, output_path: Path) -> pd.DataFrame:
    """Convert TAB-delimited survey data to CSV with cleaned column names.

    Column names are stripped of surrounding whitespace and their spaces are
    replaced by underscores; cell values are left untouched.

    Args:
        input_path: Tab-separated survey file whose first row is the header.
        output_path: Destination CSV file, written without the index column.
            Missing parent directories are created.

    Returns:
        The survey table with the cleaned column names.
    """
    df = pd.read_csv(input_path, sep='\t')
    df.columns = [col.strip().replace(' ', '_') for col in df.columns]
    # to_csv does not create directories, so make sure the target folder exists.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    return df

# Convert the raw survey export into the cleaned CSV read by the later sections
output_path = Path("data/processed/surveys/surveys_parsed.csv")
survey_file = Path("data/surveys/HSRData_Surveys0Fulltext_...")
df_surveys = process_surveys(input_path=survey_file, output_path=output_path)

# %% [markdown]
"""
## 3. Transcript Analysis (NLP)
### Objective
Analyze team communication effectiveness using:
- WebVTT transcript parsing
- GPT-4 effectiveness scoring (1-5 scale)
"""
# %%
def load_vtt(file_path: Path) -> pd.DataFrame:
    """Extract the cue text lines from a WebVTT subtitle file.

    The following lines are dropped, everything else is kept in file order:

    * blank lines,
    * lines starting with ``WEBVTT`` or ``NOTE``,
    * cue timing lines, recognised by the ``-->`` separator so that cues past
      the first hour are removed as well (not only those starting with ``00:``),
    * cue identifiers, i.e. the line directly above a timing line.

    Args:
        file_path: Path to a UTF-8 encoded ``.vtt`` file; a leading byte-order
            mark is tolerated.

    Returns:
        DataFrame with a single ``text`` column, one row per remaining line.
    """
    # utf-8-sig reads plain UTF-8 too, but strips a BOM that would otherwise
    # hide the "WEBVTT" header from the startswith check below.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        raw = [line.strip() for line in f]

    texts = []
    for idx, line in enumerate(raw):
        if not line or line.startswith(('WEBVTT', 'NOTE')) or '-->' in line:
            continue
        following = raw[idx + 1] if idx + 1 < len(raw) else ''
        if '-->' in following:
            # The line directly above a timing line is the cue identifier.
            continue
        texts.append(line)
    return pd.DataFrame(texts, columns=['text'])

# Read the transcript file and keep only its text lines
transcript_file = Path("data/transcripts/HSRData_ZoomAudioTranscript_...")
df_transcript = load_vtt(file_path=transcript_file)


In [None]:
# Label effectiveness with GPT-4
# Read the key from the OPENAI_API_KEY environment variable so a real secret
# never has to be saved inside the notebook; the placeholder stays as the
# fallback when the variable is not set.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")

def label_effectiveness(text: str) -> int:
    """Ask GPT-4 to rate the communication effectiveness of one utterance.

    Args:
        text: The transcript line sent to the model as the user message.

    Returns:
        The score from 1 (chaotic) to 5 (coordinated) found in the reply.

    Raises:
        ValueError: If the reply contains no standalone digit from 1 to 5.
    """
    import re  # local import: only needed to parse the model reply

    # NOTE(review): openai.ChatCompletion is the interface of the pre-1.0
    # openai SDK - confirm the installed version still provides it.
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{
            "role": "system",
            # The reply format is pinned down so the answer can be parsed as a
            # number instead of arriving as a free-form sentence.
            "content": (
                "Rate communication effectiveness (1=chaotic, 5=coordinated). "
                "Reply with a single digit from 1 to 5 and nothing else."
            )
        }, {
            "role": "user",
            "content": text
        }]
    )
    reply = response.choices[0].message['content']
    # Accept replies with stray whitespace or punctuation ("4.", " 3\n") but do
    # not pick a digit out of a longer number such as "10".
    match = re.search(r'(?<!\d)[1-5](?!\d)', reply or '')
    if match is None:
        raise ValueError(f"No 1-5 effectiveness score in model reply: {reply!r}")
    return int(match.group())

# Label a reproducible random sample of transcript lines.
# The sample size is capped at the transcript length because sample(100)
# raises ValueError when fewer than 100 lines are available.
sample_size = min(100, len(df_transcript))
df_sample = df_transcript.sample(sample_size, random_state=42)
df_sample['effectiveness'] = df_sample['text'].apply(label_effectiveness)

# to_csv does not create directories, so make sure the target folder exists.
labeled_path = Path("data/processed/transcripts/labeled_transcripts.csv")
labeled_path.parent.mkdir(parents=True, exist_ok=True)
df_sample.to_csv(labeled_path)


## 4. Video Analysis
### Objective
Extract one frame every 10 seconds for:
- Activity pattern analysis
- Non-verbal communication study


In [None]:


def extract_frames(video_path: Path, output_dir: Path, interval: int = 10):
    """Save one frame every ``interval`` seconds of a video as JPEG files.

    Frames are written to ``output_dir`` as ``frame_<n>.jpg``, where ``<n>`` is
    the zero-based index of the frame in the video. Frame 0 is always saved if
    it can be read. Nothing is written when the video cannot be opened.

    Args:
        video_path: Video file to read.
        output_dir: Folder that receives the JPEG files; created if missing.
        interval: Seconds between two saved frames.
    """
    output_dir = Path(output_dir)
    # Create the folder up front so the frame writes have a valid target.
    output_dir.mkdir(parents=True, exist_ok=True)

    vidcap = cv2.VideoCapture(str(video_path))
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        # Keep the step at 1 frame or more: a reported FPS of 0 or a very small
        # interval would otherwise make the modulo below divide by zero.
        frame_interval = max(1, int(fps * interval))

        count = 0
        while vidcap.isOpened():
            success, frame = vidcap.read()
            if not success:
                break  # end of stream or read error
            if count % frame_interval == 0:
                cv2.imwrite(str(output_dir / f"frame_{count}.jpg"), frame)
            count += 1
    finally:
        # Always free the capture handle, even if reading or writing raised.
        vidcap.release()

# Extract frames from the video recording using the default interval
output_dir = Path("data/processed/video_frames/")
video_file = Path("data/videos/HSRData_OBVideo_...")
extract_frames(video_path=video_file, output_dir=output_dir)


## 5. Multi-Modal Integration
### Objective
Correlate findings across all modalities:
1. Merge parsed datasets
2. Calculate Spearman correlations
3. Visualize relationships


In [None]:
# Read back the CSV files written by the earlier sections
df_logs = pd.read_csv("data/processed/json_parsed/...")
df_surveys = pd.read_csv("data/processed/surveys/surveys_parsed.csv")
df_transcripts = pd.read_csv("data/processed/transcripts/labeled_transcripts.csv")

# Join the log columns with the survey cohesion score on the shared team_id
# (default inner join: only teams present in both tables are kept)
merged_df = df_logs[['team_id', 'game_score', 'advice_type']].merge(
    df_surveys[['team_id', 'cohesion_score']],
    on='team_id',
)

# Statistical analysis
# Spearman rank correlation between game score and team cohesion
corr, pval = spearmanr(merged_df['game_score'], merged_df['cohesion_score'])

# Visualization
# Scatter plot of the two scores, coloured by the type of advice
plt.figure(figsize=(10,6))
sns.scatterplot(
    data=merged_df,
    x='game_score',
    y='cohesion_score',
    hue='advice_type',
    palette='viridis'
)
plt.title(f"Game vs Cohesion Scores (ρ={corr:.2f}, p={pval:.3f})")

# savefig does not create directories, so make sure results/ exists first.
plot_path = Path("results/correlation_plot.png")
plot_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(plot_path)
