In [2]:
import numpy as np
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
from collections import defaultdict

class NorwegianEntityEmbedding:
    """
    Prototype-word layout and directional analysis for Norwegian entity types.

    The class flattens the prototypical words of each entity type into
    ``base_words`` and remembers where every entity type and every category
    lives inside that flat list, so that a score vector aligned with
    ``base_words`` can be sliced per category.
    """

    def __init__(self):
        """
        Initialize embedding system with Norwegian-specific prototypical words,
        including directionals and locatives.
        """
        self.type_prototypes = {
            'PERSON': {
                'pronouns': ['jeg', 'meg', 'min', 'mitt', 'mine'],
                'roles': ['lege', 'lærer', 'direktør', 'student'],
                'family': ['mor', 'far', 'søster', 'bror']
            },
            'ORGANIZATION': {
                'pronouns': ['vi', 'oss', 'vår', 'vårt', 'våre'],
                'types': ['selskap', 'universitet', 'institutt', 'bedrift'],
                'activities': ['ansetter', 'produserer', 'tilbyr', 'organiserer']
            },
            'LOCATION': {
                'directionals': {
                    'static': ['her', 'der', 'borte', 'oppe', 'nede', 'inne', 'ute'],
                    'dynamic': ['hit', 'dit', 'bort', 'opp', 'ned', 'inn', 'ut'],
                    'source': ['herfra', 'derfra', 'ovenfra', 'nedenfra', 'innenfra', 'utenfra']
                },
                'prepositions': ['i', 'på', 'ved', 'fra', 'til'],
                'types': ['by', 'land', 'elv', 'fjell', 'gate']
            }
        }

        # Flat list of every prototype word, in declaration order.
        self.base_words = []
        # entity type -> indices of its words inside base_words.
        self.type_indices = defaultdict(list)
        # (entity type, dotted category path) -> (start, stop) inside base_words,
        # e.g. ('LOCATION', 'directionals.static') -> (26, 33).
        self.category_slices = {}
        self.category_weights = {
            'LOCATION': {
                'directionals.static': 1.5,    # Stronger weight for "her", "der" etc.
                'directionals.dynamic': 1.3,   # Strong weight for "hit", "dit" etc.
                'directionals.source': 1.3,    # Strong weight for "herfra" etc.
                'prepositions': 0.8,           # Lower weight as these are common
                'types': 1.0                   # Baseline weight
            }
        }
        self._initialize_base_words()

    def _iter_word_groups(self, categories, prefix=''):
        """Yield (dotted_path, words) for every word list, descending into nested dicts."""
        for name, content in categories.items():
            path = f'{prefix}{name}'
            if isinstance(content, dict):
                yield from self._iter_word_groups(content, prefix=f'{path}.')
            else:
                yield path, list(content)

    def _initialize_base_words(self):
        """
        Build ``base_words``, ``type_indices`` and ``category_slices``.

        Nested categories (such as the LOCATION directionals) are flattened in
        declaration order. The three containers are rebuilt from scratch, so
        calling this method again does not duplicate entries.
        """
        self.base_words = []
        self.type_indices = defaultdict(list)
        self.category_slices = {}

        for entity_type, categories in self.type_prototypes.items():
            for path, words in self._iter_word_groups(categories):
                start = len(self.base_words)
                self.base_words.extend(words)
                stop = len(self.base_words)
                self.type_indices[entity_type].extend(range(start, stop))
                self.category_slices[(entity_type, path)] = (start, stop)

    def get_location_score(self, embedding):
        """
        Special scoring for locations using directionals.

        Parameters:
        -----------
        embedding : numpy.ndarray
            One-dimensional score vector (any array-like is accepted).
            A vector whose length equals ``len(self.base_words)`` is read at
            the positions the directional words occupy inside ``base_words``.
            A vector of any other length is read with the directional words
            starting at position 0, the layout this method used previously.

        Returns:
        --------
        dict
            One entry per directional category holding the weighted mean
            ``'score'`` and the per-word ``'words'`` pairs, plus
            ``'overall_directional_score'``, the mean of the category scores.

        Raises:
        -------
        ValueError
            If the vector is not one-dimensional or is too short to cover
            the directional words.
        """
        vector = np.asarray(embedding, dtype=float)
        if vector.ndim != 1:
            raise ValueError(
                f"embedding must be one-dimensional, got shape {vector.shape}"
            )

        directionals = self.type_prototypes['LOCATION']['directionals']
        weights = self.category_weights.get('LOCATION', {})

        # The directionals do NOT start at index 0 of base_words (PERSON and
        # ORGANIZATION words come first), so a full-length vector must be
        # sliced at the recorded category positions.
        aligned_with_base_words = vector.shape[0] == len(self.base_words)

        location_scores = {}
        cursor = 0
        for category, words in directionals.items():
            if aligned_with_base_words:
                start, stop = self.category_slices[
                    ('LOCATION', f'directionals.{category}')
                ]
            else:
                start, stop = cursor, cursor + len(words)
                cursor = stop

            if stop > vector.shape[0]:
                raise ValueError(
                    f"embedding of length {vector.shape[0]} is too short for "
                    f"directional category '{category}' (needs index {stop - 1})"
                )

            category_values = vector[start:stop]
            weight = weights.get(f'directionals.{category}', 1.0)
            location_scores[category] = {
                'score': float(np.mean(category_values) * weight),
                'words': list(zip(words, category_values.tolist()))
            }

        # Computed before the key is inserted, so only category entries count.
        category_scores = [entry['score'] for entry in location_scores.values()]
        location_scores['overall_directional_score'] = float(np.mean(category_scores))

        return location_scores

    @staticmethod
    def _mean_score(observed, words):
        """Return the mean of the scores recorded for ``words``, or None if none exist."""
        found = [value for word in words for value in observed.get(word, ())]
        return float(np.mean(found)) if found else None

    def analyze_directional_patterns(self, embedding):
        """
        Analyze patterns in directional usage that might indicate
        specific types of locations.

        Parameters:
        -----------
        embedding : numpy.ndarray
            The embedding vector to analyze; see ``get_location_score`` for
            the accepted layouts.

        Returns:
        --------
        dict
            ``'indoor_outdoor_ratio'``: score of "inne" divided by score of
            "ute" (static directionals only); ``inf`` when the "ute" score is
            not positive.
            ``'elevation_bias'``: mean score of the "up" words minus mean
            score of the "down" words.
            ``'proximity_indication'``: mean score of the "near" words minus
            mean score of the "far" words.
            A value stays ``None`` when none of its words has a score.
        """
        scores = self.get_location_score(embedding)

        patterns = {
            'indoor_outdoor_ratio': None,
            'elevation_bias': None,
            'proximity_indication': None
        }

        # word -> list of scores, gathered over every directional category.
        # Each word is looked up only where it actually occurs; defaulting a
        # missing word to 0 in every category would drag the means toward 0.
        observed = defaultdict(list)
        for category in self.type_prototypes['LOCATION']['directionals']:
            for word, value in scores[category]['words']:
                observed[word].append(value)

        # Indoor/outdoor balance uses the static directionals only.
        static_entry = scores.get('static')
        if static_entry is not None:
            static_observed = {word: [value] for word, value in static_entry['words']}
            indoor_score = self._mean_score(static_observed, ['inne'])
            outdoor_score = self._mean_score(static_observed, ['ute'])
            if indoor_score is not None and outdoor_score is not None:
                if outdoor_score > 0:
                    patterns['indoor_outdoor_ratio'] = indoor_score / outdoor_score
                else:
                    patterns['indoor_outdoor_ratio'] = float('inf')

        # Elevation: "up" words versus "down" words.
        high_score = self._mean_score(observed, ['oppe', 'opp', 'ovenfra'])
        low_score = self._mean_score(observed, ['nede', 'ned', 'nedenfra'])
        if high_score is not None and low_score is not None:
            patterns['elevation_bias'] = high_score - low_score

        # Proximity: "here" words versus "there" words.
        near_score = self._mean_score(observed, ['her', 'hit', 'herfra'])
        far_score = self._mean_score(observed, ['der', 'dit', 'derfra'])
        if near_score is not None and far_score is not None:
            patterns['proximity_indication'] = near_score - far_score

        return patterns