In [None]:
import pandas as pd
import numpy as np
import joblib
import ast
import os
from sklearn.preprocessing import StandardScaler
from minisom import MiniSom
from math import sqrt
from collections import Counter

# === CONFIGURATION ===
# Parquet file handed to clean_and_load() in the execution section.
INPUT_FILE = 'data_dec_2024.parquet'
# Directory that receives the joblib artifacts written by this script.
ARTIFACT_DIR = 'models2'
# Create the artifact directory up front; exist_ok keeps re-runs from failing.
os.makedirs(ARTIFACT_DIR, exist_ok=True)

def clean_and_load(path):
    """Load the raw records and normalise the numeric traffic columns.

    Args:
        path: Location of a parquet file readable by ``pandas.read_parquet``.
            An already-loaded ``DataFrame`` is accepted as well; it is cleaned
            without being modified.

    Returns:
        A new ``DataFrame`` without the rows whose ``DpiPolicy`` equals
        ``'Unknown'``. Each of the known numeric columns that is present is
        converted to numbers, with unparseable or missing values set to 0.

    Raises:
        KeyError: If the data has no ``DpiPolicy`` column.
    """
    raw = path if isinstance(path, pd.DataFrame) else pd.read_parquet(path)
    # Take an explicit copy of the filtered rows: assigning columns on the
    # result of a boolean filter is chained assignment (SettingWithCopyWarning)
    # and would otherwise be ambiguous about which frame gets written.
    df = raw[raw['DpiPolicy'] != 'Unknown'].copy()

    # Vectorized numeric conversion
    numeric_cols = ['bytesFromClient', 'bytesFromServer', 'sessions_count', 'transationDuration']
    for col in numeric_cols:
        if col in df.columns:
            # Strip literal double quotes before parsing; regex=False makes
            # the replacement a plain substring match on every pandas version.
            as_text = df[col].astype(str).str.replace('"', '', regex=False)
            df[col] = pd.to_numeric(as_text, errors='coerce').fillna(0)
    return df

def encode_to_bitmap(df, column_name, mapping_name):
    """Encode a categorical, possibly multi-valued, column as integer bitmaps.

    Every vocabulary entry of the column owns one bit (its index in the
    vocabulary list). A cell is encoded as the bitwise OR of the bits of the
    entries it contains. The entry-to-bit mapping is also written to
    ``{ARTIFACT_DIR}/{mapping_name}.joblib``.

    Args:
        df: Frame holding ``column_name``.
        column_name: One of ``'DpiPolicy'``, ``'contentType'``, ``'IpProtocol'``.
        mapping_name: File stem used for the persisted mapping.

    Returns:
        A ``Series`` of ints aligned with ``df``. Missing cells, unparseable
        list strings and entries outside the vocabulary contribute 0.

    Raises:
        KeyError: If ``column_name`` has no vocabulary or is absent from ``df``.
    """
    # Define mappings based on your specific research requirements
    vocab = {
        'DpiPolicy': ['BLOCKALL', 'BlockNonProvSub', 'ClientIPWhitelist', 'ExpireBlock', 'F1200G50M', 'F3000G100M', 'F3000G200M', 'F4000G300M', 'F4000G400M', 'F4000G500M', 'F4000G550M', 'F4000G600M', 'F6000G1000M', 'NoCRBNBlock', 'SuspendBlock', 'ZeroRated'],
        'contentType': ['Ads & Trackers', 'Browsing', 'Domain Name Service', 'Email', 'FTP', 'File Sharing', 'Gaming', 'Instant Messaging', 'Internet Privacy', 'Location Based Services', 'Music Streaming', 'Net Admin', 'One Click Hosting', 'Other', 'Other File Sharing', 'Social Media', 'Streaming', 'Trading', 'Unknown', 'Voice & Video Calls', 'Webmail'],
        'IpProtocol': ['ESP', 'GRE', 'ICMP', 'TCP', 'UDP']
    }

    mapping = {item: i for i, item in enumerate(vocab[column_name])}
    joblib.dump(mapping, f'{ARTIFACT_DIR}/{mapping_name}.joblib')

    def calculate_bitsum(val):
        """Return the bitmap of a single cell."""
        # Lists serialised as text ("['TCP', 'UDP']") are parsed first.
        if isinstance(val, str) and val.startswith('['):
            try:
                val = ast.literal_eval(val)
            except (ValueError, SyntaxError):
                return 0

        # Collections must be recognised BEFORE pd.isna(): on a sequence
        # pd.isna() returns an array whose truth value is ambiguous.
        if isinstance(val, (list, tuple, set, np.ndarray)):
            items = list(val)
        elif pd.isna(val):
            return 0
        else:
            items = [val]

        bits = 0
        for item in items:
            try:
                position = mapping.get(item)
            except TypeError:
                # Unhashable entry (e.g. a nested list) cannot be a vocabulary item.
                continue
            if position is not None:
                # OR instead of addition: a repeated item must not carry
                # into the neighbouring bit.
                bits |= 1 << position
        return bits

    return df[column_name].apply(calculate_bitsum)

# === EXECUTION ===
df = clean_and_load(INPUT_FILE)
# Keep the raw policy values: the column is overwritten with its bitmap below
# and the originals are needed again for the cluster mapping.
original_offers = df['DpiPolicy'].copy()

# Encodings
df['DpiPolicy'] = encode_to_bitmap(df, 'DpiPolicy', 'policy_to_bit')
df['contentType'] = encode_to_bitmap(df, 'contentType', 'content_to_bit')
df['IpProtocol'] = encode_to_bitmap(df, 'IpProtocol', 'proto_to_bit')


def _count_apps(value):
    """Return how many application names one ``appName`` cell holds.

    Lists, tuples and numpy arrays (the form in which parquet list columns
    are loaded) are counted by length, as are lists serialised as text.
    Anything else counts as a single application.
    """
    if isinstance(value, str) and value.startswith('['):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return 1
    if isinstance(value, (list, tuple, np.ndarray)):
        return len(value)
    return 1


df['app_count'] = df['appName'].apply(_count_apps)

# Imputation: non-positive byte counts / durations are replaced by the median
# of the positive values of the same column (never less than 1).
def _positive_median(series):
    """Return the median of the strictly positive values, floored at 1."""
    median = series[series > 0].median()
    # median() is NaN when no value is positive, and max(NaN, 1) would keep
    # the NaN and spread it into the data and the saved artifact.
    return 1 if pd.isna(median) else max(median, 1)


medians = {col: _positive_median(df[col]) for col in ['bytesFromClient', 'bytesFromServer', 'transationDuration']}
for col, val in medians.items():
    # Vectorised equivalent of replacing every x <= 0 with the median.
    df[col] = df[col].mask(df[col] <= 0, val)
# Persist the fill values next to the other artifacts.
joblib.dump(medians, f'{ARTIFACT_DIR}/medians.joblib')

# Final Feature Selection
# Every numeric column is a feature, except the subscriber identifier when
# the frame happens to contain one.
features_df = (
    df.select_dtypes(include=[np.number])
      .drop(columns=['SubscriberID'], errors='ignore')
)
numeric_cols = list(features_df.columns)

# Standardise the features; the fitted scaler and the column order are saved
# alongside the other artifacts.
scaler = StandardScaler().fit(features_df)
X = scaler.transform(features_df)
joblib.dump(scaler, f'{ARTIFACT_DIR}/scaler.joblib')
joblib.dump(numeric_cols, f'{ARTIFACT_DIR}/numeric_cols.joblib')

# === SOM TRAINING ===
# Square 10 x 10 map; the neighbourhood radius is 80% of the grid diagonal.
rows = cols = 10
som = MiniSom(
    x=rows,
    y=cols,
    input_len=X.shape[1],
    sigma=0.8 * sqrt(rows * rows + cols * cols),
    learning_rate=0.8,
)
# Initialise the weights from the scaled data rather than keeping the
# constructor's default weights.
som.random_weights_init(X)
# One randomly ordered training step per sample.
som.train_random(X, num_iteration=len(X))

# Persist the raw weight grid and the trained model object.
np.save('som_weights1.npy', som.get_weights())
joblib.dump(som, f'{ARTIFACT_DIR}/som_model.joblib')

# === CLUSTER MAPPING ===
def _as_offer_list(value):
    """Normalise one raw ``DpiPolicy`` cell to a list of offer names.

    Accepts real sequences (list, tuple, numpy array), lists serialised as
    text and single values; a missing value yields an empty list. This mirrors
    the cell formats that ``encode_to_bitmap`` accepts.
    """
    if isinstance(value, str) and value.startswith('['):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            # Not a valid literal: keep the text as a single offer.
            return [value]
    if isinstance(value, (list, tuple, np.ndarray)):
        return list(value)
    if pd.isna(value):
        return []
    return [value]


# Offers that are never reported for a cluster (loop-invariant, built once).
excluded = {'F3000G200M', 'F3000G100M', 'F1200G50M'}

cluster_offers = {}
# Best-matching unit (grid coordinates) of every sample.
winners = np.array([som.winner(x) for x in X])
# Flatten the (i, j) grid coordinates into one id: i * cols + j.
cluster_ids = winners[:, 0] * cols + winners[:, 1]

# Map back to original offers
df['cluster_id'] = cluster_ids
df['parsed_offers'] = original_offers.apply(_as_offer_list)

for cid, group in df.groupby('cluster_id'):
    counts = Counter(
        offer
        for offers in group['parsed_offers']
        for offer in offers
        if offer not in excluded
    )

    # Logic: keep the offers present in at least 20% of the cluster's rows;
    # when fewer than two qualify, fall back to the two most frequent offers.
    threshold = len(group) * 0.2
    top = [o for o, c in counts.items() if c >= threshold]
    cluster_offers[int(cid)] = top if len(top) >= 2 else [o for o, _ in counts.most_common(2)]

joblib.dump(cluster_offers, f'{ARTIFACT_DIR}/centroid_feature_map.joblib')
print(f"Pipeline complete. {len(cluster_offers)} clusters mapped.")