In [None]:
# Placeholder OpenAI credential; it is copied into API_KEY in the CONFIG section
# and handed to the OpenAI client. Set a real key locally and never commit it.
api_key="apikey"


In [None]:
import os
import pandas as pd
import numpy as np
from openai import OpenAI
from tensorflow.keras.models import load_model
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sentence_transformers import SentenceTransformer
from gensim.models import Word2Vec
import joblib  # <-- Usamos joblib

# ===== CONFIG =====
# The error message below refers to the OPENAI_API_KEY environment variable, so
# actually consult it first; the notebook-level `api_key` remains the fallback
# so existing notebooks keep working unchanged when the variable is not set.
API_KEY = os.environ.get("OPENAI_API_KEY") or api_key
if not API_KEY:
    raise ValueError("‚ö†Ô∏è No se encontr√≥ la variable de entorno OPENAI_API_KEY")

# Directories (relative to the working directory) holding the serialized
# preprocessing pipelines, the trained Keras models and the input CSV data.
PATH_PIPELINE = "pipelines"
PATH_MODELS = "models"
PATH_DATA = "data"


class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Convert raw text into Word2Vec, GloVe and SBERT embeddings.

    ``fit`` tokenizes the corpus, trains a Word2Vec model on the tokens and
    loads the optional GloVe vectors and the SBERT encoder. ``transform``
    returns a dict with the keys ``'w2v'``, ``'glove'`` and ``'sbert'``.

    Parameters
    ----------
    use_bigrams, use_trigrams : bool
        Whether underscore-joined 2-grams / 3-grams of the lemmas are appended
        to each document's token list.
    glove_path : str or None
        Text file with one ``word v1 v2 ...`` entry per line. When None no
        GloVe vectors are loaded and ``transform`` returns ``'glove': None``.
    sbert_model_name : str
        Model name handed to ``SentenceTransformer``.
    w2v_model_path : str or None
        Where ``fit`` saves the trained Word2Vec model and where ``transform``
        reloads it from when the model is not in memory.
    """

    # NLTK download id -> paths probed with ``nltk.data.find``. The POS-tagger
    # entries are required by ``pos_tag`` in ``_tokenize_series``. The ``.zip``
    # probes cover installs where the corpus is only present as an archive.
    # NOTE(review): the ``punkt_tab`` / ``*_eng`` ids are the names newer NLTK
    # releases look up; confirm against the NLTK version in use.
    _NLTK_RESOURCES = {
        'punkt': ('tokenizers/punkt',),
        'punkt_tab': ('tokenizers/punkt_tab',),
        'stopwords': ('corpora/stopwords',),
        'wordnet': ('corpora/wordnet', 'corpora/wordnet.zip'),
        'omw-1.4': ('corpora/omw-1.4', 'corpora/omw-1.4.zip'),
        'averaged_perceptron_tagger': ('taggers/averaged_perceptron_tagger',),
        'averaged_perceptron_tagger_eng': ('taggers/averaged_perceptron_tagger_eng',),
    }

    # Zero-vector size used by ``_avg_glove`` when no GloVe vectors are loaded.
    _DEFAULT_GLOVE_DIM = 100

    def __init__(self, use_bigrams=True, use_trigrams=True, glove_path=None,
                 sbert_model_name='all-MiniLM-L6-v2', w2v_model_path=None):
        self.use_bigrams = use_bigrams
        self.use_trigrams = use_trigrams
        self.glove_path = glove_path
        self.glove = {}
        self.sbert_model_name = sbert_model_name
        self.sbert_model = None
        self.w2v_model = None
        self.w2v_model_path = w2v_model_path

        # Initialize NLTK resources
        self._ensure_nltk_resources()
        self._init_nltk_components()

    # ----------------------------
    # Initialization helpers
    # ----------------------------
    @staticmethod
    def _nltk_resource_available(probe_paths):
        """Return True if ``nltk.data.find`` locates any of ``probe_paths``."""
        import nltk
        for path in probe_paths:
            try:
                nltk.data.find(path)
            except LookupError:
                continue
            return True
        return False

    def _ensure_nltk_resources(self):
        """Download every NLTK resource in ``_NLTK_RESOURCES`` that is missing."""
        import nltk
        for resource, probe_paths in self._NLTK_RESOURCES.items():
            if self._nltk_resource_available(probe_paths):
                continue
            print(f"Descargando recurso NLTK: {resource}")
            nltk.download(resource, quiet=True)

    def _init_nltk_components(self):
        """Create the English stop-word set and the WordNet lemmatizer."""
        from nltk.corpus import stopwords
        from nltk.stem import WordNetLemmatizer
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    # ----------------------------
    # Processing utilities
    # ----------------------------
    def _clean_text(self, text):
        """Lower-case ``text``, strip URLs and keep only letters and spaces."""
        import re
        text = str(text).lower()
        text = re.sub(r"http\S+", "", text)
        # Keep a-z, the space and the Spanish accented vowels, u-diaeresis and
        # n-tilde. They are written as \u escapes because the literal
        # characters had been corrupted by an encoding round-trip, which made
        # the pattern strip accented letters and keep stray symbols instead.
        text = re.sub(r"[^a-z\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1 ]", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def _get_wordnet_pos(self, tag):
        """Map a POS tag to a WordNet POS constant by its first letter (noun by default)."""
        from nltk.corpus import wordnet
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    def _tokenize_series(self, series):
        """Return one token list per text: lemmas plus the optional n-grams."""
        from nltk import pos_tag, ngrams
        from nltk.tokenize import word_tokenize
        all_tokens = []
        for text in series:
            text_clean = self._clean_text(text)
            tokens = word_tokenize(text_clean)
            tokens = [t for t in tokens if t.isalpha() and t not in self.stop_words]
            pos_tags = pos_tag(tokens)
            lemmas = [self.lemmatizer.lemmatize(t, self._get_wordnet_pos(pos)) for t, pos in pos_tags]

            # n-grams are appended after the unigram lemmas
            ngram_tokens = lemmas.copy()
            if self.use_bigrams:
                ngram_tokens.extend(['_'.join(bg) for bg in ngrams(lemmas, 2)])
            if self.use_trigrams:
                ngram_tokens.extend(['_'.join(tg) for tg in ngrams(lemmas, 3)])
            all_tokens.append(ngram_tokens)
        return all_tokens

    def _avg_vector(self, tokens, model):
        """Mean Word2Vec vector of the in-vocabulary tokens (zeros if there are none)."""
        vecs = [model.wv[w] for w in tokens if w in model.wv]
        return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

    def _load_glove(self):
        """Read ``glove_path`` into ``self.glove`` as ``{word: float32 vector}``."""
        self.glove = {}
        with open(self.glove_path, 'r', encoding='utf8') as f:
            for line in f:
                values = line.split()
                if len(values) < 2:
                    # Blank line or a word without a vector: nothing to store.
                    continue
                self.glove[values[0]] = np.asarray(values[1:], dtype='float32')

    def _glove_dim(self):
        """Length of the loaded GloVe vectors, or the default when none are loaded."""
        first_vector = next(iter(self.glove.values()), None)
        if first_vector is None:
            return self._DEFAULT_GLOVE_DIM
        return len(first_vector)

    def _avg_glove(self, tokens):
        """Mean GloVe vector of the known tokens (zeros if there are none)."""
        vecs = [self.glove[w] for w in tokens if w in self.glove]
        if vecs:
            return np.mean(vecs, axis=0)
        # Match the dimension of the loaded vectors so rows stack cleanly
        # even when the GloVe file is not 100-dimensional.
        return np.zeros(self._glove_dim())

    def _ensure_w2v_loaded(self):
        """Make ``self.w2v_model`` available, reloading it from disk if needed.

        Raises
        ------
        ValueError
            If the model is not in memory and cannot be loaded from
            ``w2v_model_path`` (path unset or file missing).
        """
        if self.w2v_model is not None:
            return
        if self.w2v_model_path and os.path.exists(self.w2v_model_path):
            self.w2v_model = Word2Vec.load(self.w2v_model_path)
            return
        raise ValueError("No se encontr√≥ el modelo Word2Vec. Ejecuta fit() primero.")

    # ----------------------------
    # Fit and transform
    # ----------------------------
    def fit(self, X, y=None):
        """Tokenize ``X``, train Word2Vec on it and load GloVe / SBERT.

        The tokenized corpus is kept on the instance as ``X_tokens_``.
        Returns ``self``.
        """
        print("üîÑ Entrenando TextPreprocessor...")
        self.X_tokens_ = self._tokenize_series(X)

        # Word2Vec
        print("üìù Entrenando Word2Vec...")
        self.w2v_model = Word2Vec(
            sentences=self.X_tokens_,
            vector_size=100,
            window=5,
            min_count=2,
            workers=1
        )

        if self.w2v_model_path:
            self.w2v_model.save(self.w2v_model_path)
            print(f"‚úÖ Word2Vec guardado en {self.w2v_model_path}")

        # Load GloVe when a path was given
        if self.glove_path:
            self._load_glove()

        # Load SBERT
        print("ü§ñ Cargando modelo SBERT...")
        self.sbert_model = SentenceTransformer(self.sbert_model_name)
        print("‚úÖ TextPreprocessor entrenado exitosamente")
        return self

    def transform(self, X):
        """Embed every text in ``X``.

        Returns
        -------
        dict
            ``'w2v'``: array of averaged Word2Vec vectors; ``'glove'``: array
            of averaged GloVe vectors, or None when no GloVe vectors are
            loaded; ``'sbert'``: output of ``SentenceTransformer.encode`` on
            the raw texts.

        Raises
        ------
        ValueError
            If no Word2Vec model is available (see ``_ensure_w2v_loaded``).
        """
        self._ensure_w2v_loaded()

        if self.sbert_model is None:
            self.sbert_model = SentenceTransformer(self.sbert_model_name)

        tokens = self._tokenize_series(X)
        X_w2v = np.array([self._avg_vector(t, self.w2v_model) for t in tokens])
        X_glove = np.array([self._avg_glove(t) for t in tokens]) if self.glove else None
        # Coerce with str() exactly like _clean_text does, so any iterable
        # (not only a pandas Series) and non-string cells are accepted.
        raw_texts = [str(text) for text in X]
        X_sbert = self.sbert_model.encode(raw_texts, batch_size=32, show_progress_bar=False)

        return {'w2v': X_w2v, 'glove': X_glove, 'sbert': X_sbert}

    # ----------------------------
    # Safe serialization
    # ----------------------------
    def __getstate__(self):
        """Return the instance state without the heavyweight runtime objects."""
        state = self.__dict__.copy()
        if self.w2v_model_path:
            # transform() reloads the model from this path on demand. Without
            # a path the model is kept in the state, because dropping it
            # would make the unpickled object unusable.
            state['w2v_model'] = None
        state['sbert_model'] = None
        state['stop_words'] = None
        state['lemmatizer'] = None
        return state

    def __setstate__(self, state):
        """Restore the state and rebuild the NLTK helpers; SBERT loads lazily in transform()."""
        self.__dict__.update(state)
        # Keep a Word2Vec model that was stored in the state; otherwise it
        # stays None and transform() loads it from w2v_model_path.
        self.__dict__.setdefault('w2v_model', None)
        self.sbert_model = None
        print("üîÑ Reinicializando recursos NLTK...")
        self._ensure_nltk_resources()
        self._init_nltk_components()
        print("‚úÖ TextPreprocessor deserializado correctamente")


class DateFeatureGenerator(BaseEstimator, TransformerMixin):
    """
    Adds 'TimeSinceLastTransaction' and 'DaysSinceLast', derived from the
    difference between the two transaction date columns.
    Must run BEFORE DropColumns, which removes the date columns it needs.
    """
    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, X):
        # Work on a copy so the caller's dataframe is left untouched.
        frame = X.copy()

        date_cols = ('TransactionDate', 'PreviousTransactionDate')
        # Without both date columns the copy is returned unchanged.
        if not all(col in frame.columns for col in date_cols):
            return frame

        # Make sure both columns are datetimes before subtracting.
        for col in date_cols:
            frame[col] = pd.to_datetime(frame[col])

        # The sign is intentional: previous date minus transaction date.
        elapsed = frame['PreviousTransactionDate'] - frame['TransactionDate']
        frame['TimeSinceLastTransaction'] = elapsed

        # Express the difference in days (total seconds / 86400).
        frame['DaysSinceLast'] = elapsed.dt.total_seconds() / 86400

        return frame

class DropColumns(BaseEstimator, TransformerMixin):
    """
    Removes the columns listed in ``self.columns`` from the customer dataset.
    Columns that are not present in the input are skipped silently.
    """
    def __init__(self):
        first_group = [
            'CustomerID', 'Id Complain', 'Id Interaction', 'date_received',
            'Survey date', 'Twitter', 'NPS', 'product', 'sub_product',
            'issue', 'sub_issue', 'Gender',
        ]
        second_group = [
            'TransactionID', 'AccountID', 'DeviceID', 'IP Address',
            'MerchantID', "TransactionDate", "PreviousTransactionDate",
            'TimeSinceLastTransaction',
        ]
        self.columns = first_group + second_group

    def fit(self, X, y=None):
        # Nothing to learn: the column list is fixed at construction time.
        return self

    def transform(self, X):
        # errors='ignore' tolerates inputs that lack some of the listed columns.
        trimmed = X.drop(columns=self.columns, errors='ignore')
        return trimmed


class DynamicPreprocessor(BaseEstimator, TransformerMixin):
    """Min-max scale numeric columns and one-hot encode object columns.

    The columns are discovered at ``fit`` time from the dtypes of ``X``:
    ``float64``/``int64`` columns go to a ``MinMaxScaler`` and ``object``
    columns go to a ``OneHotEncoder``. Columns of any other dtype do not
    appear in the output of ``transform``.
    """

    def __init__(self):
        self.num_cols = []
        self.cat_cols = []
        self.num_scaler = MinMaxScaler()
        self.cat_encoder = OneHotEncoder(sparse_output=False)
        self.cat_feature_names = []

    def fit(self, X, y=None):
        """Record the numeric/categorical columns of ``X`` and fit both encoders."""
        self.num_cols = X.select_dtypes(include=['float64','int64']).columns.tolist()
        self.cat_cols = X.select_dtypes(include=['object']).columns.tolist()
        # Reset before refitting: if an earlier fit saw categorical columns
        # and this one does not, stale names would make transform() build a
        # column list longer than the data it produces.
        self.cat_feature_names = []

        if self.num_cols:
            self.num_scaler.fit(X[self.num_cols])
        if self.cat_cols:
            self.cat_encoder.fit(X[self.cat_cols])
            self.cat_feature_names = self.cat_encoder.get_feature_names_out(self.cat_cols)
        return self

    def transform(self, X):
        """Return a DataFrame of scaled numeric columns followed by the one-hot columns.

        The result keeps the index of ``X``. An empty ``(len(X), 0)`` block
        stands in for a column group that was absent at fit time.
        """
        num_part = self.num_scaler.transform(X[self.num_cols]) if self.num_cols else np.empty((len(X),0))
        cat_part = self.cat_encoder.transform(X[self.cat_cols]) if self.cat_cols else np.empty((len(X),0))
        data = np.hstack([num_part, cat_part])
        columns = self.num_cols + list(self.cat_feature_names)
        return pd.DataFrame(data, columns=columns, index=X.index)


# ===== CREDITADVISOR CLASS =====
class CreditAdvisor:
    """Combine sentiment models, a recommendation model and GPT into one verdict.

    The constructor loads three Keras sentiment models (GloVe, SBERT,
    Word2Vec), the Keras recommendation model and the two serialized
    preprocessing pipelines, and creates the OpenAI client.
    """

    # Score at or above which the recommendation model's decision is "APROBAR".
    APPROVAL_THRESHOLD = 0.6
    # Chat model used to write the explanation.
    GPT_MODEL = "gpt-3.5-turbo"

    def __init__(self, api_key: str):
        # Sentiment models
        self.model_glove = load_model(os.path.join(PATH_MODELS, "2/GLOVE.keras"))
        self.model_sbert = load_model(os.path.join(PATH_MODELS, "2/SBERT.keras"))
        self.model_w2v = load_model(os.path.join(PATH_MODELS, "2/Word2Vec.keras"))
        # Recommendation model
        self.model_recommend = load_model(os.path.join(PATH_MODELS, "1/recommend.keras"))
        # Pipelines
        # NOTE(review): joblib.load unpickles arbitrary objects; only load
        # pipeline files from a trusted location.
        self.text_pipeline = joblib.load(os.path.join(PATH_PIPELINE, "text_pipeline.joblib"))
        self.data_pipeline = joblib.load(os.path.join(PATH_PIPELINE, "pipeline_bankchurner_preprocessing.joblib"))
        # OpenAI client
        self.client = OpenAI(api_key=api_key)

    @staticmethod
    def _as_sentiment(pred):
        """Turn a 3-element prediction into a ``{'neg', 'neu', 'pos'}`` dict of floats."""
        return {"neg": float(pred[0]), "neu": float(pred[1]), "pos": float(pred[2])}

    def _build_user_prompt(self, raw_text, client_row, pred_glove, pred_sbert,
                           pred_w2v, avg_positive, credit_score, decision,
                           threshold):
        """Render the user message sent to the chat model."""
        return f"""
Cliente:
{client_row.to_dict()}

An√°lisis de sentimiento:
- GloVe ‚Üí Neg: {pred_glove[0]:.2f}, Neutro: {pred_glove[1]:.2f}, Pos: {pred_glove[2]:.2f}
- SBERT ‚Üí Neg: {pred_sbert[0]:.2f}, Neutro: {pred_sbert[1]:.2f}, Pos: {pred_sbert[2]:.2f}
- Word2Vec ‚Üí Neg: {pred_w2v[0]:.2f}, Neutro: {pred_w2v[1]:.2f}, Pos: {pred_w2v[2]:.2f}
Promedio de positividad general: {avg_positive:.2f}

Modelo de recomendaci√≥n:
- Score del modelo: {credit_score:.2f}
- Umbral de aprobaci√≥n: {threshold:.2f}
- Decisi√≥n autom√°tica del modelo: {decision}

Texto del cliente:
"{raw_text}"

Con base en todo lo anterior:
1. Indica si se debe otorgar la tarjeta de cr√©dito.
2. Da una breve justificaci√≥n (m√°ximo 3 frases).
3. Menciona 5 caracter√≠sticas claves que pueden influir en la decisi√≥n y por qu√© lo son.
4. Finaliza con: 'Recomendaci√≥n final: APROBAR' o 'Recomendaci√≥n final: RECHAZAR'.
"""

    def analyze_client(self, raw_text: str, client_row: pd.Series,
                       threshold: float = APPROVAL_THRESHOLD):
        """Score one client and ask the chat model for a short explanation.

        Parameters
        ----------
        raw_text : str
            Free text written by the client; embedded by the text pipeline.
        client_row : pd.Series
            One client record; it is turned into a one-row DataFrame and
            passed through the data pipeline.
        threshold : float
            Minimum recommendation score for an "APROBAR" decision. The same
            value is reported in the prompt.

        Returns
        -------
        dict
            Keys ``sentiment`` (per-model neg/neu/pos floats),
            ``avg_positive``, ``credit_score``, ``decision_model`` and
            ``gpt_explanation``.

        Raises
        ------
        ValueError
            If the text pipeline returns no GloVe embeddings.
        """
        # Embed the text
        text_features = self.text_pipeline.transform(pd.Series([raw_text]))
        emb_w2v = text_features['w2v']
        emb_glove = text_features['glove']
        emb_sbert = text_features['sbert']
        if emb_glove is None:
            # The text pipeline yields None when it has no GloVe vectors; fail
            # here with a clear message instead of inside the Keras model.
            raise ValueError(
                "El pipeline de texto no produjo embeddings 'glove'; "
                "reentrena el pipeline con glove_path."
            )

        # Sentiment predictions (first row of each model's output)
        pred_glove = self.model_glove.predict(emb_glove, verbose=0)[0]
        pred_sbert = self.model_sbert.predict(emb_sbert, verbose=0)[0]
        pred_w2v = self.model_w2v.predict(emb_w2v, verbose=0)[0]

        sentiment_results = {
            "GLOVE": self._as_sentiment(pred_glove),
            "SBERT": self._as_sentiment(pred_sbert),
            "Word2Vec": self._as_sentiment(pred_w2v),
        }
        # Plain float, like every other number in the result, so the dict
        # does not leak a NumPy scalar to callers.
        avg_positive = float(np.mean([pred_glove[2], pred_sbert[2], pred_w2v[2]]))

        # Preprocess the client record
        df_input = client_row.to_frame().T
        df_processed = self.data_pipeline.transform(df_input)

        # Recommendation model
        credit_score = float(self.model_recommend.predict(df_processed, verbose=0)[0][0])
        decision = "APROBAR" if credit_score >= threshold else "RECHAZAR"

        # GPT prompt
        system_prompt = (
            "Eres un analista financiero especializado en riesgo crediticio. "
            "Tu tarea es dar una recomendaci√≥n final, breve y clara, sobre otorgar una tarjeta de cr√©dito. "
            "Eval√∫a los sentimientos del cliente y su perfil financiero, "
            "pero enf√≥cate en la conclusi√≥n pr√°ctica. Responde en tono profesional y conciso."
        )
        # NOTE(review): the prompt contains the full client record and the raw
        # text, which are sent to the OpenAI API; confirm this is acceptable
        # for the data involved.
        user_prompt = self._build_user_prompt(
            raw_text, client_row, pred_glove, pred_sbert, pred_w2v,
            avg_positive, credit_score, decision, threshold,
        )

        # GPT call
        response = self.client.chat.completions.create(
            model=self.GPT_MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.3,
            top_p=0.7,
            max_tokens=300,
            frequency_penalty=0.3,
            presence_penalty=0.2,
        )

        # The message content may be None; treat that as an empty explanation
        # instead of crashing on .strip().
        explanation = (response.choices[0].message.content or "").strip()

        return {
            "sentiment": sentiment_results,
            "avg_positive": avg_positive,
            "credit_score": credit_score,
            "decision_model": decision,
            "gpt_explanation": explanation
        }


# ===== TEST EXAMPLE =====
if __name__ == "__main__":
    advisor = CreditAdvisor(api_key=API_KEY)

    # Load the dataset, discard the NPS column and keep only rows with a tweet.
    dataset_path = os.path.join(PATH_DATA, "BankChurners_merged.csv")
    df = pd.read_csv(dataset_path).drop(columns=['NPS']).dropna(subset=['Twitter'])

    # Pick one client: the tweet is the text input, the rest is the record.
    sample = df.iloc[5]
    text_input = sample['Twitter']
    client_row = sample.drop('Twitter')

    # Run the analysis
    result = advisor.analyze_client(text_input, client_row)

    # Show the results: (printed label, key in the result dict)
    report = (
        ("Sentimientos:", "sentiment"),
        ("Promedio positividad:", "avg_positive"),
        ("Score modelo recomendaci√≥n:", "credit_score"),
        ("Decisi√≥n del modelo:", "decision_model"),
    )
    print("\nüîπ RESULTADOS üîπ")
    for label, key in report:
        print(label, result[key])
    print("\n--- An√°lisis GPT ---\n")
    print(result["gpt_explanation"])


2025-11-25 08:06:44.328061: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm
I0000 00:00:1764076010.685481   35442 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1709 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


🔄 Reinicializando recursos NLTK después de deserialización...
📥 Descargando recurso NLTK: wordnet
📥 Descargando recurso NLTK: omw-1.4
✅ TextPreprocessor deserializado correctamente
📂 Cargando Word2Vec desde pipelines/word2vec_model.bin
🤖 Cargando modelo SBERT: all-MiniLM-L6-v2


2025-11-25 08:07:08.905484: I external/local_xla/xla/service/service.cc:163] XLA service 0x73fd600024f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-11-25 08:07:08.905496: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Laptop GPU, Compute Capability 8.6
2025-11-25 08:07:08.916082: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-11-25 08:07:08.955773: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91002
I0000 00:00:1764076029.221721   35652 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.



🔹 RESULTADOS 🔹
Sentimientos: {'GLOVE': {'neg': 4.971271394538235e-08, 'neu': 1.0333885830382528e-10, 'pos': 1.0}, 'SBERT': {'neg': 1.925584865170027e-11, 'neu': 2.604151116969433e-09, 'pos': 1.0}, 'Word2Vec': {'neg': 2.2951690326067364e-08, 'neu': 2.3197045413009265e-12, 'pos': 1.0}}
Promedio positividad: 1.0
Score modelo recomendación: 1.0
Decisión del modelo: APROBAR

--- Análisis GPT ---

1. Se debe otorgar la tarjeta de crédito.
2. El cliente muestra una actitud positiva y agradecida, lo cual indica una buena disposición para mantener una relación financiera.
3. Características clave: historial de pagos positivo, bajo nivel de endeudamiento, alta actividad de transacciones, estabilidad laboral y satisfacción con el servicio al cliente.
4. Recomendación final: APROBAR
