# NOTEBOOK GUÍA EXPONENCIAL


In [None]:
import pandas as pd # Librería para estructurar la información
import numpy as np #Librería manipulación de array y matrices
from tqdm import tqdm # Utilizado para evaluar el progreso de un proceso
import re #Utilizada para preprocesamiento (eliminación substitución de palabras)
import gensim #Librería para extraccion de características de texto Word2Vec
from sklearn.model_selection import train_test_split #Dividir data en entrenamiento y testeo
from sklearn.ensemble import RandomForestClassifier #Modelo de Clasificación
from google.colab import files # Importar archivos desde drive
import nltk
nltk.download('punkt') #Complemento necesario para Tokenize
import gensim.models.word2vec as w2v


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **1.LECTURA DE DATOS**

In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive so files stored there can be read by path.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Only the complaint text and its product label are used in this notebook, so
# ask pandas to parse just those two columns instead of every column of the CSV.
columnas = ['Consumer complaint narrative', 'Product']
df = pd.read_csv(
    '/content/gdrive/My Drive/Colab Notebooks/Monitoria Exponencial/complaints.csv',
    usecols=columnas,
)
# usecols ignores the order of the list and keeps the file's column order, so
# re-select explicitly to get the narrative first and the product second.
df = df[columnas]  # Data as a DataFrame

In [None]:
# Preview the complaint narrative column as loaded from the CSV.
df['Consumer complaint narrative']

0          transworld systems inc. \nis trying to collect...
1                                                        NaN
2          I would like to request the suppression of the...
3          Over the past 2 weeks, I have been receiving e...
4                                                        NaN
                                 ...                        
1727578    I was on automatic payment for my car loan. In...
1727579    I recieved a collections call from an unknown ...
1727580    On XXXX XXXX, 2015, I contacted XXXX XXXX, who...
1727581    I can not get from chase who services my mortg...
1727582    I made a payment to CITI XXXX Credit Card on X...
Name: Consumer complaint narrative, Length: 1727583, dtype: object

In [None]:
# Keep only the first 10 rows as a small working sample. The explicit .copy()
# makes the sample independent of the full frame, so the column assignments in
# the following cells do not trigger pandas' SettingWithCopyWarning.
df = df.iloc[:10].copy()

# **2.PREPROCESAMIENTO**

In [None]:
# Normalize the complaint text before tokenization.
# Missing narratives become empty strings rather than the literal text "nan",
# so they do not contribute a spurious "nan" token further down the pipeline.
narrativa = df['Consumer complaint narrative'].fillna('').map(lambda x: str(x).lower())  # Lower-case the text
# Replace every character that is not a letter, underscore or whitespace
# (digits, punctuation, symbols) with a space. The previous pattern was wrapped
# in JavaScript-style /.../ delimiters, which Python treats as literal slashes,
# so it could never match and no character was actually removed.
narrativa = narrativa.map(lambda x: re.sub(r'[^ñA-Za-z_\s]', ' ', x))
# assign() builds a new frame instead of writing into what may be a slice of
# the originally loaded DataFrame.
df = df.assign(**{'Consumer complaint narrative': narrativa})

#re.sub(r'[^\w]', ' ', s)

In [None]:
# Display the sampled DataFrame after the text normalization step.
df

Unnamed: 0,Consumer complaint narrative,Product
0,transworld systems inc. \nis trying to collect...,Debt collection
1,,"Credit reporting, credit repair services, or o..."
2,i would like to request the suppression of the...,"Credit reporting, credit repair services, or o..."
3,"over the past 2 weeks, i have been receiving e...",Debt collection
4,,Vehicle loan or lease
5,,Debt collection
6,,"Credit reporting, credit repair services, or o..."
7,,Credit card or prepaid card
8,"i was sold access to an event digitally, of wh...","Money transfer, virtual currency, or money ser..."
9,this complaint dates back to last xx/xx/xxxx d...,Mortgage


# **3.TOKENIZACIÓN**

In [None]:
# A continuación se lleva a cabo el proceso de Tokenización

from nltk.corpus import stopwords
def tokenize_text(text):
    """Split ``text`` into lower-cased word tokens.

    The text is segmented into sentences with ``nltk.sent_tokenize`` and each
    sentence is split into words with ``nltk.word_tokenize``. Tokens shorter
    than two characters are discarded.

    Args:
        text: String to tokenize.

    Returns:
        list: Lower-cased tokens in order of appearance.
    """
    return [
        token.lower()
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
        if len(token) >= 2
    ]

In [None]:
# Turn every narrative into a list of word tokens; values are cast to str
# first so that any non-string entry is tokenized by its text representation.
df['Consumer complaint narrative'] = (
    df['Consumer complaint narrative']
    .astype(str)
    .map(tokenize_text)
)

In [None]:
# Preview the narrative column, which now holds one token list per row.
df['Consumer complaint narrative']

0    [transworld, systems, inc., is, trying, to, co...
1                                                [nan]
2    [would, like, to, request, the, suppression, o...
3    [over, the, past, weeks, have, been, receiving...
4                                                [nan]
5                                                [nan]
6                                                [nan]
7                                                [nan]
8    [was, sold, access, to, an, event, digitally, ...
9    [this, complaint, dates, back, to, last, xx/xx...
Name: Consumer complaint narrative, dtype: object

# **4.WORD EMBEDDINGS - Word2Vec**

In [None]:
from gensim.models import Word2Vec

# Train Word2Vec on the tokenized narratives (one token list per row) and keep
# only the resulting word vectors (.wv); the full model object is not retained.
# NOTE(review): `size` is the gensim 3.x keyword for the vector dimensionality;
# confirm the installed gensim version before changing it.
modelo_prueba = Word2Vec(
    sentences=df['Consumer complaint narrative'],
    size=100,
    window=5,
    min_count=3,
    workers=4,
).wv

In [None]:
# Show the vocabulary mapping held by the trained word vectors.
modelo_prueba.vocab

{'account': <gensim.models.keyedvectors.Vocab at 0x7fca905986a0>,
 'actual': <gensim.models.keyedvectors.Vocab at 0x7fca90598828>,
 'amount': <gensim.models.keyedvectors.Vocab at 0x7fca9130ffd0>,
 'an': <gensim.models.keyedvectors.Vocab at 0x7fca9130ff98>,
 'analysis': <gensim.models.keyedvectors.Vocab at 0x7fca90598eb8>,
 'and': <gensim.models.keyedvectors.Vocab at 0x7fca9373cb38>,
 'are': <gensim.models.keyedvectors.Vocab at 0x7fca9130f358>,
 'as': <gensim.models.keyedvectors.Vocab at 0x7fca9130f278>,
 'at': <gensim.models.keyedvectors.Vocab at 0x7fca9130f0f0>,
 'balance': <gensim.models.keyedvectors.Vocab at 0x7fca90598e48>,
 'be': <gensim.models.keyedvectors.Vocab at 0x7fca9130f208>,
 'by': <gensim.models.keyedvectors.Vocab at 0x7fca9130fe48>,
 'cooper': <gensim.models.keyedvectors.Vocab at 0x7fca90598a58>,
 'correct': <gensim.models.keyedvectors.Vocab at 0x7fca905985f8>,
 'credit': <gensim.models.keyedvectors.Vocab at 0x7fca9130f748>,
 'difference': <gensim.models.keyedvectors.Voc

In [None]:
# Look up the embedding vector stored for the word "phone".
# NOTE(review): presumably fails if "phone" is not in the vocabulary — confirm.
modelo_prueba["phone"]

array([-0.00511641,  0.00243095,  0.00151465, -0.00129431,  0.0032335 ,
       -0.00348065,  0.00509352, -0.00295665, -0.00018019,  0.00517285,
       -0.00031094, -0.00440564, -0.00471425,  0.00293176, -0.00017674,
        0.00128519,  0.00030311,  0.00453284, -0.00243476,  0.00291448,
        0.00216784, -0.00016371,  0.00343065, -0.00028338, -0.00309713,
       -0.00150332, -0.00078946,  0.00430862, -0.00048648, -0.002056  ,
       -0.00016738, -0.00190523,  0.00017988, -0.00242709, -0.00419664,
        0.00222267, -0.00201584, -0.00014855,  0.00166449, -0.00107281,
        0.00470768,  0.00469129,  0.00137404,  0.00123858,  0.00280231,
        0.00028173, -0.0006414 ,  0.00090739, -0.00250484,  0.00424373,
       -0.00035868,  0.00490559,  0.00308028, -0.00468329, -0.00045184,
        0.00422505,  0.00460443,  0.00088921,  0.0025424 ,  0.00156261,
       -0.00220892,  0.00211557,  0.00249605,  0.00444781,  0.00104333,
       -0.00202434,  0.001619  ,  0.00012122, -0.00263355,  0.00