In [11]:
import pandas as pd
import numpy as np
import string
import re
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from gensim.models import FastText 
from tqdm import tqdm
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then create the PyDrive client.
# The client is configured with the application-default Google credentials
# and is used later in the notebook to fetch a file from Google Drive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [2]:
# Install extra packages into the runtime (shell commands issued via `!`).
# NOTE(review): textblob and catboost are not imported in the visible cells —
# confirm they are still needed, and consider pinning versions.
!pip install -U textblob
!pip install nltk 
!pip install catboost

import nltk
# Download NLTK data packages. 'stopwords' is read further down via
# nltk.corpus.stopwords; 'wordnet' and 'punkt' are not referenced in the
# visible cells.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

Requirement already up-to-date: textblob in /usr/local/lib/python3.6/dist-packages (0.15.3)
Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/b2/aa/e61819d04ef2bbee778bf4b3a748db1f3ad23512377e43ecfdc3211437a0/catboost-0.23.2-cp36-none-manylinux1_x86_64.whl (64.8MB)
[K     |████████████████████████████████| 64.8MB 64kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.23.2
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Feature tables for the train and test splits.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_features.csv', 'test_features.csv')
)

# Companion tables whose 'text' column is attached to the feature tables below.
text_train, text_test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_limpio.csv', 'test_limpio.csv')
)

In [4]:
# Attach the 'text' column from the *_limpio tables to the feature tables.
# pandas aligns the assigned Series on the row index; both frames were read
# with read_csv's default index, so rows are matched by position.
train['text'] = text_train['text']
test['text'] = text_test['text']

In [5]:
# NLP steps:
# 1: DATA FILTERING - partly done in the data-cleaning stage
# 2: TOKENIZATION
# 3: LEMMATIZATION
# NOTE(review): the object created below is a Porter *stemmer*, so step 3 is
# stemming rather than lemmatization.

# English stop-word list from NLTK.
# NOTE(review): `stopwords` is not referenced by nlp_text in the visible cells.
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()  # stemmer used by nlp_text

def nlp_text(text):
    """Normalize a string into a list of stemmed, lower-cased tokens.

    Steps: remove every character found in ``string.punctuation``, lower-case
    the rest, split on runs of non-word characters, and stem each token with
    the module-level Porter stemmer ``ps``.

    Args:
        text: Input string.

    Returns:
        List of stemmed tokens. Empty strings that ``re.split`` produces at
        the edges of the input (for example with leading or trailing
        whitespace) are dropped, so an empty or whitespace-only input yields
        ``[]``.
    """
    # Punctuation filtering: iterate character by character.
    text = "".join(char.lower() for char in text if char not in string.punctuation)
    # Tokenization. Raw string so `\W` reaches the regex engine as an escape
    # instead of being an invalid Python string escape.
    tokens = re.split(r'\W+', text)
    # Stemming (Porter), skipping the empty tokens emitted by re.split.
    return [ps.stem(word) for word in tokens if word]

PARA TRAIN

In [6]:
# Replace missing text with a single space so that later string operations
# (get_average calls `text.split(" ")`) always receive a str.
# Assign the result back to the column instead of calling
# `fillna(inplace=True)` on the column accessor: the chained in-place form
# warns on pandas 2.x and does not modify the frame under Copy-on-Write.
train['text'] = train['text'].fillna(' ')
test['text'] = test['text'].fillna(' ')

In [9]:
# Google Drive file id of the word-vector file to download.
# Named `fasttext_file_id` rather than `id` so the builtin `id()` is not
# shadowed for the rest of the kernel session.
fasttext_file_id = '1eWJLOjsG04oOtYXJ6eta7VnudJPIlBh_'
downloaded = drive.CreateFile({'id': fasttext_file_id})
# Save the Drive file locally under the name load_fasttext() opens by default.
downloaded.GetContentFile('wiki-news-300d-1M-subword.vec')

In [12]:
def load_fasttext(path='wiki-news-300d-1M-subword.vec', show_progress=True):
    """Load word vectors from a fastText text-format ``.vec`` file.

    Every data line has the form ``<word> <v1> <v2> ...`` separated by single
    spaces. fastText ``.vec`` files start with a ``<vocab_size> <dim>`` header
    line; that line is skipped so it is not stored as a bogus one-element
    "word vector".

    Args:
        path: Path of the ``.vec`` file. Defaults to the file name used by
            this notebook.
        show_progress: When true, wrap the line iterator in ``tqdm`` to show
            a progress counter.

    Returns:
        dict mapping each word (str) to a 1-D float32 ``numpy`` array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    print('loading word embeddings...')
    embeddings_index = {}
    # `with` closes the handle even if parsing a line raises.
    with open(path, encoding='utf-8') as f:
        lines = tqdm(f) if show_progress else f
        for line_number, line in enumerate(lines):
            values = line.strip().split(' ')
            # Header line: exactly two integer fields on the first line.
            if (line_number == 0 and len(values) == 2
                    and values[0].isdigit() and values[1].isdigit()):
                continue
            # Blank or vector-less lines carry no embedding.
            if len(values) < 2:
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('found %s word vectors' % len(embeddings_index))

    return embeddings_index

# Build the word -> vector lookup once; get_average() reads this global.
embeddings_index = load_fasttext()

2487it [00:00, 12389.95it/s]

loading word embeddings...


999995it [01:19, 12582.17it/s]

found 999995 word vectors





In [13]:
def get_average(text, embeddings=None, dim=300):
    """Average the word vectors of a text.

    The text is split on single spaces. Vectors of tokens found in the
    embedding table are summed and the sum is divided by the total number of
    tokens. Tokens without a vector (including the empty strings produced by
    consecutive spaces) still count in the denominator, so they pull the
    average towards zero.

    Args:
        text: Input string.
        embeddings: Mapping from word to 1-D vector. Defaults to the
            notebook-level ``embeddings_index``.
        dim: Length of the vectors and of the returned array (default 300).

    Returns:
        1-D float ``numpy`` array of length ``dim``.
    """
    if embeddings is None:
        embeddings = embeddings_index
    counter = np.zeros(dim)
    words = text.split(" ")
    for word in words:
        vector = embeddings.get(word)
        # Ignore entries whose shape does not match (e.g. a stray header row
        # stored as a 1-element vector), which would otherwise be broadcast
        # silently into every component of the sum.
        if vector is not None and np.shape(vector) == counter.shape:
            counter += vector
    # str.split(" ") always returns at least one element, so no zero division.
    return counter / len(words)

In [14]:
# One averaged word vector per training row; get_average is passed directly.
train['average_words'] = train['text'].apply(get_average)

In [16]:
# Stack the per-row average vectors into one 2-D array (one row per sample).
# A plain ndarray is built instead of np.matrix, which NumPy no longer
# recommends; np.hstack and pd.DataFrame below accept it unchanged.
matrix_ft_train = np.vstack(train['average_words'].tolist())
matrix_ft_train.shape

(7613, 300)

In [17]:
# Numeric columns that are appended after the averaged word vectors.
numeric_columns = [
    'cant_stop_words',
    'prom_long_palabra',
    'cant_puntuacion',
    'cant_apariciones_keyword',
    'cant_numeros',
    'cant_mayusculas',
    'cant_vocales',
]
numeric_features = train[numeric_columns]
numeric_features

Unnamed: 0,cant_stop_words,prom_long_palabra,cant_puntuacion,cant_apariciones_keyword,cant_numeros,cant_mayusculas,cant_vocales
0,6,4.384615,1,61,0,10,25
1,0,4.571429,1,61,0,5,13
2,11,5.090909,3,61,0,2,45
3,1,7.125000,2,61,5,1,24
4,7,4.500000,2,61,0,3,25
...,...,...,...,...,...,...,...
7608,2,6.636364,5,61,1,7,20
7609,9,5.300000,5,61,0,6,39
7610,2,7.250000,11,61,9,10,12
7611,5,6.263158,5,61,0,4,49


In [18]:
# Concatenate column-wise: averaged word vectors first, numeric columns after.
matrix_features = np.hstack((matrix_ft_train, numeric_features))

In [20]:
# Wrap the combined matrix in a DataFrame (default integer column labels)
# and write it to CSV without the row index.
features_train_ft = pd.DataFrame(matrix_features)
features_train_ft.to_csv('features_train_ft.csv', index=False)

In [21]:
# Sanity check: expect 300 embedding columns + 7 numeric columns = 307.
features_train_ft.shape

(7613, 307)

In [22]:
# Hand the generated training-feature CSV to Colab's file-download helper.
from google.colab import files
files.download('features_train_ft.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

PARA TEST

In [23]:
# Same feature construction as for the training split, applied to `test`.

# One averaged word vector per test row (get_average passed directly; the
# lambda wrapper added nothing).
test['average_words'] = test['text'].apply(get_average)
# Stack into a plain 2-D ndarray; np.matrix is no longer recommended by NumPy.
matrix_ft_test = np.vstack(test['average_words'].tolist())

numeric_features_test = test[['cant_stop_words', 'prom_long_palabra', 'cant_puntuacion', 'cant_apariciones_keyword', 'cant_numeros', 'cant_mayusculas', 'cant_vocales']]

# Column-wise concatenation: word-vector columns first, numeric columns after.
matrix_features_test = np.hstack((matrix_ft_test, numeric_features_test))

# Persist without the row index, mirroring the training-feature file.
matriz_ft_test = pd.DataFrame(matrix_features_test)
matriz_ft_test.to_csv('features_test_ft.csv', index=False)

In [25]:
# Sanity check: the column count should match the training features (307).
matriz_ft_test.shape

(3263, 307)

In [26]:
# Hand the generated test-feature CSV to Colab's file-download helper.
# NOTE(review): `files` is already imported in an earlier cell; this repeat
# import is redundant but harmless.
from google.colab import files
files.download('features_test_ft.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>