# FGV EMAP

## Modelagem e mineração de dados


### Trabalho Kaggle Quora

#### Alunos: Antonio Sombra e Joao Marcos

##### https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words

### Carregando pacotes necessários

In [1]:

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

#strings
import string
import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk import word_tokenize, ngrams
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from fuzzywuzzy import fuzz


#Extras
import os
import sys
import re
import pickle
import logging

#basic
import numpy as np
import pandas as pd
import xgboost as xgb
import string
import math


# Visualisation
import pylab
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from matplotlib.font_manager import FontProperties
import seaborn as sns

# Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.cross_validation import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score as AUC
import gensim
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.models import Word2Vec


%matplotlib inline



#### Carregando banco de dados

In [2]:
datapath = "/Dados/Kaggle/" #Diretório da base de dados
word2vec= "/Dados/Word2vec" #diretório word2vec
save=  "/home/joaomarcosest/Kaggle_Quora/Dados" #diterório para criação de dados auxiliares
data_train = pd.read_csv(os.path.join(datapath, 'train.csv'))
data_train=data_train.drop(['id','qid1','qid2'],axis=1)# #Eliminando colunas desnecessária à análise
data_train.sample(3)

Unnamed: 0,question1,question2,is_duplicate
361422,What are some of your favourite poems?,What is your favourite poem and why?,1
187384,Why sex is so important in life?,Why is sex given such importance in relationsh...,0
130008,Do employees at Inventure Foods have a good wo...,Do employees at B&G Foods have a good work-lif...,0


In [3]:
data_train.info()
data_train.describe()

# 36.92% do banco (média de is_duplicate = 0.369198) são questões classificadas como duplicadas
# O banco possui duas questões NA (ambas em question2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 3 columns):
question1       404290 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(1), object(2)
memory usage: 9.3+ MB


Unnamed: 0,is_duplicate
count,404290.0
mean,0.369198
std,0.482588
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [None]:
print(data_train.columns)# Verificando as colunas do banco
type(data_train)

In [None]:
# Choosing the list of stopwords
mystopwords = nltk.corpus.stopwords.words('english')# load NLTK's English stopword list
list_of_words= ['where','what','when','why','between','who','how','which']# words taken OUT of the stopword list, so they are kept in the questions
for item in list_of_words:
        # list.remove raises ValueError if the word is not in the list
        mystopwords.remove(item)

### Funçoes necessárias para limpeza do banco

In [6]:
def remove_stopwords(phrase,list_stopwords):
    """
    Drop every stopword from a phrase.

    The phrase is split on single spaces, so consecutive spaces yield empty
    tokens; those are kept unless "" is itself listed as a stopword.

    :param phrase: String to filter.
    :param list_stopwords: Container of words to drop (exact, case-sensitive match).
    :return: The remaining words joined back with single spaces.
    """
    kept = [token for token in phrase.split(" ") if token not in list_stopwords]
    return ' '.join(kept)
    
def remove_punctuation(phrase):
    """
    Strip every character listed in ``string.punctuation`` from a phrase.

    :param phrase: String to clean, or a float NaN standing in for a missing value.
    :return: The phrase without punctuation; "" when phrase is a float NaN.
    """
    # Missing values show up as float NaN: map them to the empty string.
    if type(phrase) is float and math.isnan(phrase):
        return ""

    # The third argument of str.maketrans lists the characters to delete.
    # https://www.tutorialspoint.com/python/string_maketrans.htm
    return phrase.translate(str.maketrans('', '', string.punctuation))

def lemm_wordnet(phrase):
    """
    Lemmatize every whitespace-separated word of a phrase with WordNet.

    :param phrase: String to lemmatize. A float (how NaN / missing values
        appear) is accepted as well.
    :return: The lemmatized words joined with single spaces, or "" when
        phrase is a float.
    """
    lemmatizer = WordNetLemmatizer()

    # Missing values are floats (NaN) and cannot be split into words.
    if type(phrase) is float:
        return ""

    return ' '.join([lemmatizer.lemmatize(token) for token in phrase.split()])
    
def remove_duplicate(phrase):
    """
    Remove repeated words from a phrase, keeping the first occurrence of each.

    :param phrase: String split on whitespace. A float (how NaN / missing
        values appear) is accepted as well.
    :return: The unique words, in order of first appearance, joined with
        single spaces; "" when phrase is a float.
    """
    # Missing values are floats (NaN): nothing to deduplicate.
    if type(phrase) is float:
        return ''

    # dict keys are unique and keep insertion order.
    return ' '.join(dict.fromkeys(phrase.split()))
    
    
def all_lower_case(phrase):
    """
    Convert a phrase to lower case.

    :param phrase: String to convert. A float (how NaN / missing values
        appear) is returned untouched.
    :return: The lower-cased string, or the original float.
    """
    return phrase if type(phrase) is float else phrase.lower()
    
def stem_snowball(phrase):
    """
    Stem every word of a phrase with NLTK's English Snowball stemmer.

    Stopwords are not removed here: every token is stemmed and kept. The
    example call below this function shows the result coming back lower-cased.

    :param phrase: String. A phrase; it is split on single spaces.
    :return: String. The stemmed words joined with single spaces.
    """
    snowball = SnowballStemmer("english")

    # Stem token by token and rebuild the phrase.
    return ' '.join([snowball.stem(token) for token in phrase.split(" ")])

stem_snowball("What is the step by step guide to invest in share market in india?")



'what is the step by step guid to invest in share market in india?'

In [7]:
#Conta  o numero de caracteres em cada frases, considerando tbm os espacos em branco
data_train['len_q1'] = data_train.question1.apply(lambda x: len(str(x)))
data_train['len_q2'] = data_train.question2.apply(lambda x: len(str(x)))
#diferenca de caracteres da primeira questao com a segunda
data_train['diff_len'] = data_train.len_q1 - data_train.len_q2
#conta o numero de caracteres unicos em cada frase, ignorando os espacos em branco
data_train['len_char_q1'] = data_train.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data_train['len_char_q2'] = data_train.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
#Conta o numero de palavras em cada questao
data_train['len_word_q1'] = data_train.question1.apply(lambda x: len(str(x).split()))
data_train['len_word_q2'] = data_train.question2.apply(lambda x: len(str(x).split()))
#Numero de palavras em comum nas duas frases
data_train['common_words'] = data_train.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection\
                                                            (set(str(x['question2']).lower().split()))), axis=1)
# Proportion of words shared by the two questions: |q1 & q2| / |q1 | q2|,
# computed on lower-cased, punctuation-free words.
def _prop_common_words(row):
    """Return the shared-word proportion for one row, or 0.0 when neither question has any word."""
    words_q1 = set(remove_punctuation(row['question1']).lower().split())
    words_q2 = set(remove_punctuation(row['question2']).lower().split())
    all_words = words_q1 | words_q2
    # Both questions empty (e.g. NaN or punctuation only): avoid dividing by zero.
    if not all_words:
        return 0.0
    return len(words_q1 & words_q2) / len(all_words)

data_train['prop_common_words'] = data_train.apply(_prop_common_words, axis=1)

#https://github.com/seatgeek/fuzzywuzzy
# calcula distancias de Levenshtein para as duas questoes
data_train['fuzz_qratio'] = data_train.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
print('fuzz_qratio')

data_train['fuzz_WRatio'] = data_train.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
print('fuzz_WRatio')

data_train['fuzz_partial_ratio'] = data_train.apply(lambda x: fuzz.partial_ratio\
                                                    (str(x['question1']), str(x['question2'])), axis=1)
print('fuzz_partial_ratio')

data_train['fuzz_partial_token_set_ratio'] = data_train.apply(lambda x: fuzz.partial_token_set_ratio\
                                                              (str(x['question1']), str(x['question2'])), axis=1)
print('fuzz_partial_token_set_ratio')

data_train['fuzz_partial_token_sort_ratio'] = data_train.apply(lambda x: fuzz.partial_token_sort_ratio\
                                                               (str(x['question1']), str(x['question2'])), axis=1)
print('fuzz_partial_token_sort_ratio')

data_train['fuzz_token_set_ratio'] = data_train.apply(lambda x: fuzz.token_set_ratio\
                                                      (str(x['question1']), str(x['question2'])), axis=1)
print('fuzz_token_set_ratio')

data_train['fuzz_token_sort_ratio'] = data_train.apply(lambda x: fuzz.token_sort_ratio\
                                                       (str(x['question1']), str(x['question2'])), axis=1)
print('fuzz_token_sort_ratio')


fuzz_qratio
fuzz_WRatio
fuzz_partial_ratio
fuzz_partial_token_set_ratio
fuzz_partial_token_sort_ratio
fuzz_token_set_ratio
fuzz_token_sort_ratio


### Explicacao sobre as funcoes fuzzy

In [8]:
#https://github.com/seatgeek/fuzzywuzzy
#https://pypi.python.org/pypi/fuzzywuzzy
#http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/

In [9]:
data_train.groupby('is_duplicate').describe()#Verificando as estatisticas por grupo

Unnamed: 0_level_0,common_words,common_words,common_words,common_words,common_words,common_words,common_words,common_words,diff_len,diff_len,...,len_word_q2,len_word_q2,prop_common_words,prop_common_words,prop_common_words,prop_common_words,prop_common_words,prop_common_words,prop_common_words,prop_common_words
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
is_duplicate,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,255027.0,3.960922,3.199845,0.0,2.0,3.0,5.0,34.0,255027.0,-0.849134,...,14.0,237.0,255027.0,0.300786,0.239106,0.0,0.121212,0.230769,0.428571,1.0
1,149263.0,5.452436,2.666256,1.0,4.0,5.0,7.0,41.0,149263.0,-0.097586,...,11.0,60.0,149263.0,0.466475,0.206193,0.055556,0.307692,0.4375,0.6,1.0


### Salvando a base com as features

In [10]:
#Verificando os dados apos a insercao das features
data_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 19 columns):
question1                        404290 non-null object
question2                        404288 non-null object
is_duplicate                     404290 non-null int64
len_q1                           404290 non-null int64
len_q2                           404290 non-null int64
diff_len                         404290 non-null int64
len_char_q1                      404290 non-null int64
len_char_q2                      404290 non-null int64
len_word_q1                      404290 non-null int64
len_word_q2                      404290 non-null int64
common_words                     404290 non-null int64
prop_common_words                404290 non-null float64
fuzz_qratio                      404290 non-null int64
fuzz_WRatio                      404290 non-null int64
fuzz_partial_ratio               404290 non-null int64
fuzz_partial_token_set_ratio     404290 non-null int64
fuzz_

In [11]:
#Salvando pickle da base de dados com as features para nao precisar rodar novamente as features
with open(os.path.join(save, 'train_features.pkl'),'wb') as f:
    pickle.dump((data_train),f)

In [12]:
# Salvando o banco de treino com as features em csv
data_train.to_csv(os.path.join(save, 'train_features.csv'),index=False)

In [None]:
#Conta o numero total de palavras unica no banco de dados de treinamento
'''
lines_count = 0
a=set()
for item in range(0,404290):
    a =  a.union(
     set(remove_punctuation(data_train.question1[item]).\
    split()).union(set(remove_punctuation(data_train.question2[item]).split())))
    if item%50000==0:
        print(item)
print(len(a))
'''
#Resultado: 136153 palavras únicas

In [None]:
#Stemmers remove morphological affixes from words, leaving only the word stem.
#http://www.nltk.org/howto/stem.html
#The 'english' stemmer is better than the original 'porter' stemmer.
#example; stemmer.stem('likely', 'bites') - like, bite 
#http://www.nltk.org/api/nltk.tokenize.html
#A tokenizer that divides a string into substrings by splitting on the specified string (defined in subclasses).
#word_tokenize("It's only a test")- ['It', "'s", 'only', 'a', 'test']


### Limpando a base de dados para vetorizar e criar modelos

In [13]:
#Funcao que voce escolhe quais filtros deseja fazer no seu banco de dados, lematizar, stematizar, remover duplicadas
#remover pontuacoes, remover stopwords, remover na e pasar todas palavras para minusculo
def cleaning_tool(data, drop_na = True, lower_case = True, rm_duplicate = False, stopwords = False,
                  punctuation = False, lemma = False, stem = False, list_of_stopwords = None):
    """
    Clean the ``question1`` and ``question2`` columns of a data frame.

    The selected filters are applied to both columns in this fixed order:
    remove duplicate words, lower case, remove stopwords, remove punctuation,
    lemmatize, stem. The input frame is never modified; a new frame is returned.

    :param data: data frame with ``question1`` and ``question2`` columns.
    :param drop_na: If True drop every row containing NA, both before and after cleaning.
    :param lower_case: If True transform the questions to lower case.
    :param rm_duplicate: If True remove duplicated words inside each question.
    :param stopwords: If True remove the words in ``list_of_stopwords``.
    :param punctuation: If True remove punctuation.
    :param lemma: If True lemmatize each question.
    :param stem: If True stem each question.
    :param list_of_stopwords: List of stopwords to be used; it is passed
        straight to ``remove_stopwords``, so it must be given when
        ``stopwords`` is True.
    :return: New data frame with question1 and question2 processed according to the parameters.
    """
    # Always work on a new frame so the caller's data is left untouched
    # (dropna already returns a new object; otherwise copy explicitly).
    data = data.dropna(axis=0) if drop_na else data.copy()

    # Collect the requested filters, preserving the documented order.
    steps = []
    if rm_duplicate:
        steps.append(remove_duplicate)
    if lower_case:
        steps.append(all_lower_case)
    if stopwords:
        steps.append(lambda phrase: remove_stopwords(phrase, list_of_stopwords))
    if punctuation:
        steps.append(remove_punctuation)
    if lemma:
        steps.append(lemm_wordnet)
    # Test the `stem` flag, not the `stem_snowball` function object:
    # comparing the function to True is always False, so stemming never ran.
    if stem:
        steps.append(stem_snowball)

    # The two columns are independent, so each one goes through the whole pipeline.
    for column in ("question1", "question2"):
        for step in steps:
            data[column] = data[column].apply(step)

    # Dropped a second time in case a filter created a new NA.
    if drop_na:
        data = data.dropna(axis=0)

    return data

In [14]:
#um banco de dados limpo usando apenas lematizacao e um usando lematizacao e stemizacao
data_train_clean = cleaning_tool(data_train, stopwords=True, lemma=True,list_of_stopwords=mystopwords,punctuation=True)
data_train_clean2 = cleaning_tool(data_train, stopwords=True, lemma=True,list_of_stopwords=mystopwords,punctuation=True,stem=True)

In [15]:
#Salvando pickle dos dois bancos limpos
with open(os.path.join(save, 'data_train_clean_features.pkl'),'wb') as f:
    pickle.dump((data_train_clean),f)
with open(os.path.join(save, 'data_train_clean_features2.pkl'),'wb') as f:
    pickle.dump((data_train_clean2),f)

In [16]:
#salvando como csv
data_train_clean.to_csv(os.path.join(save, 'train_clean_features.csv'),index=False)
data_train_clean2.to_csv(os.path.join(save, 'train_clean_features2.csv'),index=False)

In [None]:
# Samples of the cleaned data used to validate the model.
# sample1 holds the first 300000 rows (positions 0..299999).
sample1=data_train_clean[0:300000]
# NOTE(review): sample2 is drawn at random from the WHOLE frame, so it will
# almost certainly share rows with teste1 - confirm this is intended.
sample2=data_train_clean.sample(300000)
# Hold-out set: everything after sample1. It starts at position 300000, not
# 300001, otherwise that row belongs to neither sample1 nor teste1.
teste1=data_train_clean[300000:]

In [None]:
#salvando amostras para validar o modelo
with open(os.path.join(save, 'datas_sample.pkl'),'wb') as f:
    pickle.dump((sample1,
                 sample2,
                 teste1),f)

### Vetorizando a base de dados

In [None]:
# Como o banco tem mais de 130 mil palavras escolhemos trabalhar com 10000

vectorizer_tf = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             ngram_range =(1,3), \
                             max_features = 10000) 


#### Vectorize_TF data_train

In [23]:
questions = data_train_clean.question1.append([data_train_clean.question2])

In [24]:
vector_fitt = vectorizer_tf.fit(questions)
data_train_clean_tf_question1 = vector_fitt.transform(data_train_clean.question1)
# Numpy arrays are easy to work with
#data_train_clean_tf_question1 = data_train_clean_tf_question1.toarray()

with open(os.path.join(save,'data_train_clean_tf_question1.pkl'),'wb') as f:
    pickle.dump(data_train_clean_tf_question1,f)
del(data_train_clean_tf_question1)

In [25]:
data_train_clean_tf_question2 = vector_fitt.transform(data_train_clean.question2)
# Numpy arrays are easy to work with
#data_train_clean_tf_question2 = data_train_clean_tf_question2.toarray()

with open(os.path.join(save, 'data_train_clean_tf_question2.pkl'),'wb') as f:
    pickle.dump(data_train_clean_tf_question2,f)
del(data_train_clean_tf_question2)
del(questions)

#### Vectorize_TF sample1

In [None]:
sample1_questions=sample1.question1.append([sample1.question2])

In [None]:
vector_fitt = vectorizer_tf.fit(sample1_questions)
sample1_tf_question1 = vector_fitt.transform(sample1.question1)
# Numpy arrays are easy to work with
#sample1_tf_question1 = sample1_tf_question1.toarray()

with open(os.path.join(save, 'sample1_tf_question1.pkl'),'wb') as f:
    pickle.dump(sample1_tf_question1,f)
del(sample1_tf_question1)

In [None]:
sample1_tf_question2 = vector_fitt.transform(sample1.question2)
#sample1_tf_question2 = sample1_tf_question2.toarray()

with open(os.path.join(save, 'sample1_tf_question2.pkl'),'wb') as f:
    pickle.dump(sample1_tf_question2,f)
del(sample1_tf_question2)
del(sample1_questions)

#### Vectorize_TF sample2

In [None]:
# The vocabulary for sample2 must be built from sample2's own two question
# columns (the original appended sample1.question2 here).
sample2_questions=sample2.question1.append([sample2.question2])

In [None]:
vector_fitt = vectorizer_tf.fit(sample2_questions)
sample2_tf_question1 = vector_fitt.transform(sample2.question1)
#sample2_tf_question1 = sample2_tf_question1.toarray()

with open(os.path.join(save, 'sample2_tf_question1.pkl'),'wb') as f:
    pickle.dump(sample2_tf_question1,f)
del(sample2_tf_question1)

In [None]:
sample2_tf_question2 = vector_fitt.transform(sample2.question2)
#sample2_tf_question2 = sample2_tf_question2.toarray()

with open(os.path.join(save, 'sample2_tf_question2.pkl'),'wb') as f:
    pickle.dump(sample2_tf_question2,f)
del(sample2_tf_question2)
del(sample2_questions)

#### Vectorize_TF teste1

In [None]:
teste1_questions=teste1.question1.append([teste1.question2])

In [None]:
vector_fitt = vectorizer_tf.fit(teste1_questions)
teste1_tf_question1 = vector_fitt.transform(teste1.question1)
#teste1_tf_question1 = teste1_tf_question1.toarray()

with open(os.path.join(save, 'teste1_tf_question1.pkl'),'wb') as f:
    pickle.dump(teste1_tf_question1,f)
del(teste1_tf_question1)

In [None]:
vector_fitt = vectorizer_tf.fit(teste1_questions)
teste1_tf_question2 = vector_fitt.transform(teste1.question2)
#teste1_tf_question2 = teste1_tf_question2.toarray()

with open(os.path.join(save, 'teste1_tf_question2.pkl'),'wb') as f:
    pickle.dump(teste1_tf_question2,f)
del(teste1_tf_question2)

#### Vectorize_TFIDF

In [26]:
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
#Another approach using TfIDf vectorizer and using the texts with stopwords in:
#https://github.com/zygmuntz/classifying-text/blob/master/bow_predict.py 
vectorizer_tfidf = TfidfVectorizer(analyzer='word', \
                                  preprocessor=None,\
                                  tokenizer=None,\
                                  stop_words=None,\
                                 ngram_range =(1,3), \
                                  max_features=10000)

#### Vectorize_TFIDF data_train

In [27]:
questions = data_train_clean.question1.append([data_train_clean.question2])

In [28]:
vector_fitt = vectorizer_tfidf.fit(questions)
data_train_clean_tfidf_question1 = vector_fitt.transform(data_train_clean.question1)
# Numpy arrays are easy to work with
#data_train_clean_tfidf_question1 = data_train_clean_tfidf_question1.toarray()

with open(os.path.join(save,'data_train_clean_tfidf_question1.pkl'),'wb') as f:
    pickle.dump(data_train_clean_tfidf_question1,f)
del(data_train_clean_tfidf_question1)

In [29]:
data_train_clean_tfidf_question2 = vector_fitt.transform(data_train_clean.question2)
 # Numpy arrays are easy to work with
#data_train_clean_tfidf_question2 = data_train_clean_tfidf_question2.toarray()

with open(os.path.join(save, 'data_train_clean_tfidf_question2.pkl'),'wb') as f:
    pickle.dump(data_train_clean_tfidf_question2,f)
del(data_train_clean_tfidf_question2)
del(questions)

#### Vectorize_tfidf sample1

In [None]:
sample1_questions=sample1.question1.append([sample1.question2])

In [None]:
vector_fitt = vectorizer_tfidf.fit(sample1_questions)
sample1_tfidf_question1 = vector_fitt.transform(sample1.question1)
# Numpy arrays are easy to work with
#sample1_tfidf_question1 = sample1_tfidf_question1.toarray()

with open(os.path.join(save, 'sample1_tfidf_question1.pkl'),'wb') as f:
    pickle.dump(sample1_tfidf_question1,f)
del(sample1_tfidf_question1)

In [None]:
sample1_tfidf_question2 = vector_fitt.transform(sample1.question2)
#sample1_tfidf_question2 = sample1_tfidf_question2.toarray()

with open(os.path.join(save, 'sample1_tfidf_question2.pkl'),'wb') as f:
    pickle.dump(sample1_tfidf_question2,f)
del(sample1_tfidf_question2)
del(sample1_questions)

#### Vectorize_tfidf sample2

In [None]:
# The TF-IDF vocabulary for sample2 must be built from sample2's own two
# question columns (the original appended sample1.question2 here).
sample2_questions=sample2.question1.append([sample2.question2])

In [None]:
vector_fitt = vectorizer_tfidf.fit(sample2_questions)
sample2_tfidf_question1 = vector_fitt.transform(sample2.question1)
#sample2_tfidf_question1 = sample2_tfidf_question1.toarray()

with open(os.path.join(save, 'sample2_tfidf_question1.pkl'),'wb') as f:
    pickle.dump(sample2_tfidf_question1,f)
del(sample2_tfidf_question1)

In [None]:
sample2_tfidf_question2 = vector_fitt.transform(sample2.question2)
#sample2_tfidf_question2 = sample2_tfidf_question2.toarray()

with open(os.path.join(save, 'sample2_tfidf_question2.pkl'),'wb') as f:
    pickle.dump(sample2_tfidf_question2,f)
del(sample2_tfidf_question2)
del(sample2_questions)

#### Vectorize_tfidf teste1

In [None]:
teste1_questions=teste1.question1.append([teste1.question2])

In [None]:
vector_fitt = vectorizer_tfidf.fit(teste1_questions)
teste1_tfidf_question1 = vector_fitt.transform(teste1.question1)
#teste1_tfidf_question1 = teste1_tfidf_question1.toarray()

with open(os.path.join(save, 'teste1_tfidf_question1.pkl'),'wb') as f:
    pickle.dump(teste1_tfidf_question1,f)
del(teste1_tfidf_question1)

In [None]:
vector_fitt = vectorizer_tfidf.fit(teste1_questions)
teste1_tfidf_question2 = vector_fitt.transform(teste1.question2)
#teste1_tfidf_question2 = teste1_tfidf_question2.toarray()

with open(os.path.join(save, 'teste1_tfidf_question2.pkl'),'wb') as f:
    pickle.dump(teste1_tfidf_question2,f)
del(teste1_tfidf_question2)

#### Abrindo os vetores das frases

In [None]:
with open(os.path.join(save, 'train_features.pkl'),'rb') as f:
    (train_data_features_tf) = pickle.load(f)

# NAO CONSEGUIMOS PASSAR PARA ARRAY OS VETORES DE PALAVRAS POIS DAVA ERRO DE MEMORIA!!!