# FGV EMAP

## Modelagem e mineração de dados


### Trabalho Kaggle Quora

#### Alunos: Antonio Sombra e Joao Marcos

##### https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words

### Reading packages

In [1]:

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

#strings
import string
import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from fuzzywuzzy import fuzz

#Extras
import os
import sys
import time
import re
import pickle
import logging

#basic
import numpy as np
import pandas as pd
import xgboost as xgb
import string
import math


# Visualisation
import pylab
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from matplotlib.font_manager import FontProperties
import seaborn as sns

# Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.cross_validation import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score as AUC
import gensim
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.models import Word2Vec

from bs4 import BeautifulSoup

%matplotlib inline



#### Load data

In [2]:
datapath = "/Dados/Kaggle/"
word2vec= "/Dados/Word2vec"
save=  "/home/joaomarcosest/Kaggle_Quora/Dados"
data_train = pd.read_csv(os.path.join(datapath, 'train.csv'))
data_train=data_train.drop(['id','qid1','qid2'],axis=1)
data_train.sample(3)

Unnamed: 0,question1,question2,is_duplicate
303503,What are some of the interesting encounters wi...,"How is E & I at BITS Pilani, Pilani campus?",0
157911,How high should one jump from to commit suicide?,How high can a cow jump?,0
338574,Do employees at Strategic Hotel & Resorts have...,Do employees at Xenia Hotels & Resorts have a ...,0


In [3]:
data_train.info()
data_train.describe()

#0.369198 percente is duplicate

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 3 columns):
question1       404290 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(1), object(2)
memory usage: 9.3+ MB


Unnamed: 0,is_duplicate
count,404290.0
mean,0.369198
std,0.482588
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [4]:
print(data_train.columns)
type(data_train)

Index(['question1', 'question2', 'is_duplicate'], dtype='object')


pandas.core.frame.DataFrame

In [5]:
# Choosing the list of stopwords
mystopwords = nltk.corpus.stopwords.words('english')
list_of_words= ['where','what','when','why','between','who','how','which']
for item in list_of_words:
        mystopwords.remove(item)

### Functions tools

In [6]:
def remove_stopwords(phrase,list_stopwords):
    """
    Receives a phrase and removes all stopwords from a list
    :param phrase: String. A phrase.
    :param list_stopwords: List. A list of stopwords
    :return: The same phrase without stopwords
    """
    final_phrase = []
    words = phrase.split(" ")
    for word in words:
        if word not in list_stopwords:
            final_phrase.append((word))
    
    final_phrase = ' '.join(final_phrase)
    
    return final_phrase
    
def remove_punctuation(phrase):
    """
    Receives a phrase and removes all punctuation from it
    :param phrase: String. A phrase.
    :return: The same phrase without punctuation
    """
    #https://www.tutorialspoint.com/python/string_maketrans.htm
    #Check if NA
    if type(phrase) is float:
        if math.isnan(phrase):
            return ("")
    
    translator = str.maketrans('', '', string.punctuation)
    phrase = phrase.translate(translator) #removing punctuation
        
    return phrase

def lemm_wordnet(phrase):
    """
    Receives a phrase and removes lemmatizes it
    :param phrase: String. A phrase.
    :return: The same phrase in lemmas
    """
    lemm = WordNetLemmatizer()
    
    #NA is a float type, so this if is to avoid conflict
    if type(phrase) is not float:
        phrase = [lemm.lemmatize(i) for i in phrase.split()]
        phrase = ' '.join(phrase)
    else:
        return ""
    return phrase
    
def remove_duplicate(phrase):
    """
    Receives a phrase and removes all duplicate words
    :param phrase: String. A phrase.
    :return: The same phrase with just unique words
    """
    aux_phrase = []
        
    if type(phrase) is not float:
        
        for i in phrase.split():
            
            if i not in aux_phrase:
                aux_phrase.append(i)
    
    phrase = ' '.join(aux_phrase)
    
    return phrase
    
    
def all_lower_case(phrase):    
    """
    Receives a phrase and makes it lower case
    :param phrase: String. A phrase.
    :return: The same phrase in lower case
    """
    if type(phrase) is not float:
            phrase = phrase.lower()
    return phrase
    
def stem_snowball(phrase):
    """
    Receives a phrase and returns the same phrase stemmed, lowercase phrase without stopwords
    :param phrase: String. A phrase.
    :return: String. Stemmed, lowercase phrase without stopwords
    """
    stemmer = SnowballStemmer("english")
    
    #Stem words according to stemmer
    final_phrase = []
    words = phrase.split(" ")
    for word in words:
        final_phrase.append((stemmer.stem(word)))
    
    final_phrase = ' '.join(final_phrase)
    
    return final_phrase

stem_snowball("What is the step by step guide to invest in share market in india?")



'what is the step by step guid to invest in share market in india?'

In [7]:
data_train['len_q1'] = data_train.question1.apply(lambda x: len(str(x)))
data_train['len_q2'] = data_train.question2.apply(lambda x: len(str(x)))
data_train['diff_len'] = data_train.len_q1 - data_train.len_q2
data_train['len_char_q1'] = data_train.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data_train['len_char_q2'] = data_train.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data_train['len_word_q1'] = data_train.question1.apply(lambda x: len(str(x).split()))
data_train['len_word_q2'] = data_train.question2.apply(lambda x: len(str(x).split()))
data_train['common_words'] = data_train.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection\
                                                            (set(str(x['question2']).lower().split()))), axis=1)

data_train['prop_common_words'] = data_train.apply(lambda x: \
                                                   len(set(remove_punctuation(x['question1']).lower().split()).intersection\
                                                       (set(remove_punctuation(x['question2']).lower().split()))) / \
                                                   len(set(remove_punctuation(x['question1']).lower().split()).union\
                                                       (set(remove_punctuation(x['question2']).lower().split()))),axis=1 )


data_train['fuzz_qratio'] = data_train.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
print('fuzz_qratio')

data_train['fuzz_WRatio'] = data_train.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
print('fuzz_WRatio')

data_train['fuzz_partial_ratio'] = data_train.apply(lambda x: fuzz.partial_ratio\
                                                    (str(x['question1']), str(x['question2'])), axis=1)
print('fuzz_partial_ratio')

data_train['fuzz_partial_token_set_ratio'] = data_train.apply(lambda x: fuzz.partial_token_set_ratio\
                                                              (str(x['question1']), str(x['question2'])), axis=1)
print('fuzz_partial_token_set_ratio')

data_train['fuzz_partial_token_sort_ratio'] = data_train.apply(lambda x: fuzz.partial_token_sort_ratio\
                                                               (str(x['question1']), str(x['question2'])), axis=1)
print('fuzz_partial_token_sort_ratio')

data_train['fuzz_token_set_ratio'] = data_train.apply(lambda x: fuzz.token_set_ratio\
                                                      (str(x['question1']), str(x['question2'])), axis=1)
print('fuzz_token_set_ratio')

data_train['fuzz_token_sort_ratio'] = data_train.apply(lambda x: fuzz.token_sort_ratio\
                                                       (str(x['question1']), str(x['question2'])), axis=1)
print('fuzz_token_sort_ratio')


fuzz_qratio
fuzz_WRatio
fuzz_partial_ratio
fuzz_partial_token_set_ratio
fuzz_partial_token_sort_ratio
fuzz_token_set_ratio
fuzz_token_sort_ratio


In [8]:
#https://github.com/seatgeek/fuzzywuzzy
#https://pypi.python.org/pypi/fuzzywuzzy
#http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/

In [9]:
data_train.groupby('is_duplicate').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,common_words,diff_len,fuzz_WRatio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_qratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,len_char_q1,len_char_q2,len_q1,len_q2,len_word_q1,len_word_q2,prop_common_words
is_duplicate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,count,255027.0,255027.0,255027.0,255027.0,255027.0,255027.0,255027.0,255027.0,255027.0,255027.0,255027.0,255027.0,255027.0,255027.0,255027.0,255027.0
0,mean,3.960922,-0.849134,73.680136,60.334274,96.33427,63.232207,56.742929,67.718489,59.24606,20.679308,20.656158,63.455403,64.304536,11.582828,11.95573,0.300786
0,std,3.199845,38.196508,16.832006,16.921124,14.094409,14.818205,18.296754,18.815512,16.806466,4.377631,4.584406,32.584157,38.116989,5.955508,7.162012,0.239106
0,min,0.0,-1080.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
0,25%,2.0,-14.0,60.0,46.0,100.0,52.0,42.0,54.0,47.0,18.0,18.0,41.0,40.0,8.0,8.0,0.121212
0,50%,3.0,0.0,83.0,57.0,100.0,61.0,52.0,67.0,57.0,20.0,20.0,55.0,53.0,10.0,10.0,0.230769
0,75%,5.0,14.0,86.0,73.0,100.0,74.0,70.0,83.0,71.0,23.0,23.0,78.0,78.0,14.0,14.0,0.428571
0,max,34.0,487.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,52.0,55.0,623.0,1169.0,125.0,237.0,1.0
1,count,149263.0,149263.0,149263.0,149263.0,149263.0,149263.0,149263.0,149263.0,149263.0,149263.0,149263.0,149263.0,149263.0,149263.0,149263.0,149263.0
1,mean,5.452436,-0.097586,81.319282,72.802382,100.0,74.420808,70.850197,82.662227,72.28145,19.45107,19.44701,52.841347,52.938933,9.847665,9.859999,0.466475


### Saving data base of features

In [10]:
data_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 19 columns):
question1                        404290 non-null object
question2                        404288 non-null object
is_duplicate                     404290 non-null int64
len_q1                           404290 non-null int64
len_q2                           404290 non-null int64
diff_len                         404290 non-null int64
len_char_q1                      404290 non-null int64
len_char_q2                      404290 non-null int64
len_word_q1                      404290 non-null int64
len_word_q2                      404290 non-null int64
common_words                     404290 non-null int64
prop_common_words                404290 non-null float64
fuzz_qratio                      404290 non-null int64
fuzz_WRatio                      404290 non-null int64
fuzz_partial_ratio               404290 non-null int64
fuzz_partial_token_set_ratio     404290 non-null int64
fuzz_

In [11]:
#Saving Pickle
with open(os.path.join(save, 'train_features.pkl'),'wb') as f:
    pickle.dump((data_train),f)

In [12]:
# save features as CSV
data_train.to_csv(os.path.join(save, 'train_features.csv'),index=False)

In [13]:
#Count the number of words in data
'''
lines_count = 0
a=set()
for item in range(0,404290):
    a =  a.union(
     set(remove_punctuation(data_train.question1[item]).\
    split()).union(set(remove_punctuation(data_train.question2[item]).split())))
    if item%50000==0:
        print(item)
print(len(a))
'''
#Resultado 136153 palavras única

'\nlines_count = 0\na=set()\nfor item in range(0,404290):\n    a =  a.union(\n     set(remove_punctuation(data_train.question1[item]).    split()).union(set(remove_punctuation(data_train.question2[item]).split())))\n    if item%50000==0:\n        print(item)\nprint(len(a))\n'

In [14]:
#Stemmers remove morphological affixes from words, leaving only the word stem.
#http://www.nltk.org/howto/stem.html
#The 'english' stemmer is better than the original 'porter' stemmer.
#example; stemmer.stem('likely', 'bites') - like, bite 
#http://www.nltk.org/api/nltk.tokenize.html
#A tokenizer that divides a string into substrings by splitting on the specified string (defined in subclasses).
#word_tokenize("It's only a test")- ['It', "'s", 'only', 'a', 'test']


### Clearing data base

In [15]:
#cleaning tool is used so you can easily choose which functions you want to use to clean te text
def cleaning_tool(data, drop_na = True, lower_case = True, rm_duplicate = False, stopwords = False, 
                  punctuation = False, lemm = False, stem = False, list_of_stopwords = None):
    """
    Function to process all data using calling functions from above, according to what was chosen.
    :param data: data frame.
    :param drop_na: If True drop all lines of data frame with NA
    :param lower_case: If True transform for lower case
    :param rm_duplicate: If True remove all duplicate words in questions
    :param stopwords: If True removes stopwords
    :param punctuation: If True removes punctuation
    :param lemm: If True returns the phrase lemmatized
    :param stem: If True returns the phrase stemmed
    :param list_of_stopwords: List of stopwords to be used
    :return: Question1 and Question2 processed according to parameters
    """
    if drop_na == True:
        data = data.dropna(0)
    
    if rm_duplicate == True:
        data["question1"] = data["question1"].apply(lambda x: remove_duplicate(x))
        data["question2"] = data["question2"].apply(lambda x: remove_duplicate(x))
    
    if lower_case == True:
        data["question1"] = data["question1"].apply(lambda x: all_lower_case(x))
        data["question2"] = data["question2"].apply(lambda x: all_lower_case(x))
    
    if stopwords == True:
        data["question1"] = data["question1"].apply(lambda x: remove_stopwords(x, list_of_stopwords))
        data["question2"] = data["question2"].apply(lambda x: remove_stopwords(x, list_of_stopwords))
       
    if punctuation == True:
        data["question1"] = data["question1"].apply(lambda x: remove_punctuation(x))
        data["question2"] = data["question2"].apply(lambda x: remove_punctuation(x))
        
    if lemm_wordnet == True:
        data["question1"] = data["question1"].apply(lambda x: lemm_wordnet(x))
        data["question2"] = data["question2"].apply(lambda x: lemm_wordnet(x))
        
    if stem_snowball == True:
        data["question1"] = data["question1"].apply(lambda x: stem_snowball(x))
        data["question2"] = data["question2"].apply(lambda x: stem_snowball(x))
    
    #We used it two times if some function create a new NA.
    if drop_na == True:
        data = data.dropna(0)    
        
    return data

In [16]:

data_train_clean = cleaning_tool(data_train, stopwords=True, lemm=True,list_of_stopwords=mystopwords,punctuation=True)

In [17]:
#Saving Pickle
with open(os.path.join(save, 'data_train_clean_features.pkl'),'wb') as f:
    pickle.dump((data_train_clean),f)

In [18]:
'''
#Plain Word Counts
#X_traincv_tf, X_testcv_tf, y_traincv_tf, y_testcv_tf = model_selection.train_test_split(train_data_features_tf,
                                                                                        train["sentiment"],
                                                                                        test_size=0.2,
                                                                                        random_state=0)
'''


'\n#Plain Word Counts\n#X_traincv_tf, X_testcv_tf, y_traincv_tf, y_testcv_tf = model_selection.train_test_split(train_data_features_tf,\n                                                                                        train["sentiment"],\n                                                                                        test_size=0.2,\n                                                                                        random_state=0)\n'

In [19]:
sample1=data_train_clean[0:300000]
sample2=data_train_clean.sample(300000)
teste1=data_train_clean[300001:]
#teste2=data_train_clean.sample(104287)

In [20]:
#teste1.sample(2)
#teste2.sample(2)
#sample1.sample(2)
#sample2.sample(2)

with open(os.path.join(save, 'datas_sample.pkl'),'wb') as f:
    pickle.dump((sample1,
                 sample2,
                 teste1),f)

In [21]:
vectorizer_tf = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 10000) 


#### Vectorize_TF data_train

In [22]:
questions = data_train_clean.question1.append([data_train_clean.question2])

In [23]:
vector_fitt = vectorizer_tf.fit(questions)
data_train_clean_tf_question1 = vector_fitt.transform(data_train_clean.question1)
# Numpy arrays are easy to work with
data_train_clean_tf_question1 = data_train_clean_tf_question1.toarray()

with open(os.path.join(save,'data_train_clean_tf_question1.pkl'),'wb') as f:
    pickle.dump(data_train_clean_tf_question1,f)
del(data_train_clean_tf_question1)

MemoryError: 

In [None]:
data_train_clean_tf_question2 = vector_fitt.transform(data_train_clean.question2)
 # Numpy arrays are easy to work with
data_train_clean_tf_question2 = data_train_clean_tf_question2.toarray()

with open(os.path.join(save, 'data_train_clean_tf_question2.pkl'),'wb') as f:
    pickle.dump(data_train_clean_tf_question2,f)
del(data_train_clean_tf_question2)
del(questions)

#### Vectorize_TF sample1

In [None]:
sample1_questions=sample1.question1.append([sample1.question2])

In [None]:
vector_fitt = vectorizer_tf.fit(sample1_questions)
sample1_tf_question1 = vector_fitt.transform(sample1.question1)
# Numpy arrays are easy to work with
sample1_tf_question1 = sample1_tf_question1.toarray()

with open(os.path.join(save, 'sample1_tf_question1.pkl'),'wb') as f:
    pickle.dump(sample1_tf_question1,f)
del(sample1_tf_question1)

In [None]:
sample1_tf_question2 = vector_fitt.transform(sample1.question2)
sample1_tf_question2 = sample1_tf_question2.toarray()

with open(os.path.join(save, 'sample1_tf_question2.pkl'),'wb') as f:
    pickle.dump(sample1_tf_question2,f)
del(sample1_tf_question2)
del(sample1_questions)

#### Vectorize_TF sample2

In [None]:
sample2_questions=sample2.question1.append([sample1.question2])

In [None]:
vector_fitt = vectorizer_tf.fit(sample2_questions)
sample2_tf_question1 = vector_fitt.transform(sample2.question1)
sample2_tf_question1 = sample2_tf_question1.toarray()

with open(os.path.join(save, 'sample2_tf_question1.pkl'),'wb') as f:
    pickle.dump(sample2_tf_question1,f)
del(sample2_tf_question1)

In [None]:
sample2_tf_question2 = vector_fitt.transform(sample2.question2)
sample2_tf_question2 = sample2_tf_question2.toarray()

with open(os.path.join(save, 'sample2_tf_question2.pkl'),'wb') as f:
    pickle.dump(sample2_tf_question2,f)
del(sample2_tf_question2)
del(sample2_questions)

#### Vectorize_TF teste1

In [None]:
teste1_questions=teste1.question1.append([teste1.question2])

In [None]:
vector_fitt = vectorizer_tf.fit(teste1_questions)
teste1_tf_question1 = vector_fitt.transform(teste1.question1)
teste1_tf_question1 = teste1_tf_question1.toarray()

with open(os.path.join(save, 'teste1_tf_question1.pkl'),'wb') as f:
    pickle.dump(teste1_tf_question1,f)
del(teste1_tf_question1)

In [None]:
vector_fitt = vectorizer_tf.fit(teste1_questions)
teste1_tf_question2 = vector_fitt.transform(teste1.question2)
teste1_tf_question2 = teste1_tf_question2.toarray()

with open(os.path.join(save, 'teste1_tf_question2.pkl'),'wb') as f:
    pickle.dump(teste1_tf_question2,f)
del(teste1_tf_question2)

#### Vectorize_TFIDF

In [None]:
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
#Another approach using TfIDf vectorizer and using the texts with stopwords in:
#https://github.com/zygmuntz/classifying-text/blob/master/bow_predict.py 
vectorizer_tfidf = TfidfVectorizer(analyzer='word', \
                                  preprocessor=None,\
                                  tokenizer=None,\
                                  stop_words=None,\
                                  max_features=10000)

#### Vectorize_TFIDF data_train

In [None]:
questions = data_train_clean.question1.append([data_train_clean.question2])

In [None]:
vector_fitt = vectorizer_tfidf.fit(questions)
data_train_clean_tfidf_question1 = vector_fitt.transform(data_train_clean.question1)
# Numpy arrays are easy to work with
data_train_clean_tfidf_question1 = data_train_clean_tfidf_question1.toarray()

with open(os.path.join(save,'data_train_clean_tfidf_question1.pkl'),'wb') as f:
    pickle.dump(data_train_clean_tfidf_question1,f)
del(data_train_clean_tfidf_question1)

In [None]:
data_train_clean_tfidf_question2 = vector_fitt.transform(data_train_clean.question2)
 # Numpy arrays are easy to work with
data_train_clean_tfidf_question2 = data_train_clean_tfidf_question2.toarray()

with open(os.path.join(save, 'data_train_clean_tfidf_question2.pkl'),'wb') as f:
    pickle.dump(data_train_clean_tfidf_question2,f)
del(data_train_clean_tfidf_question2)
del(questions)

#### Vectorize_tfidf sample1

In [None]:
sample1_questions=sample1.question1.append([sample1.question2])

In [None]:
vector_fitt = vectorizer_tfidf.fit(sample1_questions)
sample1_tfidf_question1 = vector_fitt.transform(sample1.question1)
# Numpy arrays are easy to work with
sample1_tfidf_question1 = sample1_tfidf_question1.toarray()

with open(os.path.join(save, 'sample1_tfidf_question1.pkl'),'wb') as f:
    pickle.dump(sample1_tfidf_question1,f)
del(sample1_tfidf_question1)

In [None]:
sample1_tfidf_question2 = vector_fitt.transform(sample1.question2)
sample1_tfidf_question2 = sample1_tfidf_question2.toarray()

with open(os.path.join(save, 'sample1_tfidf_question2.pkl'),'wb') as f:
    pickle.dump(sample1_tfidf_question2,f)
del(sample1_tfidf_question2)
del(sample1_questions)

#### Vectorize_tfidf sample2

In [None]:
sample2_questions=sample2.question1.append([sample1.question2])

In [None]:
vector_fitt = vectorizer_tfidf.fit(sample2_questions)
sample2_tfidf_question1 = vector_fitt.transform(sample2.question1)
sample2_tfidf_question1 = sample2_tfidf_question1.toarray()

with open(os.path.join(save, 'sample2_tfidf_question1.pkl'),'wb') as f:
    pickle.dump(sample2_tfidf_question1,f)
del(sample2_tfidf_question1)

In [None]:
sample2_tfidf_question2 = vector_fitt.transform(sample2.question2)
sample2_tfidf_question2 = sample2_tfidf_question2.toarray()

with open(os.path.join(save, 'sample2_tfidf_question2.pkl'),'wb') as f:
    pickle.dump(sample2_tfidf_question2,f)
del(sample2_tfidf_question2)
del(sample2_questions)

#### Vectorize_tfidf teste1

In [None]:
teste1_questions=teste1.question1.append([teste1.question2])

In [None]:
vector_fitt = vectorizer_tfidf.fit(teste1_questions)
teste1_tfidf_question1 = vector_fitt.transform(teste1.question1)
teste1_tfidf_question1 = teste1_tfidf_question1.toarray()

with open(os.path.join(save, 'teste1_tfidf_question1.pkl'),'wb') as f:
    pickle.dump(teste1_tfidf_question1,f)
del(teste1_tfidf_question1)

In [None]:
vector_fitt = vectorizer_tfidf.fit(teste1_questions)
teste1_tfidf_question2 = vector_fitt.transform(teste1.question2)
teste1_tfidf_question2 = teste1_tfidf_question2.toarray()

with open(os.path.join(save, 'teste1_tfidf_question2.pkl'),'wb') as f:
    pickle.dump(teste1_tfidf_question2,f)
del(teste1_tfidf_question2)

In [None]:
'''with open(os.path.join(outputs, 'data_train_features_vector2.pkl'),'rb') as f:
    (train_data_features_tf, 
    test_data_features_tf,
    train_data_features_tfidf,
    test_data_features_tfidf) = pickle.load(f)'''