# FGV EMAP

## Modelagem e mineração de dados


### Trabalho Kaggle Quora

#### Alunos: Antonio Sombra e Joao Marcos

##### https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words

### Reading packages

In [None]:

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

#strings
import string
import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from fuzzywuzzy import fuzz

#Extras
import os
import sys
import time
import re
import pickle
import logging

#basic
import numpy as np
import pandas as pd
import xgboost as xgb
import string
import math


# Visualisation
import pylab
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from matplotlib.font_manager import FontProperties
import seaborn as sns

# Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.cross_validation import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score as AUC
import gensim
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.models import Word2Vec

from bs4 import BeautifulSoup

%matplotlib inline

#### Load data

In [None]:
# Directory the Kaggle training CSV is read from.
datapath = "/Dados/Kaggle/"
# Presumably the location of pre-trained word2vec files; not used in the visible cells.
word2vec= "/Dados/Word2vec"
# Output files (pickle / CSV below) are written to the current working directory.
save=  os.getcwd()
data_train = pd.read_csv(os.path.join(datapath, 'train.csv'))
# Drop the identifier columns; only the question texts and the label are kept.
data_train=data_train.drop(['id','qid1','qid2'],axis=1)
# Show three random rows as a quick visual check.
data_train.sample(3)

In [None]:
# Print the frame summary, then display descriptive statistics.
data_train.info()
data_train.describe()

# A fraction of 0.369198 (about 36.9%) of the pairs is marked as duplicate
# (presumably the mean of is_duplicate reported by describe()).

In [None]:
# Show the remaining column names and the type of the loaded object.
print(data_train.columns)
type(data_train)

In [None]:
# Choosing the list of stopwords
# Start from NLTK's English stopword list.
mystopwords = nltk.corpus.stopwords.words('english')
# Interrogative words (and 'between') are taken out of the stopword list so
# that they survive stopword removal.
list_of_words= ['where','what','when','why','between','who','how','which']
for item in list_of_words:
        # list.remove raises ValueError if the word is not in the list.
        mystopwords.remove(item)

### Editing questions with NLTK package

In [None]:
def remove_stopwords(phrase, list_stopwords):
    """
    Drops every token of a phrase that appears in a collection of stopwords
    :param phrase: String. The text to filter; it is split on single spaces.
    :param list_stopwords: List. The words to discard.
    :return: String. The surviving tokens joined by single spaces
    """
    # Splitting on " " (not on arbitrary whitespace) keeps the empty tokens
    # produced by repeated spaces, so the original spacing is preserved.
    kept_tokens = [token for token in phrase.split(" ") if token not in list_stopwords]
    return ' '.join(kept_tokens)
    
# Translation table that deletes every character of string.punctuation; it is
# built once because it never changes between calls.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(phrase):
    """
    Receives a phrase and removes all punctuation from it
    :param phrase: String. A phrase. A missing value (None, or a NaN given as a
                   Python or NumPy float) is also accepted.
    :return: The same phrase without punctuation; "" for a missing value
    """
    #https://www.tutorialspoint.com/python/string_maketrans.htm
    # Missing questions reach this function as NaN (or None) instead of text.
    # isinstance also matches NumPy float scalars, which an exact
    # `type(phrase) is float` comparison would miss.
    if phrase is None:
        return ""
    if isinstance(phrase, (float, np.floating)) and math.isnan(phrase):
        return ""

    return phrase.translate(_PUNCTUATION_TABLE)

def lemm_wordnet(phrase):
    """
    Lemmatizes every whitespace-separated token of a phrase with WordNet
    :param phrase: String. A phrase. A float (how NA shows up) is treated as missing.
    :return: String. The lemmas joined by single spaces, or "" for a float
    """
    lemmatizer = WordNetLemmatizer()

    # NA is a float, and there is nothing to lemmatize in it.
    if type(phrase) is float:
        return ""

    return ' '.join(map(lemmatizer.lemmatize, phrase.split()))
    
def remove_duplicate(phrase):
    """
    Keeps only the first occurrence of each word of a phrase
    :param phrase: String. A phrase. A float (how NA shows up) yields "".
    :return: String. The distinct words, in order of first appearance,
             joined by single spaces
    """
    # NA is a float: no words to keep.
    if type(phrase) is float:
        return ''

    already_seen = set()
    unique_words = []
    for word in phrase.split():
        if word in already_seen:
            continue
        already_seen.add(word)
        unique_words.append(word)

    return ' '.join(unique_words)
    
    
def all_lower_case(phrase):
    """
    Converts a phrase to lower case
    :param phrase: String. A phrase. A float (how NA shows up) is returned as is.
    :return: The lower-cased phrase, or the untouched float
    """
    # NA is a float and has no .lower(); hand it back unchanged.
    return phrase if type(phrase) is float else phrase.lower()
    
def stem_snowball(phrase):
    """
    Stems every token of a phrase with NLTK's English Snowball stemmer
    :param phrase: String. A phrase; it is split on single spaces.
    :return: String. The stemmed tokens joined by single spaces
    """
    snowball = SnowballStemmer("english")
    return ' '.join(snowball.stem(token) for token in phrase.split(" "))

# Quick check of the stemmer on a sample question (displayed as the cell output).
stem_snowball("What is the step by step guide to invest in share market in india?")



In [None]:
def _jaccard_words(q1, q2):
    """
    Proportion of distinct words shared by two questions
    :param q1: String (or NA). First question.
    :param q2: String (or NA). Second question.
    :return: Float. Number of common words divided by the number of distinct
             words of both questions, after removing punctuation and
             lower-casing; 0.0 when neither question contains a word
    """
    words_q1 = set(remove_punctuation(q1).lower().split())
    words_q2 = set(remove_punctuation(q2).lower().split())
    all_words = words_q1 | words_q2
    # Two empty / NA / punctuation-only questions have no vocabulary at all:
    # report 0.0 instead of dividing by zero.
    if not all_words:
        return 0.0
    return len(words_q1 & words_q2) / len(all_words)


# Length features. Questions are passed through str(), so a missing question
# is measured as the text 'nan'.
data_train['len_q1'] = data_train.question1.apply(lambda x: len(str(x)))
data_train['len_q2'] = data_train.question2.apply(lambda x: len(str(x)))
data_train['diff_len'] = data_train.len_q1 - data_train.len_q2
# Number of distinct non-space characters.
data_train['len_char_q1'] = data_train.question1.apply(lambda x: len(set(str(x).replace(' ', ''))))
data_train['len_char_q2'] = data_train.question2.apply(lambda x: len(set(str(x).replace(' ', ''))))
# Number of whitespace-separated words.
data_train['len_word_q1'] = data_train.question1.apply(lambda x: len(str(x).split()))
data_train['len_word_q2'] = data_train.question2.apply(lambda x: len(str(x).split()))
# Distinct lower-cased words present in both questions (punctuation is kept here).
data_train['common_words'] = data_train.apply(
    lambda x: len(set(str(x['question1']).lower().split())
                  & set(str(x['question2']).lower().split())), axis=1)
data_train['prop_common_words'] = data_train.apply(
    lambda x: _jaccard_words(x['question1'], x['question2']), axis=1)


# fuzzywuzzy similarity scores: one column per scorer. The column name is
# printed once the column has been computed, as a progress indicator.
fuzz_features = [
    ('fuzz_qratio', fuzz.QRatio),
    ('fuzz_WRatio', fuzz.WRatio),
    ('fuzz_partial_ratio', fuzz.partial_ratio),
    ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
    ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
    ('fuzz_token_set_ratio', fuzz.token_set_ratio),
    ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
]
for feature_name, scorer in fuzz_features:
    data_train[feature_name] = data_train.apply(
        lambda x: scorer(str(x['question1']), str(x['question2'])), axis=1)
    print(feature_name)


In [None]:
#https://github.com/seatgeek/fuzzywuzzy
#https://pypi.python.org/pypi/fuzzywuzzy
#http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/

In [None]:
data_train.groupby('is_duplicate').describe()

### Depois apagar as # para não ficar como comentário

### Saving data base of features

In [None]:
# Persist the feature table as a pickle file in the working directory
pickle_path = os.path.join(save, 'train_features.pkl')
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(data_train, pickle_file)

In [None]:
# Write the feature table as CSV (without the index) and show the target folder
csv_path = os.path.join(save, 'train_features_2.csv')
data_train.to_csv(csv_path, index=False)
print(save)



In [None]:



#data_train_features = pd.read_csv(os.path.join(save, 'train_features.csv'))

In [None]:
# Disabled snippet, kept as a string literal so it is not executed: it builds
# the set of distinct punctuation-free words over question1 and question2 and
# prints its size.
'''
lines_count = 0
a=set()
for item in range(0,404290):
    a =  a.union(
     set(remove_punctuation(data_train.question1[item]).\
    split()).union(set(remove_punctuation(data_train.question2[item]).split())))
    if item%50000==0:
        print(item)
print(len(a))
'''
#Result: 136153 unique words

In [None]:
#Stemmers remove morphological affixes from words, leaving only the word stem.
#http://www.nltk.org/howto/stem.html
#The 'english' stemmer is better than the original 'porter' stemmer.
#example; stemmer.stem('likely', 'bites') - like, bite 
#http://www.nltk.org/api/nltk.tokenize.html
#A tokenizer that divides a string into substrings by splitting on the specified string (defined in subclasses).
#word_tokenize("It's only a test")- ['It', "'s", 'only', 'a', 'test']


### Editing questions with NLTK package

In [None]:
# cleaning_tool lets you easily choose which functions are used to clean the text
def _apply_to_questions(data, func):
    """Replace both question columns of ``data`` by ``func`` applied to each value."""
    for column in ("question1", "question2"):
        data[column] = data[column].apply(func)


def cleaning_tool(data, drop_na = True, lower_case = True, rm_duplicate = False, stopwords = False,
                  punctuation = False, lemm = False, stem = False, list_of_stopwords = None):
    """
    Function to process all data by calling the cleaning functions defined above,
    according to what was chosen. The steps run in this fixed order: drop NA,
    remove duplicate words, lower case, stopwords, punctuation, lemmatize,
    stem, drop NA again. The data frame passed in is left untouched.
    :param data: data frame with the columns "question1" and "question2".
    :param drop_na: If True drop all lines of data frame with NA
    :param lower_case: If True transform for lower case
    :param rm_duplicate: If True remove all duplicate words in questions
    :param stopwords: If True removes stopwords
    :param punctuation: If True removes punctuation
    :param lemm: If True returns the phrase lemmatized
    :param stem: If True returns the phrase stemmed
    :param list_of_stopwords: List of stopwords to be used (required when stopwords is True)
    :return: A new data frame with question1 and question2 processed according to parameters
    :raises TypeError: If stopwords is True and list_of_stopwords is None
    """
    # Always work on a new frame so the caller's data is never modified:
    # dropna() returns a new object, otherwise an explicit copy is taken.
    # The axis is given by keyword because recent pandas versions no longer
    # accept it positionally.
    data = data.dropna(axis=0) if drop_na else data.copy()

    if rm_duplicate:
        _apply_to_questions(data, remove_duplicate)

    if lower_case:
        _apply_to_questions(data, all_lower_case)

    if stopwords:
        if list_of_stopwords is None:
            raise TypeError("list_of_stopwords must be provided when stopwords=True")
        # Membership tests against a set are constant-time, which matters
        # because every word of every question is looked up.
        stopword_set = frozenset(list_of_stopwords)
        _apply_to_questions(data, lambda x: remove_stopwords(x, stopword_set))

    if punctuation:
        _apply_to_questions(data, remove_punctuation)

    # The flags to test are `lemm` and `stem`; the similarly named helper
    # functions are only what gets applied.
    if lemm:
        _apply_to_questions(data, lemm_wordnet)

    if stem:
        _apply_to_questions(data, stem_snowball)

    #We used it two times if some function create a new NA.
    if drop_na:
        data = data.dropna(axis=0)

    return data

In [None]:

# Clean the training questions: besides the defaults (drop NA, lower case),
# stopword removal with mystopwords, punctuation removal and lemmatization
# are requested.
data_train_clean = cleaning_tool(data_train, stopwords=True, lemm=True,list_of_stopwords=mystopwords,punctuation=True)

In [None]:
# Ask about the difference between split and word_tokenize
# Check http://www.nltk.org/api/nltk.stem.html (LEMMATIZE)

In [None]:
#Plain Word Counts
# Split features and labels into train and test parts (20% test, fixed seed).
# NOTE(review): `model_selection`, `train_data_features_tf` and `train` are not
# defined or imported in the visible cells (only `train_test_split` itself is
# imported), so this cell fails with NameError as written. It looks carried
# over from another notebook; confirm the intended features and label column.
X_traincv_tf, X_testcv_tf, y_traincv_tf, y_testcv_tf = model_selection.train_test_split(train_data_features_tf,
                                                                                        train["sentiment"],
                                                                                        test_size=0.2,
                                                                                        random_state=0)

In [None]:
# Word-level count vectorizer limited to 5000 features; no custom tokenizer,
# preprocessor or stop-word list is supplied.
vectorizer_tf = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)


In [None]:
# Fit the vectorizer on the cleaned question1 column only and transform it
# into a document-term count matrix.
data_train_clean_tf = vectorizer_tf.fit_transform(data_train_clean.question1)
#data_train_clean_tf = data_train_clean_tf.toarray() # Numpy arrays are easy to work with
#print(data_train_clean_tf.shape)
# Convert the result to a dense NumPy array.
# NOTE(review): one row per question times up to max_features columns may not
# fit in memory for the full training set; confirm this is feasible.
data_train_clean_tf = data_train_clean_tf.toarray()


In [None]:
# Display the count matrix computed in the previous cell.
data_train_clean_tf

In [None]:
  # NOTE(review): `data` is not defined at notebook level in the visible cells
  # (it is only a parameter of cleaning_tool); confirm what should be printed.
  print(data)