In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
import os.path
import pandas as pd
import itertools
from nltk.corpus import stopwords
import numpy as np
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import WhitespaceTokenizer 
from nltk.corpus import wordnet
import unicodedata
from nltk.tokenize.moses import MosesDetokenizer,MosesTokenizer
from nltk.tokenize import TweetTokenizer
# Module-level Moses tokenizer / detokenizer instances, shared by the helper
# functions that reference the names `tknzr` and `detokenizer`.
tknzr =MosesTokenizer()
detokenizer = MosesDetokenizer()
      

def flatten(tokens):
    """Flatten one level of nesting in a list of tokens.

    Each element of ``tokens`` is either a single string token or an iterable
    of tokens; string tokens are kept whole and iterables are spliced in place.

    Args:
        tokens: iterable whose items are strings or iterables of tokens.

    Returns:
        A new flat list; the input is not modified.
    """
    # Every string flavour must be treated as an atomic token. Checking only
    # `str` lets unicode (Python 2) or bytes (Python 3) tokens be iterated
    # and exploded into single characters / integers.
    try:
        atomic_types = (basestring, bytes)  # Python 2: covers str and unicode
    except NameError:
        atomic_types = (str, bytes)  # Python 3
    wrapped = [[item] if isinstance(item, atomic_types) else item for item in tokens]
    return list(itertools.chain.from_iterable(wrapped))

def isEnglish(s):
    """Return True when ``s`` survives a UTF-8 encode / ASCII decode round trip.

    In practice this reports whether the text is pure ASCII; a
    UnicodeDecodeError raised during the round trip yields False.
    """
    try:
        s.encode(encoding='utf-8').decode('ascii')
        return True
    except UnicodeDecodeError:
        return False

# English stop-word list from NLTK, stored as a set and built once at cell
# execution time; removeStopWords tests tokens for membership in it.
stopWords = set(stopwords.words('english'))
def removeStopWords(words):
    """Return a new list with every token found in ``stopWords`` dropped.

    The comparison is an exact match against the module-level ``stopWords``
    set; token order is preserved.
    """
    return [token for token in words if token not in stopWords]

# Re-tokenise with Moses so punctuation is split from the words it touches.

def token_punct_comma(tokens):
    """Join ``tokens`` into one string and re-tokenise it with the Moses tokenizer.

    Args:
        tokens: list of string tokens.

    Returns:
        The token list produced by ``tknzr.tokenize`` on the detokenized text.
    """
    # The tokenizer expects a single string, so ask the detokenizer for one
    # (return_str=True), exactly as sentence_tokenise does. Without it a list
    # would be handed to the tokenizer.
    text = detokenizer.detokenize(tokens, return_str=True)
    return tknzr.tokenize(text)

def convert_sentence(sentence):
    """Strip accents / non-ASCII characters, lowercase, and split on whitespace.

    The sentence is NFD-normalised and encoded to ASCII with errors ignored,
    so accented letters lose their marks and other non-ASCII characters are
    dropped. The result is lowercased and tokenised with NLTK's
    WhitespaceTokenizer.

    Args:
        sentence: a byte string (decoded as UTF-8) or a unicode string.

    Returns:
        List of lowercase whitespace-separated tokens.

    If normalisation fails, an error banner and the offending sentence are
    printed and the sentence is tokenised as it was received.
    """
    try:
        if sentence:
            # Byte strings are decoded as UTF-8; text that is already unicode
            # is used directly (decoding it again would raise TypeError).
            text = sentence.decode('utf-8') if isinstance(sentence, bytes) else sentence
            sentence = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
            if not isinstance(sentence, str):
                # Python 3 only: encode() returns bytes there, so go back to
                # str. On Python 2 bytes is str and this branch is skipped.
                sentence = sentence.decode('ascii')
    except (UnicodeError, TypeError):
        print("------------- Error unicodeData-----------")
        print(sentence)
    tokens = WhitespaceTokenizer().tokenize(sentence.lower())
    return tokens




In [2]:
# Dictionary of contractions.
# Maps a contraction (or a bare clitic such as "'s") to the list of word
# tokens it expands to. Lookups are exact and case-sensitive: the first-person
# keys are spelled with a capital "I".
contractions = {
  "'s": ["is"],
  "'re": ["are"],
  "aren't": ["are", "not"],
  "can't": ["can", "not"],
  "can't've": ["can", "not", "have"],
  "'cause": ["because"],
  "could've": ["could", "have"],
  "couldn't": ["could", "not"],
  "couldn't've": ["could", "not", "have"],
  "didn't": ["did", "not"],
  "doesn't": ["does", "not"],
  "don't": ["do", "not"],
  "hadn't": ["had", "not"],
  "hadn't've": ["had", "not", "have"],
  "hasn't": ["has", "not"],
  "haven't": ["have", "not"],
  "he'd": ["he", "would"],
  "he'd've": ["he", "would", "have"],
  "he'll": ["he", "will"],
  "he'll've": ["he", "will", "have"],
  "he's": ["he", "is"],
  "how'd": ["how", "did"],
  "how'd'y": ["how", "do", "you"],
  "how'll": ["how", "will"],
  "how's": ["how", "is"],
  "I'd": ["I", "would"],
  "I'd've": ["I", "would", "have"],
  "I'll": ["I", "will"],
  "I'll've": ["I", "will", "have"],
  "I'm": ["I", "am"],
  "I've": ["I", "have"],
  "isn't": ["is", "not"],
  "it'd": ["it", "had"],
  "it'd've": ["it", "would", "have"],
  "it'll": ["it", "will"],
  "it'll've": ["it", "will", "have"],
  "it's": ["it", "is"],
  "let's": ["let", "us"],
  "ma'am": ["madam"],
  "mayn't": ["may", "not"],
  "might've": ["might", "have"],
  "mightn't": ["might", "not"],
  "mightn't've": ["might", "not", "have"],
  "must've": ["must", "have"],
  "mustn't": ["must", "not"],
  "mustn't've": ["must", "not", "have"],
  "needn't": ["need", "not"],
  "needn't've": ["need", "not", "have"],
  "o'clock": ["of", "the", "clock"],
  "oughtn't": ["ought", "not"],
  "oughtn't've": ["ought", "not", "have"],
  "shan't": ["shall", "not"],
  "sha'n't": ["shall", "not"],
  "shan't've": ["shall", "not", "have"],
  "she'd": ["she", "would"],
  "she'd've": ["she", "would", "have"],
  "she'll": ["she", "will"],
  "she'll've": ["she", "shall", "have"],
  "she's": ["she", "is"],
  "should've": ["should", "have"],
  "shouldn't": ["should", "not"],
  "shouldn't've": ["should", "not", "have"],
  "so've": ["so", "have"],
  "so's": ["so", "as"],
  "that'd": ["that", "would"],
  "that'd've": ["that", "would", "have"],
  "that's": ["that", "has"],
  "there'd": ["there", "had"],
  "there'd've": ["there", "would", "have"],
  "there's": ["there", "is"],
  "they'd": ["they", "had"],
  "they'd've": ["they", "would", "have"],
  "they'll": ["they", "will"],
  "they'll've": ["they", "will", "have"],
  "they're": ["they", "are"],
  "they've": ["they", "have"],
  "to've": ["to", "have"],
  "wasn't": ["was", "not"],
  "we'd": ["we", "had"],
  "we'd've": ["we", "would", "have"],
  "we'll": ["we", "will"],
  "we'll've": ["we", "will", "have"],
  "we're": ["we", "are"],
  "we've": ["we", "have"],
  "weren't": ["were", "not"],
  "what'll": ["what", "will"],
  "what'll've": ["what", "will", "have"],
  "what're": ["what", "are"],
  "what's": ["what", "is"],
  "what've": ["what", "have"],
  "when's": ["when", "is"],
  "when've": ["when", "have"],
  "where'd": ["where", "did"],
  "where's": ["where", "is"],
  "where've": ["where", "have"],
  "who'll": ["who", "will"],
  "who'll've": ["who", "will", "have"],
  "who's": ["who", "is"],
  "who've": ["who", "have"],
  "why's": ["why", "is"],
  "why've": ["why", "have"],
  "will've": ["will", "have"],
  "won't": ["will", "not"],
  "won't've": ["will", "not", "have"],
  "would've": ["would", "have"],
  "wouldn't": ["would", "not"],
  "wouldn't've": ["would", "not", "have"],
  "y'all": ["you", "all"],
  "y'all'd": ["you", "all", "would"],
  "y'all'd've": ["you", "all", "would", "have"],
  "y'all're": ["you", "all", "are"],
  "y'all've": ["you", "all", "have"],
  "you'd": ["you", "had"],
  "you'd've": ["you", "would", "have"],
  "you'll": ["you", "will"],
  "you'll've": ["you", "will", "have"],
  "you're": ["you", "are"],
  "you've": ["you", "have"]
}

def expand_contraction(s, contractions_dict = contractions):
    """Look up ``s`` in ``contractions_dict``.

    Returns the stored expansion, or None when ``s`` is not a key.
    """
    expansion = contractions_dict.get(s, None)
    return expansion

def find_contraction(s):
    """Return True when ``s`` contains an apostrophe, False otherwise."""
    return "'" in s

def replace_contraction(s):
    """Return the expansion of ``s`` if it is a known contraction.

    Args:
        s: a single token.

    Returns:
        The value returned by ``expand_contraction`` when ``s`` contains an
        apostrophe (which is itself None for unknown contractions), otherwise
        None.
    """
    if find_contraction(s):
        return expand_contraction(s)
    # No apostrophe: return None explicitly. Previously this path returned a
    # local that had never been assigned, raising UnboundLocalError.
    return None

def expand(tokens):
    """Replace contraction tokens in ``tokens`` with their expansions, in place.

    A token containing an apostrophe is passed to ``replace_contraction``; when
    that yields an expansion, the expansion (whatever object was returned)
    is stored at the token's position. The same list object is returned.
    """
    for position, token in enumerate(tokens):
        if '\'' not in token:
            continue
        replacement = replace_contraction(token)
        if replacement != None:
            tokens[position] = replacement
    return tokens

In [3]:
from nltk import pos_tag
from nltk.corpus import wordnet as wn

def get_wordnet_pos(tag):
    """Map a POS tag to the matching WordNet POS constant by its first letter.

    Tags starting with J, V, N or R map to ``wn.ADJ``, ``wn.VERB``,
    ``wn.NOUN`` and ``wn.ADV`` respectively; any other tag yields None.
    """
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if tag.startswith(prefix):
            # Resolved only on a match, like the original branch bodies.
            return getattr(wn, constant_name)
    return None

def tag_pos(sentence):
    """Return the result of NLTK's ``pos_tag`` applied to ``sentence``."""
    return pos_tag(sentence)

def replace_base(sentence):
    """Replace each token with its WordNet base form where one is found.

    The sentence is POS-tagged, each tag is mapped to a WordNet POS with
    ``get_wordnet_pos``, and ``wn.morphy`` is asked for the base form.

    Args:
        sentence: list of tokens.

    Returns:
        A new list of the same length; a token is kept unchanged when morphy
        returns None or raises.
    """
    replaced = []
    for word, tag in tag_pos(sentence):
        pos = get_wordnet_pos(tag)
        try:
            base = wn.morphy(word, pos)
        except Exception:
            print("-------------------- Error --------------------")
            print(word)
            # Fall back to the original word. Without this reset `base` would
            # be unbound on the first token, or still hold the previous
            # token's lemma and be appended in place of this word.
            base = None
        replaced.append(word if base is None else base)
    return replaced

def synonyms_word(word):
    """Collect the distinct lemma names over all WordNet synsets of ``word``.

    Returns a list of unique names (duplicates removed via a set, so the
    order is not meaningful); the list is empty when there are no synsets.
    """
    names_per_synset = [
        [str(lemma.name()) for lemma in synset.lemmas()]
        for synset in wn.synsets(word)
    ]
    return list(set(flatten(names_per_synset)))

def synonyms_sentence(sentence):
    """Return one ``synonyms_word`` result per token of ``sentence``, in order."""
    return [synonyms_word(token) for token in sentence]

def replace_syn(x):
    """Align Sentence2 vocabulary with Sentence1 using WordNet synonyms.

    Each token of ``x['Sentence2']`` is replaced, in place, by the first token
    of ``x['Sentence1']`` whose synonym set overlaps its own. Tokens with no
    overlapping Sentence1 token are left unchanged.

    Args:
        x: mapping (e.g. a DataFrame row) with token lists under the keys
            'Sentence1' and 'Sentence2'.

    Returns:
        The same ``x`` object, with its 'Sentence2' list mutated.
    """
    sentence1 = x['Sentence1']
    sentence2 = x['Sentence2']

    # Look up the synonyms of every Sentence1 token once, rather than
    # repeating the same WordNet lookups for each Sentence2 token.
    candidates = [(word, set(synonyms_word(word))) for word in sentence1]

    for i in range(len(sentence2)):
        own_synonyms = set(synonyms_word(sentence2[i]))
        if not own_synonyms:
            # An empty synonym set can never overlap with anything.
            continue
        for word, word_synonyms in candidates:
            if not own_synonyms.isdisjoint(word_synonyms):
                sentence2[i] = word
                break
    return x

In [4]:
# from nltk.parse.stanford import GenericStanfordParser

# g = GenericStanfordParser()
# print g.tagged_parse(nltk.pos_tag('Four men died in an accident.'))
from nltk.tokenize.moses import MosesDetokenizer,MosesTokenizer
from nltk.tokenize import TweetTokenizer
# NOTE(review): these rebind the same `tknzr` / `detokenizer` names that the
# first cell already creates; this cell only needs them if run on its own.
tknzr =MosesTokenizer()
detokenizer = MosesDetokenizer()
def sentence_tokenise(s):
    """Detokenize the token list ``s`` into one string, then Moses-tokenize it.

    Returns whatever ``tknzr.tokenize`` produces for the joined text.
    """
    joined = detokenizer.detokenize(s, return_str=True)
    return tknzr.tokenize(joined)


def preprocessFile(inputFile):
    """Load a pickled DataFrame of sentence pairs and preprocess both sentences.

    Steps, applied to the 'Sentence1' and 'Sentence2' columns:
      0. ASCII-fold, lowercase and whitespace-tokenise (convert_sentence)
      1. drop stop words (removeStopWords)
      2. reduce tokens to their WordNet base form (replace_base)
      3. replace Sentence2 tokens with synonymous Sentence1 tokens (replace_syn)
      4. re-tokenise with the Moses tokenizer (sentence_tokenise)

    Args:
        inputFile: path to a pickle readable by ``pd.read_pickle``.

    Returns:
        The DataFrame with both sentence columns holding token lists.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(inputFile)

    sentence_columns = ['Sentence1', 'Sentence2']

    # Steps 0-2 each look at a single cell, so apply them per column with
    # Series.apply. Row-wise DataFrame.apply on functions returning lists is
    # subject to pandas' result-shape inference and may not give one list per row.
    for step in (convert_sentence, removeStopWords, replace_base):
        for column in sentence_columns:
            df[column] = df[column].apply(step)

    # Step 3 needs both sentences of a row at once, so it stays row-wise.
    df = df.apply(replace_syn, axis=1)

    # Step 4.
    for column in sentence_columns:
        df[column] = df[column].apply(sentence_tokenise)

    return df

In [5]:
# import sys
# reload(sys)
# sys.setdefaultencoding('utf8')
# def encode(s):
#     return s.encode('utf-8')


# df = preprocessFile('../Data/backup/Train_Data')
# df.to_pickle("Preprocessed_ML_TRAIN.pickle")
# df.to_excel("Preprocessed_ML_TRAIN.xlsx")

In [6]:
import unicodedata
# Sanity check of the accent-stripping recipe used in convert_sentence:
# NFD-normalise, then encode to ASCII ignoring whatever cannot be encoded.
text = u'Cześć'
print unicodedata.normalize('NFD', text).encode('ascii', 'ignore')

Czesc


In [7]:
# Preprocess each SICK test split and save the result as a pickle and as an
# Excel sheet. `df` is left bound to the last split processed.
for input_path, pickle_name, excel_name in (
    ('../../data/Pandas_Pickle/STS/SICK/SICK_test_sr',
     "Preprocessed_ML_TEST.pickle",
     "Preprocessed_ML_TEST.xlsx"),
    ('../../data/Pandas_Pickle/STS/SICK/SICK_test_rte',
     "Preprocessed_ML_TEST_RTE.pickle",
     "Preprocessed_ML_TEST_RTE.xlsx"),
):
    df = preprocessFile(input_path)
    df.to_pickle(pickle_name)
    df.to_excel(excel_name)