In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import string
import os
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# directory holding the CSV datasets and the stopword list (relative to the notebook)
path_data = '../datasets'
# NOTE(review): absolute, machine-specific path that is not used in the cells shown here;
# presumably the location of pretrained word2vec vectors -- confirm and make configurable
path_w2v = '/Volumes/DD_Supp'

In [3]:
# sanity check: show which files are available in the data directory
print(os.listdir(path_data))

['smart_stopwords.txt', 'test.csv', 'train.csv']


In [4]:
# punctuation marks to strip outright; dashes and apostrophes are excluded here
# because intra-word occurrences (e.g. "well-known", "it's") must survive
punct = string.punctuation.replace('-', '').replace("'", '')
# group 1 captures a dash/apostrophe sitting between two word characters (kept);
# the alternative matches any other non-alphanumeric character (replaced by a space)
my_regex = re.compile(r"(\b[-']\b)|[\W_]")

def clean_string(string, punct=punct, my_regex=my_regex, to_lower=True, stpwds=None, minstr=3, maxstr=25):
    """Normalise a raw text and tokenise it.

    Steps, in order: optional lower-casing, whitespace collapsing, removal of
    the characters in ``punct``, replacement of every non-alphanumeric
    character by a space except intra-word dashes/apostrophes, whitespace
    stripping, splitting on single spaces, optional stopword removal, digit
    removal inside each token, and finally a token length filter.

    Parameters
    ----------
    string : str
        Raw text to clean.
    punct : container of str
        Characters removed outright (membership is tested per character).
    my_regex : compiled pattern
        Pattern whose group 1, when it matches, is kept as is; any other
        match is replaced by a single space.
    to_lower : bool
        Lower-case the text first when true.
    stpwds : container of str or None
        Tokens to drop. Matching happens *before* digits are stripped from
        the tokens. ``None`` disables stopword removal.
    minstr, maxstr : int
        Inclusive bounds on the length of the tokens that are kept.

    Returns
    -------
    (str, list of str)
        The cleaned text (digits, stopwords and short/long words are still
        present in it) and the list of filtered tokens.
    """
    text = string.lower() if to_lower else string
    # collapse every run of whitespace (tabs, newlines, ...) into one space;
    # raw string avoids the invalid "\s" escape-sequence warning
    text = re.sub(r'\s+', ' ', text)
    # remove punctuation
    text = ''.join(char for char in text if char not in punct)
    # keep intra-word dashes/apostrophes, blank out everything else that matched
    text = my_regex.sub(lambda match: match.group(1) or ' ', text)
    # squeeze the spaces introduced above, then trim both ends
    text = re.sub(' +', ' ', text).strip()
    # tokenize
    tokens = text.split(' ')
    # remove stopwords (identity test: safe for any container type)
    if stpwds is not None:
        tokens = [token for token in tokens if token not in stpwds]
    # remove digits from inside each token
    tokens = [''.join(char for char in token if not char.isdigit()) for token in tokens]
    # keep tokens whose length lies within [minstr, maxstr]; this also drops
    # the empty tokens produced by empty input or all-digit words
    tokens = [token for token in tokens if minstr <= len(token) <= maxstr]
    return text, tokens

In [5]:
texts = {}        # text id -> raw text (first occurrence wins)
clean_texts = {}  # text id -> cleaned text
tokens = {}       # text id -> list of tokens
pairs_train = []  # [id1, id2] for every training row
pairs_test = []   # [id1, id2] for every test row
y_train = []      # integer label of every training row

# Each row is split on commas and read positionally: fields 1 and 2 are the ids
# of the two texts, fields 3 and 4 the texts themselves, field 5 the label.
# NOTE(review): a plain split(',') assumes no field contains a comma -- confirm,
# or switch to the csv module.
with open(os.path.join(path_data, 'train.csv'), 'r', encoding='utf8') as f:
    for line in f:
        l = line.split(',')
        if l[1] not in texts:
            texts[l[1]] = l[3]
        if l[2] not in texts:
            texts[l[2]] = l[4]

        pairs_train.append([l[1], l[2]])

        # int() ignores the trailing newline by itself; slicing off the last
        # character would corrupt the label of a final line that has no newline
        y_train.append(int(l[5]))
    

# Same positional layout as the training file, without the label: fields 1 and 2
# are the text ids, fields 3 and 4 the texts.
with open(os.path.join(path_data, 'test.csv'), 'r', encoding='utf8') as f:
    for line in f:
        l = line.split(',')
        if l[1] not in texts:
            texts[l[1]] = l[3]
        if l[2] not in texts:
            # strip only the line terminator; dropping the last character
            # unconditionally eats a real character when the final line has
            # no trailing newline
            texts[l[2]] = l[4].rstrip('\n')

        pairs_test.append([l[1], l[2]])
        
# run every raw text through clean_string once, storing the cleaned string
# and its token list under the same id
for key, raw_text in texts.items():
    clean_texts[key], tokens[key] = clean_string(raw_text)


# maps each text id to its position in `texts` (iteration order), i.e. the row
# idx of each unique text in the TFIDF matrix
ids2ind = {qid: row for row, qid in enumerate(texts)}

In [10]:
# fit a TF-IDF vectorizer with default settings and vectorise every text,
# in the iteration order of `texts`
# NOTE(review): this is fitted on the raw `texts`, not on `clean_texts` -- confirm intended
vec = TfidfVectorizer()
M = vec.fit_transform(texts.values())

In [13]:
# number of tokenised texts (one token list per unique text id)
len(tokens.values())

58940

In [None]:
my_q = 300 # to match dim of GNews word vectors
mcount = 2 # words seen fewer than this many times are left out of the vocabulary
# NOTE(review): `size` is the keyword used by gensim < 4.0 (later renamed
# `vector_size`) -- verify against the installed gensim version
w2v_perso = Word2Vec(size=my_q, min_count=mcount)
# build the vocabulary from the token lists of all texts; the original passed
# `token`, a name that is not defined anywhere in the cells shown
w2v_perso.build_vocab(list(tokens.values()))