In [1]:
import pandas as pd
import numpy as np
import re
import nltk
# Resources used further down: the stop-word list, the WordNet data behind the
# lemmatizer, and the Punkt models that word_tokenize() relies on.
# NOTE(review): 'punkt_tab' is the name newer NLTK releases use for the Punkt
# data; on an install that does not know a package id, nltk.download() is
# expected to just report the failure instead of raising - confirm for the
# pinned NLTK version.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)
from nltk.corpus import stopwords 

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 


from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

from collections import Counter

from nltk.util import ngrams

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aleksejfilippov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aleksejfilippov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
import torch
# Report whether PyTorch can see a CUDA device in this environment.
# NOTE(review): torch is not used anywhere else in the visible part of this
# notebook - confirm this cell is still needed.
torch.cuda.is_available()

False

# 1. Data cleaning

In [15]:
# Load both raw splits, then report how many rows/columns each one has.
# NOTE(review): the paths are absolute and machine-specific.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/aleksejfilippov/Desktop/techotrack/quora_2/csvs/train.csv',
        '/Users/aleksejfilippov/Desktop/techotrack/quora_2/csvs/test.csv',
    )
)
print("Train datasets shape:", train.shape)
print("Test datasets shape:", test.shape)

Train datasets shape: (1306122, 3)
Test datasets shape: (56370, 2)


## 1.1 Remove the most common words and all punctuation marks and symbols.

In [16]:
# Extra tokens to drop on top of NLTK's English stop-word list: a hand-picked
# list of very common words, and a list of punctuation marks / symbols.
frequent_words = ['what','when','why','which','who','how', 'whose', 'whome', 'people', 'i', 
                  'n\'t','\'s','like','get','would','would','many', 'want', 'good', 'india', 'girl',
                  'first', 'take', 'much', 'ever', 'take', 'feel', 'know', 'think', 'make', 
                  'year', 'time', 'still', 'life', 'country', 'world', 'question', 'even', 'really',
                  'love', 'better', 'human', 'right', 'thing', 'could', 'give', 'person', 'child']
stop_signs = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√']
# A set rather than a list: stop_words is only used for `in` tests, and those
# run once per token over every question, so constant-time lookup matters.
# Duplicate entries in the lists above collapse automatically.
stop_words = set(stopwords.words('english'))
stop_words.update(frequent_words)
stop_words.update(stop_signs)

## 1.2 Word lemmatization, keeping only words with more than 3 letters.

In [17]:
def clean_question(sentence):
    """Normalise one question into a space-joined string of kept tokens.

    The text is tokenised, lower-cased, lemmatised, and then stripped of stop
    words and of tokens that are three characters long or shorter.

    Returns the placeholder string '0' when no token survives, so that no
    document ends up as an empty string.
    """
    # Lower-case *before* lemmatising: the WordNet lookup is case-sensitive,
    # so a capitalised form such as "Cars" would otherwise come back unchanged
    # and only be lower-cased afterwards ("cars" instead of "car").
    lemmas = (wordnet_lemmatizer.lemmatize(tok.lower()) for tok in word_tokenize(sentence))
    kept = [w for w in lemmas if len(w) > 3 and w not in stop_words]
    return ' '.join(kept) if kept else '0'


# Same cleaning for both splits, one output string per input question.
cleaned_questions_train = [clean_question(s) for s in train['question_text']]
cleaned_questions_test = [clean_question(s) for s in test['question_text']]

In [18]:
# Keep only the question id (plus the label for train) and put the cleaned
# text in front of them as the first column.
train1 = train[['qid', 'target']].copy()
test1 = test[['qid']].copy()
train1.insert(0, "debugged_questions", cleaned_questions_train)
test1.insert(0, "debugged_questions", cleaned_questions_test)

# 2. Data preparation

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
import time

## 2.1 TF-IDF. Using feature selection to find the 500 most meaningful words.

In [20]:
# Fit the 1- to 3-gram TF-IDF vocabulary on the cleaned text of both splits,
# then map each split into that shared feature space.
all_questions = train1.debugged_questions.tolist() + test1.debugged_questions.tolist()
tfv = TfidfVectorizer(ngram_range=(1, 3)).fit(all_questions)

train2_ne = tfv.transform(train1["debugged_questions"])
test2_ne = tfv.transform(test1["debugged_questions"])

In [None]:
# Keep the 500 TF-IDF features that score highest on a chi-squared test
# against the training target.
select = SelectKBest(chi2, k=500)
X_new = select.fit_transform(train2_ne, train1.target)
# get_feature_names() no longer exists in recent scikit-learn releases, where
# get_feature_names_out() replaces it; fall back to the old name on installs
# that predate the new method.
if hasattr(tfv, 'get_feature_names_out'):
    names = tfv.get_feature_names_out()
else:
    names = tfv.get_feature_names()
# Boolean mask from the selector -> the n-grams that were kept.
selected_words = np.asarray(names)[select.get_support()]
print('Here goes a list of most meaningful words and word combinations of length less than four:\n')
print(', '.join(selected_words))

In [22]:
start1 = time.time()
# Let the fitted selector pick the columns for both splits the same way,
# instead of masking train directly and double-transposing test.
tr_idf = pd.DataFrame(select.transform(train2_ne).toarray(), dtype = 'float16')
te_idf = pd.DataFrame(select.transform(test2_ne).toarray(), dtype = 'float16')

# Column names follow the number of selected features rather than repeating
# the literal 500 used when the selector was created.
idf_columns = ['idf' + str(tok) for tok in range(tr_idf.shape[1])]
tr_idf.columns = idf_columns
te_idf.columns = idf_columns

# Re-attach the identifiers (and the label for train) next to the features.
train2_idf = pd.concat([tr_idf, train1.qid, train1.target], axis = 1)
test2_idf = pd.concat([te_idf, test1.qid], axis = 1)
print(time.time() - start1)

34.030734062194824


## 2.2 Glove.840B.300d. 

In [23]:
# Reload embeddings even if already loaded (so that each section is independent)
EMBEDDING_FILE = '/Users/aleksejfilippov/Desktop/techotrack/quora_2/Data_preparation/glove.840B.300d.txt'
def get_coefs(word, *arr):
    """Pair a token with its remaining fields converted to a float32 array.

    `word` is returned untouched; every further positional argument becomes
    one element of the array.
    """
    coefficients = np.asarray(arr, dtype='float32')
    return word, coefficients

# Build the token -> vector lookup from the embedding text file, one line per
# token. The context manager closes the file handle once the dict is built,
# and the explicit encoding avoids depending on the platform default.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file)

In [24]:
def sentence_vector(sentence, dim=300):
    """Sum the embedding vectors of the space-separated tokens of `sentence`.

    Tokens that are missing from `embeddings_index` are skipped, so a sentence
    without any known token yields an all-zero vector. The sum is accumulated
    in float32 and returned as a float16 array of length `dim`.
    """
    total = np.zeros(dim, dtype=np.float32)
    for token in sentence.split(' '):
        coefs = embeddings_index.get(token)
        if coefs is not None:
            total += coefs
    return total.astype(np.float16)


# One summed vector per cleaned question, for each split.
train_list_of_vectors = [sentence_vector(s) for s in train1.debugged_questions.tolist()]
test_list_of_vectors = [sentence_vector(s) for s in test1.debugged_questions.tolist()]

In [None]:
start1 = time.time()
tr_glove = pd.DataFrame(train_list_of_vectors, dtype = 'float16')
te_glove = pd.DataFrame(test_list_of_vectors, dtype = 'float16')

# Name the columns of BOTH frames. The train frame used to be renamed twice,
# which left the test frame with its default integer column labels.
glove_columns = ['glove'+str(tok) for tok in range(300)]
tr_glove.columns = glove_columns
te_glove.columns = glove_columns

# Re-attach the identifiers (and the label for train) next to the features.
train2_glove = pd.concat([tr_glove, train1.qid, train1.target], axis = 1)
test2_glove = pd.concat([te_glove, test1.qid], axis = 1)
print(time.time() - start1)

# 3. Latent Dirichlet Allocation model to be used as extra features.

In [None]:
from gensim import models, corpora

In [None]:
# Whitespace-tokenise the cleaned questions of each split.
train_tokens = [sentence.split() for sentence in train1.debugged_questions.tolist()]
test_tokens = [sentence.split() for sentence in test1.debugged_questions.tolist()]

all_tokens = train_tokens + test_tokens

# One vocabulary shared by both splits.
dictionary = corpora.Dictionary(all_tokens)

# Convert every document to bag-of-words exactly once. The combined corpus
# reuses the per-split lists in the same order as all_tokens (train first,
# then test) instead of running doc2bow over every document a second time.
train_corpora = [dictionary.doc2bow(tokens) for tokens in train_tokens]
test_corpora = [dictionary.doc2bow(tokens) for tokens in test_tokens]
corpus = train_corpora + test_corpora

In [None]:
%time ldamodel = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=8, passes=5)
# ldamodel = models.ldamodel.LdaModel.load("/Users/aleksejfilippov/Desktop/techotrack/quora/ldamodel3_lkcd")

## 3.1 Create matrix Theta describing the distribution of topics over documents.

In [60]:
# Ask the model for topic distributions, passing each whole split at once.
train_topics, test_topics = map(ldamodel.get_document_topics, (train_corpora, test_corpora))

In [61]:
# Per-document topic lists: for every bag-of-words document, the list of
# (topic_id, probability) tuples the model returns for it.
# No running counter is kept here: a bare `i = i + 1` would need an `i` that
# this notebook never initialises and would fail on a fresh kernel.
train_documents_topics = [list(ldamodel.get_document_topics(corp)) for corp in train_corpora]
test_documents_topics = [list(ldamodel.get_document_topics(corp)) for corp in test_corpora]

In [99]:
n_topics = 8  # must match the num_topics the LDA model was created with


def topics_to_matrix(documents_topics, n_cols):
    """Turn per-document (topic_id, probability) lists into a dense matrix.

    Row r holds the probabilities of document r, column c those of topic c.
    The matrix starts out as zeros, so a topic that is absent from a
    document's list reads as 0.0 instead of whatever an uninitialised
    np.ndarray buffer happens to contain.
    """
    matrix = np.zeros((len(documents_topics), n_cols))
    for row, doc_topics in enumerate(documents_topics):
        for topic_id, probability in doc_topics:
            matrix[row, int(topic_id)] = probability
    return matrix


X_ps_tr = pd.DataFrame(topics_to_matrix(train_documents_topics, n_topics), dtype = 'float16')
X_ps_te = pd.DataFrame(topics_to_matrix(test_documents_topics, n_topics), dtype = 'float16')

lda_columns = ['lda'+str(tok) for tok in range(n_topics)]
X_ps_tr.columns = lda_columns
X_ps_te.columns = lda_columns

## 3.2 Concatenate Theta as a new feature matrix to the existing datasets.

In [103]:
# Put the LDA topic columns in front of each existing feature frame,
# first for the two training frames, then for the two test frames.
train2_idf_lda = pd.concat((X_ps_tr, train2_idf), axis="columns")
train2_glove_lda = pd.concat((X_ps_tr, train2_glove), axis="columns")

test2_idf_lda = pd.concat((X_ps_te, test2_idf), axis="columns")
test2_glove_lda = pd.concat((X_ps_te, test2_glove), axis="columns")

In [110]:
# Positional column labels, built once per feature family: 8 LDA topics,
# 500 TF-IDF features, 300 GloVe dimensions.
lda_names = ['lda' + str(k) for k in range(8)]
idf_names = ['idf' + str(k) for k in range(500)]
glove_names = ['glove' + str(k) for k in range(300)]

# Train frames end with the id and the label, test frames with the id only.
train2_idf_lda.columns = lda_names + idf_names + ['qid', 'target']
test2_idf_lda.columns = lda_names + idf_names + ['qid']

train2_glove_lda.columns = lda_names + glove_names + ['qid', 'target']
test2_glove_lda.columns = lda_names + glove_names + ['qid']

In [111]:
# Preview the first rows of the combined LDA + GloVe training frame.
train2_glove_lda.head()

Unnamed: 0,glove0,glove1,glove2,glove3,glove4,glove5,glove6,glove7,glove8,glove9,...,qid,target,lda0,lda1,lda2,lda3,lda4,lda5,lda6,lda7
0,0.632324,-0.392578,1.703125,-0.258301,2.683594,-0.836914,-0.585449,1.135742,0.002935,10.109375,...,00002165364db923c7e6,0,0.187506,0.020833,0.520843,0.187484,0.020833,0.020833,0.020833,0.020833
1,0.660645,-0.067566,-0.893066,-0.07605,0.353516,-0.841797,0.852539,-0.271484,0.011772,9.40625,...,000032939017120e6e44,0,0.025,0.025,0.625005,0.025,0.025,0.224995,0.025,0.025
2,0.811035,3.775391,-2.605469,0.283936,-2.291016,1.155273,0.00101,1.822266,0.608887,7.058594,...,0000412ca6e4628ce2cf,0,0.017857,0.017885,0.017857,0.446426,0.446403,0.017857,0.017857,0.017857
3,0.19812,0.214722,-0.312744,0.356445,0.397949,0.316162,-0.939453,0.811523,-0.875,0.467773,...,000042bf85aa498cd78e,0,0.020847,0.020845,0.020847,0.020846,0.854052,0.020847,0.020852,0.020865
4,3.681641,1.371094,-1.675781,-1.631836,-0.024017,1.447266,0.417725,0.409424,-4.195312,3.080078,...,0000455dfa3e01eae3af,0,0.386566,0.015633,0.248629,0.015633,0.01565,0.286622,0.015634,0.015633


In [None]:
# Write the four feature tables to CSV files, using relative file names.
for feature_frame, csv_name in (
    (train2_idf_lda, 'train2_idf_lda.csv'),
    (test2_idf_lda, 'test2_idf_lda.csv'),
    (train2_glove_lda, 'train2_glove_lda.csv'),
    (test2_glove_lda, 'test2_glove_lda.csv'),
):
    feature_frame.to_csv(csv_name)

#### end of data preprocessing