In [43]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [44]:
# Load the raw comment export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell only
# runs where that exact directory exists -- consider a configurable data directory.
file = pd.read_csv(r"C:\Users\15145\Documents\text analytics\Analysis_Final.csv")

In [45]:
# Display the loaded frame as the cell output.
file

Unnamed: 0,userid,date,comment
0,libertycat,Mar-03,I'm not due to the smell and noise of diesel.
1,tidester,Mar-03,I really think you should take this over to th...
2,jlp8885,Mar-03,today's diesels- common rail or even the old v...
3,dieselbreath,Mar-03,I suspect it may get the Benz E300D power-plan...
4,gagrice,Mar-03,It may be the co-op in Volga. That is where he...
...,...,...,...
4996,tired_old_dave,Apr-06,atf+4 is dc tranny fluid. What are they using ...
4997,jimhemi,Apr-06,"Maybe you're onto something. Crack the egg, ma..."
4998,jimhemi,Apr-06,Take it it a Detroit Diesel Repair shop. DC ow...
4999,caribou1,Apr-06,How would you react if a (really good and seri...


In [46]:
# Remove rows with missing values and exact duplicate rows, renumber the index
# from zero, and expose the comment text under the column name 'title'.
file_cleaned = (
    file
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'comment': 'title'})
)

In [47]:
# Display the cleaned frame as the cell output.
file_cleaned

Unnamed: 0,userid,date,title
0,libertycat,Mar-03,I'm not due to the smell and noise of diesel.
1,tidester,Mar-03,I really think you should take this over to th...
2,jlp8885,Mar-03,today's diesels- common rail or even the old v...
3,dieselbreath,Mar-03,I suspect it may get the Benz E300D power-plan...
4,gagrice,Mar-03,It may be the co-op in Volga. That is where he...
...,...,...,...
4991,tired_old_dave,Apr-06,atf+4 is dc tranny fluid. What are they using ...
4992,jimhemi,Apr-06,"Maybe you're onto something. Crack the egg, ma..."
4993,jimhemi,Apr-06,Take it it a Detroit Diesel Repair shop. DC ow...
4994,caribou1,Apr-06,How would you react if a (really good and seri...


In [48]:
# Fraction of all rows contributed by each userid, most frequent first.
rows_per_user = file_cleaned["userid"].value_counts()
rows_per_user / len(file_cleaned)

caribou1          0.085068
winter2           0.076861
farout            0.055044
tired_old_dave    0.036829
moparbad          0.036229
                    ...   
drobinson         0.000200
rs_petty          0.000200
crdlibertyownr    0.000200
isis2             0.000200
farm101           0.000200
Name: userid, Length: 459, dtype: float64

In [49]:
# Show the rows whose userid equals the integer 0.
# NOTE(review): the recorded output of this cell is an empty frame -- confirm
# whether a 0 userid can occur in this dataset at all.
file_cleaned[file_cleaned.userid==0]

Unnamed: 0,userid,date,title


In [50]:
# Keep only the rows whose userid is not the integer 0.
userid_is_nonzero = file_cleaned["userid"] != 0
file_cleaned = file_cleaned.loc[userid_is_nonzero]

In [51]:
# Re-check each userid's fraction of the rows after the filtering step.
rows_per_user = file_cleaned["userid"].value_counts()
rows_per_user / len(file_cleaned)

caribou1          0.085068
winter2           0.076861
farout            0.055044
tired_old_dave    0.036829
moparbad          0.036229
                    ...   
drobinson         0.000200
rs_petty          0.000200
crdlibertyownr    0.000200
isis2             0.000200
farm101           0.000200
Name: userid, Length: 459, dtype: float64

In [52]:

def text_to_word_list(text, remove_polish_letters=None):
    '''Lower-case a text, strip unwanted characters and split it into tokens.

    Method inspired by the eliorc github repo:
    https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb

    Parameters
    ----------
    text : object
        Raw text. It is converted with ``str()`` first, so non-string values
        are tokenised through their string form (``None`` -> ``['none']``).
    remove_polish_letters : object, optional
        Not used by this function. It is retained, now with a default, so that
        existing two-argument calls keep working.

    Returns
    -------
    list of str
        Lower-cased tokens. ``+`` is spelled out as the token ``plus``; ``!``
        and ``?`` become tokens of their own; ``,``, ``.`` and ``'`` act as
        separators; ``/`` and ``^`` are left inside their surrounding token.
    '''
    text = str(text).lower()

    # Every character outside the allowed set (letters, digits and ^ , ! ? . / ' +)
    # becomes a space. This already covers ':' and all whitespace other than ' ',
    # so no separate colon handling or whitespace collapsing is needed afterwards.
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)

    # Spell out '+' as a word so forms such as "atf+4" keep their meaning.
    text = sub(r"\+", " plus ", text)

    # Commas, full stops and apostrophes are plain separators.
    text = sub(r"[,.']", " ", text)

    # Pad '!' and '?' with spaces so each one becomes a token of its own.
    text = sub(r"([!?])", r" \1 ", text)

    # split() without arguments discards empty strings and runs of spaces.
    return text.split()

In [53]:
# Replace every comment in 'title' with its list of tokens.
# - .assign builds a new frame instead of writing into a column of a frame that
#   came out of a boolean filter, which avoids pandas' SettingWithCopy pitfalls.
# - Rows that already hold a token list are passed through untouched, so running
#   this cell a second time does not tokenise the string form of a list.
# The unidecode module is only passed to fill the function's second positional
# parameter; text_to_word_list does not use that argument.
file_cleaned = file_cleaned.assign(
    title=file_cleaned["title"].apply(
        lambda x: x if isinstance(x, list) else text_to_word_list(x, unidecode)
    )
)

In [54]:
# Display the 'title' column after tokenisation.
file_cleaned.title

0       [i, m, not, due, to, the, smell, and, noise, o...
1       [i, really, think, you, should, take, this, ov...
2       [today, s, diesels, common, rail, or, even, th...
3       [i, suspect, it, may, get, the, benz, e300d, p...
4       [it, may, be, the, co, op, in, volga, that, is...
                              ...                        
4991    [atf, plus, 4, is, dc, tranny, fluid, what, ar...
4992    [maybe, you, re, onto, something, crack, the, ...
4993    [take, it, it, a, detroit, diesel, repair, sho...
4994    [how, would, you, react, if, a, really, good, ...
4995    [dave, the, level, of, the, fluid, in, the, re...
Name: title, Length: 4996, dtype: object

In [55]:
# Work on a copy of the cleaned frame and keep only the rows whose 'title'
# entry has a length greater than one.
has_multiple_items = file_cleaned["title"].str.len() > 1
file_model = file_cleaned.copy().loc[has_multiple_items]

In [56]:
# Display the frame that will feed the phrase and embedding models.
file_model

Unnamed: 0,userid,date,title
0,libertycat,Mar-03,"[i, m, not, due, to, the, smell, and, noise, o..."
1,tidester,Mar-03,"[i, really, think, you, should, take, this, ov..."
2,jlp8885,Mar-03,"[today, s, diesels, common, rail, or, even, th..."
3,dieselbreath,Mar-03,"[i, suspect, it, may, get, the, benz, e300d, p..."
4,gagrice,Mar-03,"[it, may, be, the, co, op, in, volga, that, is..."
...,...,...,...
4991,tired_old_dave,Apr-06,"[atf, plus, 4, is, dc, tranny, fluid, what, ar..."
4992,jimhemi,Apr-06,"[maybe, you, re, onto, something, crack, the, ..."
4993,jimhemi,Apr-06,"[take, it, it, a, detroit, diesel, repair, sho..."
4994,caribou1,Apr-06,"[how, would, you, react, if, a, really, good, ..."


In [57]:
# One entry per row of file_model: the content of its 'title' cell.
sent = list(file_model.title)

# Fit the phrase detector on those entries; min_count=1 lets pairs seen only
# once be considered as well.
phrases = Phrases(sent, progress_per=50000, min_count=1)

# Wrap the fitted detector in a Phraser and apply it to the whole corpus.
bigram = Phraser(phrases)
sentences = bigram[sent]

# Spot-check the second transformed sentence.
sentences[1]

INFO - 13:51:48: collecting all words and their counts
INFO - 13:51:48: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 13:51:51: collected 196766 word types from a corpus of 491436 words (unigram + bigrams) and 4992 sentences
INFO - 13:51:51: using 196766 counts as vocab in Phrases<0 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 13:51:51: source_vocab length 196766
INFO - 13:51:57: Phraser built with 11286 phrasegrams


['i',
 'really',
 'think',
 'you',
 'should',
 'take',
 'this',
 'over',
 'to',
 'the',
 'jeep_liberty',
 'forum',
 'thanks',
 '!_tidester',
 'host']

In [58]:
# Leave one core free for the rest of the system, but never request fewer than
# one worker: cpu_count() - 1 evaluates to 0 on a single-core machine.
n_workers = max(1, multiprocessing.cpu_count() - 1)

# NOTE(review): `size` is the gensim 3.x spelling of the embedding dimension;
# gensim >= 4 calls it `vector_size` -- confirm against the installed version.
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     size=300,
                     sample=1e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=n_workers)

start = time()

# Scan the phrase-merged corpus once to build the vocabulary.
w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

INFO - 13:52:08: collecting all words and their counts
INFO - 13:52:08: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 13:52:10: collected 27949 word types from a corpus of 430486 raw words and 4992 sentences
INFO - 13:52:10: Loading a fresh vocabulary
INFO - 13:52:10: effective_min_count=3 retains 11053 unique words (39% of original 27949, drops 16896)
INFO - 13:52:10: effective_min_count=3 leaves 406733 word corpus (94% of original 430486, drops 23753)
INFO - 13:52:10: deleting the raw counts dictionary of 27949 items
INFO - 13:52:10: sample=1e-05 downsamples 3290 most-common words
INFO - 13:52:10: downsampling leaves estimated 97903 word corpus (24.1% of prior 406733)
INFO - 13:52:10: estimated required memory for 11053 words and 300 dimensions: 32053700 bytes
INFO - 13:52:10: resetting layer weights


Time to build vocab: 0.14 mins


In [59]:
# Time the training run over the phrase-merged corpus.
start = time()
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=15,
    report_delay=1,
)
elapsed_minutes = round((time() - start) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_minutes))

# Called with replace=True after training, as in the original cell.
w2v_model.init_sims(replace=True)

INFO - 13:52:16: training model with 7 workers on 11053 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=20 window=4
INFO - 13:52:17: EPOCH 1 - PROGRESS: at 27.10% examples, 26251 words/s, in_qsize 1, out_qsize 0
INFO - 13:52:18: EPOCH 1 - PROGRESS: at 56.25% examples, 26427 words/s, in_qsize 0, out_qsize 0
INFO - 13:52:19: EPOCH 1 - PROGRESS: at 88.76% examples, 27850 words/s, in_qsize 0, out_qsize 0
INFO - 13:52:19: worker thread finished; awaiting finish of 6 more threads
INFO - 13:52:19: worker thread finished; awaiting finish of 5 more threads
INFO - 13:52:19: worker thread finished; awaiting finish of 4 more threads
INFO - 13:52:19: worker thread finished; awaiting finish of 3 more threads
INFO - 13:52:19: worker thread finished; awaiting finish of 2 more threads
INFO - 13:52:19: worker thread finished; awaiting finish of 1 more threads
INFO - 13:52:19: worker thread finished; awaiting finish of 0 more threads
INFO - 13:52:19: EPOCH - 1 : training on 430486 raw 

INFO - 13:52:47: worker thread finished; awaiting finish of 1 more threads
INFO - 13:52:47: worker thread finished; awaiting finish of 0 more threads
INFO - 13:52:47: EPOCH - 9 : training on 430486 raw words (97808 effective words) took 3.4s, 28468 effective words/s
INFO - 13:52:48: EPOCH 10 - PROGRESS: at 27.10% examples, 25887 words/s, in_qsize 0, out_qsize 0
INFO - 13:52:49: EPOCH 10 - PROGRESS: at 60.22% examples, 27729 words/s, in_qsize 0, out_qsize 0
INFO - 13:52:50: EPOCH 10 - PROGRESS: at 93.13% examples, 28622 words/s, in_qsize 0, out_qsize 0
INFO - 13:52:50: worker thread finished; awaiting finish of 6 more threads
INFO - 13:52:50: worker thread finished; awaiting finish of 5 more threads
INFO - 13:52:50: worker thread finished; awaiting finish of 4 more threads
INFO - 13:52:50: worker thread finished; awaiting finish of 3 more threads
INFO - 13:52:50: worker thread finished; awaiting finish of 2 more threads
INFO - 13:52:50: worker thread finished; awaiting finish of 1 more 

Time to train the model: 0.85 mins


In [60]:
# Persist the trained model to "word2vec.model" (a path relative to the current working directory).
w2v_model.save("word2vec.model")

INFO - 13:53:07: saving Word2Vec object under word2vec.model, separately None
INFO - 13:53:07: not storing attribute vectors_norm
INFO - 13:53:07: not storing attribute cum_table
INFO - 13:53:08: saved word2vec.model


In [61]:
# Build the export frame from file_model in one step:
#   old_title -> the original tokens joined with single spaces
#   title     -> the tokens after the bigram transformation, joined the same way
file_export = file_model.assign(
    old_title=file_model["title"].str.join(' '),
    title=file_model["title"].apply(lambda tokens: ' '.join(bigram[tokens])),
)

In [62]:
# Write only the 'title' and 'userid' columns to CSV, without the index.
# NOTE(review): absolute, machine-specific Windows path; 'old_title' is not part
# of the exported columns -- confirm that this is intended.
file_export[['title', 'userid']].to_csv(r"C:\Users\15145\Documents\text analytics\cleaned_dataset.csv", index=False)