In [10]:
import os
import re
import logging
from glob import glob
from pprint import pprint

import gensim
import pyLDAvis
import pyLDAvis.gensim
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
import gensim.corpora as corpora

# VUEPOINT TEXT PROCESSING PIPELINE
from vuepoint import NgramFreq
from vuepoint.StatsPipeline import get_corpus_stats
from vuepoint.TextPipeline import preprocess_corpus
from vuepoint.TextPipeline import format_sent_topics
from vuepoint.ModelingPipeline import process_words

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [3]:
corpus_large = []
for file in glob('downloads/cartrends/*.json'):
    corpus = pd.read_json(file, encoding='utf-8', lines=True)
    corpus_large.append(corpus)

corpus_large = pd.concat(corpus_large, ignore_index=True)
corpus_large.text.describe()

count          27406
unique         27081
top       Thank you﻿
freq              19
Name: text, dtype: object

In [4]:
corpus_large.text.head(5)

0    Happy New Year!\n\n⬇️Scotty’s Top DIY Tools:\r...
1    Hi Scotty I recently bought a 95 Camry automat...
2    I really dislike those new grills as well.  Ha...
3                             HAPPY NEW YEAR SCOTTY!!﻿
4    Scotty Kilmer what do u think of a 2003 Chevy ...
Name: text, dtype: object

In [5]:
get_corpus_stats(corpus_large, 'text')
corpus_large = corpus_large[corpus_large.word_count > 10]
preprocess_corpus(corpus_large, 'text')
corpus_large.text.head(5)

extracting text features
extracting content features from text
extracting text sentiment features


0    happy new year scotty top diy tool bluetooth s...
1    hi scotty recently bought camry automatic v k ...
2    really dislike new grill well happy new year s...
4            scotty kilmer u think chevy thousand mile
5    scotty kilmer happy new year always enjoy vide...
Name: text, dtype: object

In [6]:
corpus_large.to_csv('cartrends_large_corpus.csv')
corpus_large.describe()

Unnamed: 0,word_count,char_lengh,n_avg_word,n_numerics,upper_case,polarity,subjectivity
count,16860.0,16860.0,16860.0,16860.0,16860.0,16860.0,16860.0
mean,38.766548,209.290036,4.438942,0.302135,1.5328,0.068759,0.471441
std,44.481971,244.133382,0.601176,0.783714,3.549292,0.295928,0.267065
min,11.0,28.0,1.1,0.0,0.0,-1.0,0.0
25%,16.0,85.0,4.066667,0.0,0.0,-0.05,0.3125
50%,25.0,134.0,4.386364,0.0,1.0,0.04,0.5
75%,44.0,237.0,4.74333,0.0,2.0,0.225,0.644555
max,1018.0,5680.0,15.384615,19.0,224.0,1.0,1.0


In [7]:
corpus_large.reindex(np.random.permutation(corpus_large.index))
corpus_large.text.describe()

count                                           16860
unique                                          16826
top       tesla autopilot safer human credible source
freq                                                3
Name: text, dtype: object

In [22]:
top_unigrams = NgramFreq.get_top_unigrams(
    corpus_large.text.values.tolist(), topn=20)
top_unigrams

[('like', 3648),
 ('one', 2435),
 ('get', 2012),
 ('new', 1736),
 ('people', 1723),
 ('make', 1711),
 ('would', 1707),
 ('thing', 1686),
 ('look', 1611),
 ('year', 1585),
 ('drive', 1541),
 ('even', 1457),
 ('driving', 1392),
 ('time', 1328),
 ('driver', 1277),
 ('think', 1257),
 ('need', 1252),
 ('want', 1221),
 ('screen', 1221),
 ('know', 1212)]

In [20]:
top_bigrams = NgramFreq.get_top_bigrams(
    corpus_large.text.values.tolist(), topn=20)
top_bigrams

[('look like', 309),
 ('touch screen', 291),
 ('start stop', 285),
 ('self driving', 279),
 ('best driver', 248),
 ('year old', 202),
 ('steering wheel', 181),
 ('low profile', 165),
 ('spare tire', 149),
 ('parking brake', 147),
 ('door handle', 145),
 ('sound like', 143),
 ('manual transmission', 137),
 ('worst trend', 137),
 ('motor trend', 136),
 ('feel like', 131),
 ('fake vent', 130),
 ('profile tire', 126),
 ('year ago', 126),
 ('stuck screen', 121)]

In [21]:
top_trigrams = NgramFreq.get_top_trigrams(
    corpus_large.text.values.tolist(), topn=20)
top_trigrams

[('low profile tire', 116),
 ('tesla hater tesla', 92),
 ('hater tesla hater', 91),
 ('happy new year', 78),
 ('beep beep beep', 69),
 ('fake engine noise', 62),
 ('electronic parking brake', 59),
 ('take eye road', 45),
 ('new year scotty', 32),
 ('fake exhaust tip', 32),
 ('full size spare', 29),
 ('push button start', 29),
 ('auto start stop', 28),
 ('check engine light', 27),
 ('electronic door handle', 26),
 ('disciple jesus christ', 26),
 ('self driving vehicle', 25),
 ('front wheel drive', 24),
 ('keep eye road', 23),
 ('taking eye road', 22)]

In [10]:
top_quadgrams = NgramFreq.get_top_quadgrams(
    corpus_large.text.values.tolist(), topn=10)
top_quadgrams

[('tesla hater tesla hater', 91),
 ('hater tesla hater tesla', 91),
 ('beep beep beep beep', 65),
 ('happy new year scotty', 32),
 ('peep peep peep peep', 14),
 ('world greatest drag race', 12),
 ('wheel low profile tire', 11),
 ('without taking eye road', 10),
 ('awakened truth disciple jesus', 10),
 ('truth disciple jesus christ', 10)]

In [11]:
def sent_to_words(sentences):
    for sent in sentences:
        sent = gensim.utils.simple_preprocess(str(sent), deacc=True)
        yield(sent)

data = corpus_large.text.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1])

[['happy', 'new', 'year', 'scotty', 'top', 'diy', 'tool', 'bluetooth', 'scan', 'tool', 'cheap', 'scan', 'tool', 'professional', 'socket', 'set', 'wrench', 'set', 'charging', 'required', 'jump', 'starter', 'battery', 'pack', 'jump', 'starter', 'thing', 'used', 'video', 'common', 'sense', 'camera', 'camera', 'microphone', 'camera', 'tripod', 'computer', 'editing', 'uploading', 'video', 'editing', 'software', 'thumbnail', 'software', 'check', 'tool', 'use', 'highly', 'recommend', 'scotty', 'shirt', 'merch', 'subscribe', 'hit', 'notification', 'bell', 'scotty', 'social', 'facebook', 'instagram', 'twitter']]


>- **building the car trends model**

- **data_ready**: final text processing pipeline with spaCy's `en_core_web_lg` model
- **id2word**: build the dictionary from corpus
- **corpus**: create the corpus **TDF** with Gensim `doc2bow`
- **lda_model**: and finally we build the **LDA**

In [12]:
data_ready = process_words(data_words)
id2word = corpora.Dictionary(data_ready)
corpus = [id2word.doc2bow(text) for text in data_ready]
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=3,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=10,
                                            passes=10,
                                            alpha='symmetric',
                                            iterations=100,
                                            per_word_topics=True)

pprint(lda_model.print_topics())

[(0,
  '0.033*"people" + 0.031*"want" + 0.029*"drive" + 0.019*"year" + 0.014*"old" '
  '+ 0.014*"suv" + 0.012*"new" + 0.011*"road" + 0.010*"vehicle" + 0.010*"lot"'),
 (1,
  '0.020*"buy" + 0.015*"thing" + 0.011*"good" + 0.011*"start" + 0.011*"engine" '
  '+ 0.009*"vehicle" + 0.009*"need" + 0.008*"work" + 0.008*"stop" + '
  '0.008*"know"'),
 (2,
  '0.029*"look" + 0.021*"hate" + 0.020*"new" + 0.019*"feel" + 0.015*"love" + '
  '0.012*"big" + 0.011*"think" + 0.011*"truck" + 0.010*"design" + '
  '0.009*"headlight"')]


In [13]:
topic_keywords = format_sent_topics(lda_model, corpus, data_ready, n_topics=3)
dominant_topic = topic_keywords.reset_index()
dominant_topic.columns = [
    'doc_num', 'dominant_topic', 'topic_perc_contrib', 'key_words', 'text']

dominant_topic.to_csv('dominant_3topic_model.csv')
dominant_topic.head(5)

Unnamed: 0,doc_num,dominant_topic,topic_perc_contrib,key_words,text
0,0,1.0,0.608,"buy, thing, good, start, engine, vehicle, need...","[happy, new, year, scotty, diy, tool, bluetoot..."
1,1,1.0,0.815,"buy, thing, good, start, engine, vehicle, need...","[recently, buy, camry, automatic, mile, good, ..."
2,2,0.0,0.482,"people, want, drive, year, old, suv, new, road...","[dislike, new, grill, happy, new, year, scotty..."
3,3,1.0,0.663,"buy, thing, good, start, engine, vehicle, need...","[think, mile]"
4,4,0.0,0.468,"people, want, drive, year, old, suv, new, road...","[scotty_kilmer, happy, new, year, enjoy, video..."


In [14]:
dominant_topic.describe()

Unnamed: 0,doc_num,dominant_topic,topic_perc_contrib
count,16860.0,16860.0,16860.0
mean,8429.5,0.90516,0.597347
std,4867.207105,0.746916,0.142059
min,0.0,0.0,0.333
25%,4214.75,0.0,0.485
50%,8429.5,1.0,0.576
75%,12644.25,1.0,0.692
max,16859.0,2.0,0.969


In [15]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
pyLDAvis.save_html(vis, '3topic_ldamodel.html')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [16]:
from multiprocessing import cpu_count

model = gensim.models.Word2Vec(size=100, 
                               window=5, 
                               min_count=5, 
                               sample=1e-4, 
                               seed=1,
                               negative=5,
                               workers=cpu_count(), sg=1)

model.build_vocab(data_ready)
print('model vocab length: ', len(model.wv.vocab))
model.train(data_ready, total_examples=len(data_ready), epochs=400)
# model.wv.save_word2vec_format(os.path.join('saved_models', 'model.bin'))

model vocab length:  4380


(48344830, 94703600)

In [17]:
model.wv.most_similar("voice_command", topn=20)

[('gesture', 0.4699676036834717),
 ('commonly', 0.43160951137542725),
 ('gesture_control', 0.4015405774116516),
 ('trek', 0.38776665925979614),
 ('sex', 0.38675397634506226),
 ('indicate', 0.3791407644748688),
 ('difficulty', 0.37336450815200806),
 ('laggy', 0.3718671202659607),
 ('engage', 0.3712344765663147),
 ('transfer', 0.3711259961128235),
 ('command', 0.36593523621559143),
 ('volume', 0.36405009031295776),
 ('chemistry', 0.363465279340744),
 ('basic', 0.3623785078525543),
 ('idrive', 0.36224907636642456),
 ('frustrating', 0.3618530035018921),
 ('incident', 0.36155545711517334),
 ('gps', 0.36019399762153625),
 ('site', 0.3586016893386841),
 ('spread', 0.3554077446460724)]

In [18]:
model.wv.most_similar(positive=['play_music', 'command'], negative=['unwanted'], topn=20)

[('attractive', 0.4177040457725525),
 ('leukemia', 0.3948470652103424),
 ('robot', 0.3864787817001343),
 ('bluetooth', 0.3769947588443756),
 ('medium', 0.3733176290988922),
 ('phone', 0.37244296073913574),
 ('usb', 0.36586570739746094),
 ('entry', 0.36483582854270935),
 ('vibrate', 0.36271965503692627),
 ('fiesta', 0.3607165515422821),
 ('navigation', 0.35931235551834106),
 ('device', 0.355599045753479),
 ('rename', 0.35479024052619934),
 ('wooden', 0.3541204035282135),
 ('addict', 0.3509850800037384),
 ('plug', 0.3505522608757019),
 ('cell', 0.35046106576919556),
 ('obtain', 0.3447158634662628),
 ('voice_control', 0.342851459980011),
 ('adult', 0.3369237780570984)]

In [19]:
def word_algebra(add=[], subtract=[], topn=10):
    answers = model.wv.most_similar(positive=add, negative=subtract, topn=topn)
    for term, similarity in answers:
        print(term)

In [20]:
# so touch screen and a system and not want
word_algebra(['touch_screen', 'system'], ['want'])

control
physical
infotainment
knob
button
operate
command
screen
touchscreen
eye


In [21]:
word_algebra(['voice_control', 'voice_command', 'interface'], ['difficulty', 'feature'])

leg_room
gesture_control
disappointed
seater
volume
gesture
commonly
citroen
touchpad
literal


In [22]:
# from these words : 'safe' does not match the words in the string
print(model.wv.doesnt_match("simple infotainment safe vehicle screen".split()))

safe


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [23]:
# by replacing screen with 'touch' then touch does not match the words in the string
print(model.wv.doesnt_match("simple infotainment safe vehicle touch".split()))

touch
