# NLP For Regression

In [None]:
import os
import numpy as np
import pandas as pd

from gensim import corpora
from gensim.models import TfidfModel, LdaModel
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, stem_text

from collections import defaultdict

from sklearn.linear_model import LinearRegression

Let's bring in our guitar JSON parser:

In [2]:
from Axe_Object import Axe

And instantiate our rock n roll guitar objects:

In [3]:
filenames = [name for name in os.listdir('axe_specs/') if not name.startswith('.')] # Ignores hidden files on mac

In [4]:
axes = []
for filename in filenames:
    try:
        this_axe = Axe(filename)
        if "LOT OF" not in this_axe.title.upper() and this_axe.price_usd > 100 and this_axe.price_usd < 5000:
            if this_axe.string_config and this_axe.string_config < 5:
                continue
            axes.append(this_axe)
    except ValueError:
        pass

Check out all these text fields we can turn into a big nasty stew of NLP goodness:

In [5]:
axes[58].title

'1991 Japan Made Aria Pro II MAGNA Series Electric Guitar MA-550 Made in Japan'

In [6]:
axes[58].description 

"A.01 Fender PB62-75US vwh.html Ebay説明文原本 1991 Made Aria Pro II MAGNA Series Electric Guitar MA-550 Made in Japan Item\u3000Number ER-853 Condition Rank A No Reserve Auction! Shipping Free All over the world! USA Canada Buyer:Free shipping Europe Buyer: Free shipping Australia Buyer: Free shipping Brazil and South America buyer:Free shipping. We will ship this guitar No CITES application. We can ship this item within 1 business day After you made a payment. If the package will not able to any trouble. You can receive the full refund. no worry 1991 Made Aria Pro II MAGNA Series Electric Guitar MA-550 Made in Japan You're bidding on a very good working condition 1991 Made Aria Pro II MA-550, which contains very fine sound and rich tones made from good pieces of wood and hardworking craftsmanship. The guitar has slim neck shape with 24th frets type. Body has very slim comfortable shape. This is '90s made in Japan high quality made guitar. Tuner has GOTOH fine tuners. Sound is very nice. B

In [7]:
axes[58].condition_description # Might not exist for all instances

'Condition Rating(80/100)Not come with tremolo arm bar.Guitar has some scratches and dings on the body.(Photo-01)(Photo-02)The guitar is overall in very good working condition.The fret condition also very good(left 80%)(Not has a fret buzzing)Sounds and plays great with not issue.Please confirm pictures for details.'

In [8]:
axes[58].subtitle # Might not exist for all instances

'Japan Made vintage guitar No Reserve with FREE SHIPPING'

In [9]:
axes[58].brand # Might not exist for all instances

'OTHER'

In [10]:
axes[58].model # Might not exist for all instances

'MA-550'

In [11]:
str(axes[58].year) # Might not exist for all instances

'1991'

In [12]:
axes[58].material # Might not exist for all instances

'ALDER'

**Toss Together the Stew**

In [13]:
def assemble_guitar_document(axe):
    document = axe.title + ' '
    if axe.year != None:
        document += (str(axe.year) + ' ')*2
    if axe.material != None:
        document += axe.material + ' '
    if axe.model != None:
        document += axe.model + ' ' 
    if axe.brand != None:
        document += axe.brand + ' '
    if axe.subtitle != None:
        document += axe.subtitle + ' '
    if axe.condition_description != None:
        document += axe.condition_description + ' '
    if axe.description != None:
        document += axe.description
    return document

In [14]:
raw_corpus = [assemble_guitar_document(axe).lower() for axe in axes]

In [15]:
len(raw_corpus)

13134

**Text Pre-Processing:**

In [16]:
corpus = []
for doc in raw_corpus:
    doc = strip_multiple_whitespaces(strip_short(strip_punctuation(doc)))
    corpus.append([word for word in stem_text(remove_stopwords(doc)).split()])

In [17]:
# remove words that appear only once
frequency = defaultdict(int)
for doc in corpus:
    for word in doc:
        frequency[word] += 1
corpus = [[word for word in doc if frequency[word] > 1] for doc in corpus]

**Convert to Bag of Words:**

In [18]:
dictionary = corpora.Dictionary(corpus)

In [19]:
corpus = [dictionary.doc2bow(doc) for doc in corpus]

**Normalize:**

In [28]:
# tfidf = TfidfModel(corpus)

In [29]:
# corpus_tfidf = tfidf[corpus]

**LDA Transformation:**

In [52]:
num_topics = 300

In [25]:
lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

In [26]:
lda.print_topics(300)

[(0,
  '0.075*"black" + 0.054*"vintag" + 0.047*"pickguard" + 0.032*"modifi" + 0.027*"usag" + 0.026*"great" + 0.024*"squir" + 0.023*"pickup" + 0.021*"logo" + 0.020*"white"'),
 (1,
  '0.112*"pr" + 0.062*"smith" + 0.057*"reed" + 0.048*"paul" + 0.039*"guitar" + 0.031*"bird" + 0.030*"mccarti" + 0.029*"condit" + 0.023*"custom" + 0.019*"origin"'),
 (2,
  '0.066*"neck" + 0.036*"mapl" + 0.035*"bridg" + 0.030*"bodi" + 0.030*"pickup" + 0.026*"wai" + 0.024*"tone" + 0.024*"nut" + 0.020*"schaller" + 0.018*"fret"'),
 (3,
  '0.051*"guitar" + 0.031*"string" + 0.030*"schecter" + 0.026*"leather" + 0.019*"damien" + 0.019*"condit" + 0.016*"shini" + 0.014*"electr" + 0.014*"mainten" + 0.014*"black"'),
 (4,
  '0.062*"bar" + 0.054*"whammi" + 0.046*"includ" + 0.046*"area" + 0.036*"ship" + 0.026*"film" + 0.019*"strap" + 0.018*"white" + 0.017*"cover" + 0.016*"larger"'),
 (5,
  '0.113*"acoust" + 0.064*"guitar" + 0.053*"piezo" + 0.040*"electr" + 0.039*"steel" + 0.036*"broken" + 0.033*"fix" + 0.030*"repair" + 0.026*

In [149]:
corpus_lda = lda[corpus_tfidf]

### Linear Regression Baselining

Here I have to set up a naive linear regression model to tune the # of topics we'll be using to feed the better one later.

In [None]:
blank_300 = [0 for k in range(300)]
to_concat = []

for i in range(len(corpus_lda)):
    new_row = pd.DataFrame(data=blank_300)
    for j in corpus_lda[i]:
        new_row[j[0]] = j[1]
    to_concat.append(new_row)

topic_weights = pd.concat(to_concat, ignore_index=True)

In [None]:
topic_weights[:20]