# NLP For Regression

In [1]:
import os
import numpy as np
import pandas as pd

from gensim import corpora
from gensim.utils import lemmatize
from gensim.models import LdaModel, Phrases
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, stem_text


from collections import defaultdict

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

Let's bring in our guitar JSON parser:

In [2]:
from Axe_Object import Axe

And instantiate our rock n roll guitar objects:

In [3]:
# Skip hidden files (e.g. macOS metadata such as .DS_Store).
# os.listdir returns entries in arbitrary order; sort them so the order of
# `axes` -- and therefore the seeded train/test split and any persisted
# per-row features -- is the same on every run and every machine.
filenames = sorted(name for name in os.listdir('axe_specs/') if not name.startswith('.'))

In [21]:
# Build the list of guitars used for modelling, one Axe per spec file.
axes = []
for filename in filenames:
    try:
        this_axe = Axe(filename)
        # Listings whose title contains "lot of" (presumably multi-item lots) are excluded.
        if "LOT OF" in this_axe.title.upper():
            continue
        # Keep only prices strictly between 100 and 1750 USD.
        if not 100 < this_axe.price_usd < 1750:
            continue
        # Exclude instruments with a known string count below five; a missing
        # (falsy) string_config is kept.
        if this_axe.string_config and this_axe.string_config < 5:
            continue
        axes.append(this_axe)
    except ValueError:
        # A ValueError raised while building or inspecting a listing discards it.
        continue

Check out all these text fields we can turn into a big nasty stew of NLP goodness:

In [22]:
axes[58].title

'Ibanez RG321MH guitar custom paint color. Blue sparkle w/ black pickguard'

In [None]:
axes[58].description 

In [None]:
axes[58].condition_description # Might not exist for all instances

In [None]:
axes[58].subtitle # Might not exist for all instances

In [None]:
axes[58].brand # Might not exist for all instances

In [None]:
axes[58].model # Might not exist for all instances

In [None]:
str(axes[58].year) # Might not exist for all instances

In [None]:
axes[58].material # Might not exist for all instances

### Here we'll do some LDA to assign topic-based weights to each guitar

In [23]:
def assemble_guitar_document(axe):
    """Concatenate a guitar's text fields into one space-separated document.

    Parameters
    ----------
    axe : object
        Must expose ``title`` (a string) plus the optional attributes ``year``,
        ``material``, ``model``, ``brand``, ``subtitle``,
        ``condition_description`` and ``description``; optional attributes
        that are ``None`` are skipped.

    Returns
    -------
    str
        The title, the year written twice, then the remaining fields in the
        order listed above. Every field except ``description`` is followed by
        a single space, so the result ends with a space when there is no
        description.
    """
    parts = [axe.title + ' ']

    # The year is included twice so it carries double weight in the bag of words.
    if axe.year is not None:
        parts.append((str(axe.year) + ' ') * 2)

    optional_fields = (
        axe.material,
        axe.model,
        axe.brand,
        axe.subtitle,
        axe.condition_description,
    )
    # Identity checks against None: empty strings and zeros are still included,
    # and objects with unusual __eq__/__ne__ implementations cannot be skipped
    # by accident.
    parts.extend(field + ' ' for field in optional_fields if field is not None)

    # The description comes last and gets no trailing space.
    if axe.description is not None:
        parts.append(axe.description)

    return ''.join(parts)

In [24]:
# One lower-cased text document per guitar, in the same order as `axes`.
raw_corpus = [document.lower() for document in map(assemble_guitar_document, axes)]

In [25]:
len(raw_corpus)

12468

**Text Pre-Processing:**

In [26]:
# Turn every raw document into a list of stemmed tokens.
corpus = []
for doc in raw_corpus:
    # Punctuation out first, then short tokens, then collapse the leftover whitespace.
    doc = strip_punctuation(doc)
    doc = strip_short(doc)
    doc = strip_multiple_whitespaces(doc)
    # Stop words are removed before stemming; the stemmed text is split on whitespace.
    tokens = stem_text(remove_stopwords(doc)).split()
    corpus.append(tokens)

In [27]:
# Count every token across the whole corpus, then drop tokens seen only once.
frequency = defaultdict(int)
for word in (token for tokens in corpus for token in tokens):
    frequency[word] += 1

corpus = [
    [token for token in tokens if frequency[token] > 1]
    for tokens in corpus
]

**Convert to Bag of Words:**

In [28]:
# Map each token to an integer id, then rewrite every document as bag-of-words.
# NOTE: `corpus` is rebound from token lists to bag-of-words here, so re-run the
# pre-processing cells before running this cell a second time.
dictionary = corpora.Dictionary(corpus)
corpus = list(map(dictionary.doc2bow, corpus))

### LDA Transformation and Getting Weights:

In [29]:
def get_topic_weights_df(num_topics, corpus, random_state=None):
    """Fit an LDA model on ``corpus`` and return each document's topic weights.

    The model is built with the module-level ``dictionary`` as its id-to-word
    mapping and ``alpha='auto'``.

    Parameters
    ----------
    num_topics : int
        Number of topics to fit; also the number of columns in the result.
    corpus : list
        Bag-of-words documents (the output of ``dictionary.doc2bow``). Must
        support ``len()``.
    random_state : int or None, optional
        Forwarded to ``LdaModel`` so a run can be reproduced. The default
        ``None`` leaves the model unseeded, as before.

    Returns
    -------
    pandas.DataFrame
        One row per document (in corpus order) and one column per topic id
        ``0 .. num_topics - 1``. Topics that the model does not report for a
        document are left at ``0.0``.
    """
    lda = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        alpha='auto',
        random_state=random_state,
    )

    # Size the matrix from num_topics directly, and fill one float array instead
    # of building and concatenating a pandas Series per document.
    weights = np.zeros((len(corpus), num_topics))
    for row, doc_topics in enumerate(lda[corpus]):
        # Each entry is a (topic_id, weight) pair for this document.
        for topic_id, weight in doc_topics:
            weights[row, topic_id] = weight

    return pd.DataFrame(weights)

### Multiple Regression Baselining

Here I set up a simple linear regression model to tune the number of topics; the resulting topic weights will feed a better model later.

In [15]:
# The features are the 820-topic LDA weights. That frame is normally created in
# a later cell, so reuse it when the kernel already has it and build it here
# otherwise; this keeps the cell from raising NameError on a fresh kernel.
try:
    X = topic_weights_820
except NameError:
    topic_weights_820 = get_topic_weights_df(820, corpus)
    X = topic_weights_820

In [16]:
# Regression target: each guitar's USD price, in the same order as `axes`.
# `y` and `prices` are two names for the same list.
y = prices = [guitar.price_usd for guitar in axes]

In [17]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

In [18]:
# Fit the baseline linear model on the training rows only.
lin_reg = LinearRegression().fit(X_train, y_train)

In [19]:
# Predicted prices for the held-out rows.
y_preds = lin_reg.predict(X_test)

In [20]:
np.sqrt(mean_squared_error(y_test, y_preds))

536.6925031423226

In [None]:
# Naive baseline: predict the mean price for every held-out guitar and report its RMSE.
# NOTE(review): the mean is taken over all of `axes`, so it includes the test rows.
price_mean = np.mean([guitar.price_usd for guitar in axes])
np.sqrt(mean_squared_error(y_test, [price_mean] * len(y_test)))

Incredibly, the text features actually reduce error relative to the mean-price baseline. But how many topics are best?

### Searching for ideal topic number

In [30]:
def get_ideal_num_topics(range_object):
    """Return the baseline regression's test RMSE for each candidate topic count.

    For every value in ``range_object`` an LDA topic-weight frame is built from
    the module-level ``corpus``, split 80/20 with a fixed seed against the
    module-level target ``y``, and scored with a ``LinearRegression``.

    Parameters
    ----------
    range_object : iterable of int
        Candidate numbers of topics, e.g. ``range(10, 1010, 10)``.

    Returns
    -------
    list of float
        One RMSE per candidate, in the same order as ``range_object``. The
        caller picks the best topic count from this list (e.g. via ``min``).
    """
    errors = []
    for num_topics in range_object:
        # get_topic_weights_df needs the bag-of-words corpus as its second
        # argument; calling it with only the topic count raises TypeError.
        X = get_topic_weights_df(num_topics, corpus)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
        lin_reg = LinearRegression().fit(X_train, y_train)
        y_preds = lin_reg.predict(X_test)
        errors.append(np.sqrt(mean_squared_error(y_test, y_preds)))
    return errors

In [None]:
# errors = get_ideal_num_topics(range(10,1010,10))

In [None]:
# min(errors) # At 820 topics

In [31]:
# Build the final features with 820 topics, the count noted above as giving the lowest error.
topic_weights_820 = get_topic_weights_df(820, corpus)

  diff = np.log(self.expElogbeta)


Persisting the topic-model weights to disk:

In [32]:
# Write with the '.csv' extension so the file name matches the one the reload cell reads.
topic_weights_820.to_csv('topic_weights.csv')

In [None]:
# Reload the persisted weights; index_col=0 uses the first CSV column as the row index.
topic_weights = pd.read_csv('topic_weights.csv', index_col=0)