In [176]:
import numpy as np
import pandas as pd

import nltk, string, os
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist

from sklearn.feature_extraction.text import TfidfVectorizer
np.random.seed(0)

In [4]:
from Axe_Object import Axe

In [5]:
# Spec files to load, skipping hidden entries (e.g. macOS ".DS_Store").
# os.listdir returns names in arbitrary order, so sort them to make the
# order of everything built from this list reproducible between runs.
filenames = sorted(name for name in os.listdir('axe_specs/') if not name.startswith('.'))

In [6]:
# Filter thresholds (both price bounds are exclusive).
MIN_PRICE_USD = 110
MAX_PRICE_USD = 750
MIN_STRING_COUNT = 5   # listings reporting fewer strings than this are dropped
MAX_YEAR = 2019        # listings reporting a later year are dropped
TARGET_MARKET = 'EBAY-US'

axes = []
skipped_files = []  # files that raised ValueError while being loaded/filtered
for filename in filenames:
    try:
        this_axe = Axe(filename)
        # Drop multi-item "lot of ..." listings.
        if "LOT OF" in this_axe.title.upper():
            continue
        price = this_axe.price_usd
        if not (price > MIN_PRICE_USD and price < MAX_PRICE_USD):
            continue
        # A missing/zero string_config or year is falsy and therefore kept.
        if this_axe.string_config and this_axe.string_config < MIN_STRING_COUNT:
            continue
        if this_axe.market != TARGET_MARKET:
            continue
        if this_axe.year and this_axe.year > MAX_YEAR:
            continue
        axes.append(this_axe)
    except ValueError:
        # Keep going on bad files, but remember which ones were dropped
        # instead of discarding the failure silently.
        skipped_files.append(filename)
len(axes)

9557

In [115]:
def assemble_guitar_document(axe):
    """Concatenate the text fields of a listing into a single document string.

    The title always comes first, followed by a space. Each optional field
    that is not ``None`` is then appended in this order, each followed by a
    space: year (stringified and written twice), material, model, brand,
    subtitle, condition_description. The description, if present, is
    appended last with no trailing space.

    Args:
        axe: object exposing ``title``, ``year``, ``material``, ``model``,
            ``brand``, ``subtitle``, ``condition_description`` and
            ``description`` attributes.

    Returns:
        str: the assembled document.
    """
    parts = [axe.title + ' ']
    # Identity checks against None (rather than `!= None`) so that values with
    # unusual equality semantics cannot be misclassified as missing.
    if axe.year is not None:
        # The year is written twice, doubling its term count in the document.
        parts.append((str(axe.year) + ' ') * 2)
    for field in (axe.material, axe.model, axe.brand,
                  axe.subtitle, axe.condition_description):
        if field is not None:
            parts.append(field + ' ')
    if axe.description is not None:
        parts.append(axe.description)
    return ''.join(parts)

In [116]:
# One lowercased document string per retained listing, in the order of `axes`.
raw_corpus = list(map(str.lower, map(assemble_guitar_document, axes)))

In [169]:
def process_doc(doc):
    """Tokenize a document and return its filtered, lowercased bigrams.

    The text is split with ``nltk.word_tokenize`` and every pair of adjacent
    tokens is examined. A pair is kept when either

    * the first token is not in ``stopwords_list`` and the second token is
      not found in ``string.punctuation``, or
    * the second token is not in ``stopwords_list`` and the first token is
      not found in ``string.punctuation``.

    Reads the module-level ``stopwords_list`` at call time, so that name must
    be defined before this function is invoked.

    Args:
        doc (str): the document text.

    Returns:
        list[tuple[str, str]]: kept bigrams, lowercased, in document order.
    """
    tokens = nltk.word_tokenize(doc)
    # Build the lookup set once per document: membership tests on the raw
    # list would rescan it for every token.
    stop_words = set(stopwords_list)
    bigrams = []
    # zip stops at the last full pair, so no IndexError handling is needed
    # (and unrelated IndexErrors can no longer be swallowed by accident).
    for first_raw, second_raw in zip(tokens, tokens[1:]):
        first, second = first_raw.lower(), second_raw.lower()
        # NOTE: `in string.punctuation` is a substring test on a str, so a
        # multi-character token such as "()" also counts as punctuation,
        # while "..." does not. Kept as-is to preserve existing results.
        if first not in stop_words and second_raw not in string.punctuation:
            bigrams.append((first, second))
        elif second not in stop_words and first_raw not in string.punctuation:
            bigrams.append((first, second))
    return bigrams

In [182]:
# Tokens ignored when building bigrams: NLTK's English stopwords, every single
# punctuation character, and a few extra tokens listed explicitly below.
extra_stop_tokens = ["''", '""', '...', '``', ",", ".", ":", "'s", "--","’"]
stopwords_list = [*stopwords.words('english'), *string.punctuation, *extra_stop_tokens]

In [183]:
# Bigram list for every document in the corpus, in corpus order.
processed_data = [process_doc(document) for document in raw_corpus]

In [184]:
# Distinct bigrams across the whole corpus.
total_vocab = set().union(*processed_data)
len(total_vocab)

262272

In [185]:
# Flatten the per-document bigram lists into one long list (duplicates kept).
docs_concat = [bigram for doc_bigrams in processed_data for bigram in doc_bigrams]

In [186]:
# Count how often each bigram occurs over the whole corpus, then display the
# most frequent ones.
top_bigram_count = 200
articles_freqdist = FreqDist(docs_concat)
articles_freqdist.most_common(top_bigram_count)

[(('electric', 'guitar'), 7632),
 (('the', 'item'), 3459),
 (('this', 'guitar'), 3407),
 (('the', 'guitar'), 3397),
 (('les', 'paul'), 3346),
 (('guitar', 'is'), 2828),
 (('contact', 'us'), 2538),
 (('the', 'neck'), 2266),
 (('made', 'in'), 2035),
 (('please', 'contact'), 2014),
 (('comes', 'with'), 1915),
 (('any', 'questions'), 1862),
 (('gig', 'bag'), 1812),
 (('excellent', 'condition'), 1685),
 (('good', 'condition'), 1680),
 (('the', 'body'), 1480),
 (('very', 'good'), 1393),
 (('hard', 'case'), 1352),
 (('ship', 'to'), 1292),
 (('do', "n't"), 1249),
 (('the', 'original'), 1245),
 (('the', 'buyer'), 1238),
 (('in', 'good'), 1229),
 (('not', 'included'), 1228),
 (('for', 'sale'), 1217),
 (('must', 'be'), 1215),
 (('maple', 'neck'), 1176),
 (('the', 'pictures'), 1172),
 (('in', 'excellent'), 1171),
 (('the', 'back'), 1169),
 (('please', 'note'), 1149),
 (('shipping', 'charges'), 1137),
 (('neck', 'is'), 1136),
 (('guitar', 'with'), 1114),
 (('guitar', 'has'), 1083),
 (('thank', 'you

In [187]:
# TF-IDF vectorizer created with scikit-learn's default parameters.
vectorizer = TfidfVectorizer()