In [40]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.cross_validation import KFold, train_test_split
from BeautifulSoup import BeautifulSoup
from pymongo import MongoClient
from lxml import etree

In [87]:
#Load every review file found in `path` into the MongoDB collection
#reviews.cars, one document per review, then print the final document count.

#Path of data files
path = r'C:\Anaconda\Galvanize\Minimester2-Project\cars\data\2007'
#Directory entries that must not be parsed (the folder containing bad data files)
skip_names = ['Bad']
#Column names stored for each review, keyed by the lower-cased tag they come from
columns = ['Date', 'Author', 'Text', 'Favorite']
tag_to_column = {'date': 'Date', 'author': 'Author',
                 'text': 'Text', 'favorite': 'Favorite'}

#Other initializations
client = MongoClient()
db = client.reviews
collection = db.cars
#Drop data so duplicates aren't loaded
collection.drop()

#loop through files
for name in os.listdir(path):
    #Skip the excluded entries instead of stopping at them: os.listdir
    #returns names in arbitrary order, so a break here would silently drop
    #every file that happens to be listed after the 'Bad' folder
    if name in skip_names:
        continue
    #context manager; only the read needs the file handle open
    with open(os.path.join(path, name), 'r') as fo:
        #read in the data, keeping the line endings
        data = fo.read().splitlines(True)
    #construct etree XML friendly strings: the first line of the file is
    #dropped, the remainder is wrapped in a single <DATA> root element and
    #every '&' is replaced by the word 'and'
    newdata = '<DATA>\n'+''.join(data[1:]).replace('&','and')+'</DATA>'
    #BS automatically transforms ampersands and angle brackets
    #into their escaped variant
    soup = BeautifulSoup(newdata)
    root = etree.fromstring(str(soup),
                    parser=etree.XMLParser(encoding='cp1252'))

    #Build one record per review. Keeping the four fields together (rather
    #than in four parallel lists) means a review that lacks one of the tags
    #gets None for it instead of shifting the fields of every later review.
    records = []
    for doc in root:
        record = dict.fromkeys(columns)
        found = False
        for child in doc:
            column = tag_to_column.get(child.tag.lower())
            if column is not None:
                record[column] = child.text
                found = True
        #elements without any of the known tags contribute nothing
        if found:
            records.append(record)

    #Nothing to insert for a file without reviews
    if not records:
        continue

    df = pd.DataFrame(records, columns=columns)
    df['Vehicle'] = name
    #Insert df into MongoDB, one document per row
    result = collection.insert(df.to_dict('records'))
print(collection.count())
print('Done!')

18903
Done!


In [86]:
#Fit an NMF topic model on the TF-IDF of the review text and print the top
#words of every topic.

#Hyperparameters to tune
n_samples = 2000
n_features = 1000
n_topics = 16
n_top_words = 20
#Seed shared by the random split and the NMF fit so reruns are comparable
random_seed = 42

carsdf = pd.DataFrame(list(collection.find()))
#Select the review text by column name; a positional index into the value
#matrix depends on how pandas happens to order the columns.
#Missing text (None/NaN) becomes '' because the vectorizer needs strings.
cartext = carsdf['Text'].fillna('').values

cartext_train, cartext_test = train_test_split(cartext,
                                               random_state=random_seed)
kf = KFold(cartext_train.shape[0], n_folds=5)
vectorizer = TfidfVectorizer(max_features=n_features, stop_words='english')

#NOTE(review): test_index is never used, so each value printed below is the
#reconstruction error on the fold's own training rows, not a held-out score.
#The vectorizer and nmf that survive the loop are the ones fitted on the
#last fold only - confirm that this is intended.
for train_index, test_index in kf:
    text_tfidf = vectorizer.fit_transform(list(cartext_train[train_index]))
    nmf = NMF(n_components=n_topics,
              random_state=random_seed).fit(text_tfidf)
    print(nmf.reconstruction_err_)

feature_names = vectorizer.get_feature_names()

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #{}".format(topic_idx))
    #indices of the n_top_words largest weights, largest first
    top_word_indices = topic.argsort()[:-n_top_words - 1:-1]
    print(" ".join([feature_names[i] for i in top_word_indices]))
    print('')

99.4457422045
99.4746007562
99.4492008649
99.4610140615
99.440046819
Topic #0
car bought want cars really recommend sports little buy amazing money lot say handles overall perfect makes driving looking price

Topic #1
truck trucks ford chevy cab tundra bed toyota power tacoma towing dodge silverado gmc gm trailer tow v8 4wd size

Topic #2
great looks handles far price drives value handling runs features sound awesome interior job performance acceleration money little design stereo

Topic #3
love just got new bought absolutely ago way driving fell look kids awesome looks color suv purchased drives blue years

Topic #4
mpg highway city driving miles getting fuel mph average 30 trip town 20 25 economy 22 hwy averaging 40 26

Topic #5
drive fun comfortable test fast easy wheel blast snow day quick just work manual lot pleasure looking mazda sporty road

Topic #6
vehicle suv purchased family size vehicles purchase looking recommend road fuel comfort jeep excellent wife luxury impressed gm o