In [1]:
import csv
import pandas as pd
import numpy
import re
from nltk.corpus import stopwords
import nltk
from nltk import pos_tag, word_tokenize
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer


In [2]:
df = pd.read_csv('edmunds_comments.csv', encoding='UTF-8')

In [3]:
df.head()

Unnamed: 0,date,username,post,quotes
0,b'September 7',b'dino001',b'If they keep it around in next four-five yea...,"b""The lease rate is the factor that stops me c..."
1,b'September 7',b'circlew',b'2018 430i Gran Coupe',"b""show previous quotes\n\n\nThe Stinger is too..."
2,b'September 8',b'qbrozen',"b""The lease rate is the factor that stops me c...",b''
3,b'September 9',b'FlightNurse2',"b""circlew said:\nThe lease rate is the factor ...","b'Again, the local dealer are hopeless at best..."
4,b'September 9',b'roadburner',"b""'17 F150 Crew 2.7; '67 Coronet R/T; '14 Town...","b'Thanks, but I am more than aware of the dist..."


In [4]:
df['post'].iloc[0].lstrip('b')

'\'If they keep it around in next four-five years, I think I will make myself to put on a hazmat suit and visit our friendly Kia dealer (oh, boy are they horrible here, or what) to check it out. It\\\'s got a lot of good stuff, but seems like they are still behind on a few things, such as relationship between power and gas mileage (e.g. BMW 340/440 has similar performance, but much better gas mileage as a daily driver), but with appropriate price difference, those objections and shortcomings are not insurmountable. Biggest thing will be the "first contact" with a sales person. Hope it won\\\'t start from "are you buying it today?", "what can I do to make you take it home?", or "let me wash your BMW - oh, I can\\\'t find the keys", "How about I show you the deal - square one, two, three, four".\''

In [5]:
for c in (df.columns.values):
    df[c] = df[c].astype(str).str.lstrip('b')

In [6]:
df.head()

Unnamed: 0,date,username,post,quotes
0,'September 7','dino001','If they keep it around in next four-five year...,"""The lease rate is the factor that stops me co..."
1,'September 7','circlew','2018 430i Gran Coupe',"""show previous quotes\n\n\nThe Stinger is too ..."
2,'September 8','qbrozen',"""The lease rate is the factor that stops me co...",''
3,'September 9','FlightNurse2',"""circlew said:\nThe lease rate is the factor t...","'Again, the local dealer are hopeless at best(..."
4,'September 9','roadburner',"""'17 F150 Crew 2.7; '67 Coronet R/T; '14 Town&...","'Thanks, but I am more than aware of the disti..."


In [7]:
model_df = pd.read_csv('models.csv', encoding='ISO-8859-1')
model_df.columns = ['Replace', 'Search']
model_df['Replace'] = model_df['Replace'].str.lower()
model_df['Search'] = model_df['Search'].str.lower()
model_dict = model_df.set_index('Search')['Replace'].to_dict()

In [8]:
## making a clean message column 
StopWordslist = stopwords.words("english")

def string_process(s):
    s=str(s)
    lower = s.lower()
    for key, value in model_dict.items():
        lower= lower.replace(key, value)
    words = lower.split()
    refined = []
    for i in words:
        if i not in StopWordslist:
            refined.append(re.sub("[^\w]+", "", i))
    sentence = ' '.join([word for word in refined])
    return(sentence)
df['clean_message']=df['post'].map(string_process)

In [9]:
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems

In [10]:
vect = TfidfVectorizer(min_df = 5, tokenizer=tokenize).fit(df['clean_message']) ## Using our custom Tokenizer function using Stemming

In [11]:
len(vect.get_feature_names())  ## Total number of unique vocab after removing not-so-useful words

6437

In [12]:
X_vectorized = vect.transform(df['clean_message'])

In [13]:
X_vectorized.toarray

<bound method _cs_matrix.toarray of <12552x6437 sparse matrix of type '<class 'numpy.float64'>'
	with 455239 stored elements in Compressed Sparse Row format>>

In [14]:
#vect.get_feature_names()

In [15]:
X_vectorized.shape

(12552, 6437)

In [16]:
doc =0
feature_index = X_vectorized[doc,:].nonzero()[1]
feature_names = vect.get_feature_names()

In [17]:
feature_index

array([6390, 6317, 6250, 6204, 6161, 5967, 5841, 5782, 5762, 5757, 5645,
       5549, 5505, 5448, 5411, 5382, 5169, 5141, 5045, 4963, 4708, 4548,
       4457, 4409, 4274, 4267, 4133, 4091, 4075, 4038, 3939, 3855, 3736,
       3586, 3509, 3454, 3426, 3314, 3312, 3308, 3231, 3047, 2993, 2979,
       2977, 2958, 2909, 2739, 2731, 2640, 2601, 2575, 2491, 2480, 2189,
       2098, 1953, 1831, 1830, 1801, 1647, 1438, 1322, 1271, 1172, 1109,
       1052, 1039, 1014,  820,  806,  797])

In [18]:

tfidf_scores = zip(feature_index, [X_vectorized[doc, x] for x in feature_index])

In [19]:
di = {}
for w, s in [(feature_names[i],s) for (i, s) in tfidf_scores]:
    di[w] = s

In [20]:
#di = sorted(di, lambda x )
sorted_by_value = sorted(di.items(), key=lambda kv: kv[1], reverse=True)

In [21]:
di

{'year': 0.06534336970658151,
 'wont': 0.1168946885893826,
 'what': 0.21094820641399484,
 'wash': 0.15267121923451693,
 'visit': 0.1288079426514008,
 'two': 0.08916661551078542,
 'today': 0.09703021528424831,
 'three': 0.10984345878967453,
 'think': 0.06231825488991268,
 'thing': 0.15418337311206495,
 'take': 0.07843622144030356,
 'suit': 0.14096746490074283,
 'stuff': 0.11439753687040602,
 'still': 0.07625116822597522,
 'start': 0.09072057857348018,
 'squar': 0.16494179475876852,
 'similar': 0.10673615714922538,
 'show': 0.10244009368335573,
 'seem': 0.08028566055159515,
 'sale': 0.09321112796197703,
 'relationship': 0.15652946795627698,
 'put': 0.08742064840068924,
 'price': 0.07032906161272491,
 'power': 0.08984002553214986,
 'person': 0.09688202184431895,
 'perform': 0.08042716033490499,
 'out': 0.10200292111552793,
 'one': 0.057524879510272714,
 'oh': 0.244596320409519,
 'object': 0.1399389847169012,
 'next': 0.09128899436778906,
 'much': 0.06964551686909629,
 'mileag': 0.21377881

## Analysis of BMW and Lexus

In [22]:
df["post_clean"] = df.post.apply(lambda x:re.sub(r'[^\w\s]', ' ', x.lower()))
df["post_clean"] = df.post_clean.apply(lambda x:re.sub(r'(\\n+)', ' ', x))
df["post_clean"] = df.post_clean.apply(lambda x:re.sub(r'^b[\'\"\'"\"''\s]', '', x.lower()))
df["post_clean"] = df.post_clean.apply(lambda x:re.sub(r'\s+', ' ', x))

In [23]:
stop = set(stopwords.words('english'))
punc = string.punctuation
df['post_clean'] = df.post_clean.apply(lambda x: [word for word in word_tokenize(x) if word not in stop])
df['post_clean'] = df['post_clean'].apply(lambda x: [word for word in x if word not in punc])

In [24]:
df.head()

Unnamed: 0,date,username,post,quotes,clean_message,post_clean
0,'September 7','dino001','If they keep it around in next four-five year...,"""The lease rate is the factor that stops me co...",if keep around next fourfive years think make ...,"[keep, around, next, four, five, years, think,..."
1,'September 7','circlew','2018 430i Gran Coupe',"""show previous quotes\n\n\nThe Stinger is too ...",2018 430i gran coupe,"[2018, 430i, gran, coupe]"
2,'September 8','qbrozen',"""The lease rate is the factor that stops me co...",'',the lease rate factor stops cold taking leapnn...,"[lease, rate, factor, stops, cold, taking, lea..."
3,'September 9','FlightNurse2',"""circlew said:\nThe lease rate is the factor t...","'Again, the local dealer are hopeless at best(...",circlew saidnthe lease rate factor stops cold ...,"[circlew, said, nthe, lease, rate, factor, sto..."
4,'September 9','roadburner',"""'17 F150 Crew 2.7; '67 Coronet R/T; '14 Town&...","'Thanks, but I am more than aware of the disti...",17 ford crew 27 67 coronet rt 14 towncountry l...,"[17, f150, crew, 2, 7, 67, coronet, r, 14, tow..."


In [25]:
def classifyBMWLex(po):
    """ Classigying dataframe observations containing/not containing BMW"""
    r = 0
    for w in po:
        if((w == 'bmw') | (w == 'lexus')):
            r= 1
    return r        

In [26]:
df['BMW or Lexus post'] = df['post_clean'].map(classifyBMWLex)

In [27]:
df_bmw_lex = df[df['BMW or Lexus post'] == 1]

In [28]:

feature_names = vect.get_feature_names()
def get_doc_best_attributes(docnum):
    """Function to get the top attributes from each post"""
    doc = docnum
    #print(doc)
    feature_index = X_vectorized[doc,:].nonzero()[1]
    tfidf_scores = zip(feature_index, [X_vectorized[doc, x] for x in feature_index])
    di = {}
    for w, s in [(feature_names[i],s) for (i, s) in tfidf_scores]:
        di[w] = s
    if(len(di) >= 3):
        sorted_by_value = sorted(di.items(), key=lambda kv: kv[1], reverse=True)
        sorted_by_value = sorted_by_value[:3]
        l = [sorted_by_value[:3][0][0],sorted_by_value[:3][1][0], sorted_by_value[:3][2][0]]
    else:
        l = []
    return l

In [29]:
## testing the functinon
get_doc_best_attributes(1000)

['3g', 'wifi', 'mmi']

In [30]:
df_bmw_lex['Top_Attributes'] = df_bmw_lex.index.map(get_doc_best_attributes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [31]:
df_bmw_lex.head()

Unnamed: 0,date,username,post,quotes,clean_message,post_clean,BMW or Lexus post,Top_Attributes
0,'September 7','dino001','If they keep it around in next four-five year...,"""The lease rate is the factor that stops me co...",if keep around next fourfive years think make ...,"[keep, around, next, four, five, years, think,...",1,"[oh, mileag, what]"
8,'September 24','benjaminhf',"""roadburner said:\nAgain, the local dealer are...",'If I do manage to upgrade to a TLX it would h...,roadburner saidnagain local dealer hopeless be...,"[roadburner, said, nagain, local, dealer, hope...",1,"[hyundai, dealer, look]"
90,'July 17','benjaminh',"""Alex on Autos, on the other hand, thinks the ...","""Alex on Autos, on the other hand, thinks the ...",alex autos hand thinks tchrysler aspec awd clo...,"[alex, autos, hand, thinks, tlx, spec, awd, cl...",1,"[mpg, calcul, 24]"
98,'July 19','benjaminh',"""benjaminh said:\nAlex on Autos, on the other ...",'Alfa: 11.8 seconds at 122 mph through the qua...,benjaminh saidnalex autos hand thinks tchrysle...,"[benjaminh, said, nalex, autos, hand, thinks, ...",1,"[tire, mpg, calcul]"
100,'July 19','stickguy',"""I have enjoyed listening to people takes on t...","""show previous quotes\n\n\n\nBen in the BMW wo...",i enjoyed listening people takes honda v tchry...,"[enjoyed, listening, people, takes, accord, v,...",1,"[x2, 330i, dealer]"


In [32]:
all_attri_bmw_lexv = set(df_bmw_lex['Top_Attributes'].sum())

In [33]:
(all_attri_bmw_lexv)

{'mercedesbenz',
 'confus',
 'till',
 'net',
 'readi',
 'nonnegoti',
 'everi',
 'broken',
 'mirror',
 'scotch',
 'zdx',
 'leatherett',
 'comput',
 'unbias',
 't',
 'bar',
 'bundl',
 'surmis',
 'couch',
 'think',
 'seller',
 'three',
 'arguabl',
 'fair',
 'ft',
 'perceiv',
 'ingolstadt',
 'press',
 'othersnni',
 'de',
 'implic',
 'chassi',
 'mess',
 'wasnt',
 '375',
 'grad',
 'bmw',
 'east',
 'connissant',
 'panododg',
 'bill',
 'accustom',
 'testdriv',
 'segmentnni',
 'someon',
 'candl',
 'brand',
 'effici',
 'personnel',
 'simplic',
 'n54',
 'hid',
 'lean',
 'open',
 'across',
 'work',
 'gadgetri',
 'robot',
 'surg',
 'comprehens',
 'rock',
 'opportun',
 'motion',
 'wise',
 'feet',
 'complaint',
 'hong',
 'me',
 'consid',
 'upgrad',
 'kw',
 'saidnwel',
 'youth',
 'moreso',
 'interview',
 'fiber',
 'doe',
 '190',
 'lawyer',
 'prior',
 'acrua',
 'downsiz',
 'virginia',
 'penalti',
 'enthusiast',
 'clutch',
 'x5d',
 'plant',
 'mind',
 'thrill',
 'rocket',
 'region',
 'decad',
 'shall',
 

### All the different product attributes found with BMW and Lexus