In [195]:
import pandas as pd
import numpy as np
import re
import bs4 as bs

# download NLTK classifiers - these are cached locally on your machine
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

# import ml classifiers
from nltk.tokenize import sent_tokenize # tokenizes sentences
from nltk.stem import PorterStemmer     # parsing/stemmer
from nltk.tag import pos_tag            # parts-of-speech tagging
from nltk.corpus import wordnet         # sentiment scores
from nltk.stem import WordNetLemmatizer # stem and context
from nltk.corpus import stopwords       # stopwords
from nltk.util import ngrams            # ngram iterator
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk import tokenize as tok

# import word2vec
import gensim
from gensim.test.utils import datapath
from gensim import utils
from gensim.models import Word2Vec
from gensim import corpora, models, similarities, downloader



df = pd.read_csv("data/Volvo_edmunds_10yrs.csv",lineterminator='\n').iloc[:,1:]
df['Review_Date'] = pd.to_datetime(df['Review_Date'],errors='coerce')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sixumeng/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sixumeng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sixumeng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sixumeng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [196]:
# complaint_df = df[df["Customer_Rating"]<3]
# complaint_df = df.copy()
complaint_df = df[df["Customer_Rating"]>3]


In [197]:
import plotly.express as px
fig = px.histogram(complaint_df, x='Review_Date', template='plotly_white', title='Complaint counts by date')
fig.update_xaxes(categoryorder='category descending', title='Date').update_yaxes(title='Number of complaints')
fig.show()

In [198]:
fig = px.histogram(complaint_df, x='Vehicle_model', template='plotly_white', title='Complaint counts by company')
fig.update_xaxes(categoryorder='total descending').update_yaxes(title='Number of complaints')
fig.show()

In [199]:
complaint_df["Words_clipped"] = complaint_df["Review"].str.split("/").str.len()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [200]:
complaint_df.head()

Unnamed: 0,Vehicle_model,Vehicle_Year,Vehicle_Rating,Review_Date,Author_Name,Vehicle_Name,Helpful_weight,Review_Title,Customer_Rating,Review,Words_clipped
0,C30,2011,4.8,2011-06-12,califas,T5 2dr Hatchback (2.5L 5cyl Turbo 6M),23 of 23 people found this review helpful,Great Driving Experience,4.88,"In evaluating a car, first you define the miss...",1
1,C30,2011,4.8,2010-10-17,Lauren,T5 2dr Hatchback (2.5L 5cyl Turbo 6M),14 of 14 people found this review helpful,Forget Mini check out Volvo,4.88,"I thought I loved my 2007 Mini S convertible, ...",1
2,C30,2011,4.8,2011-09-30,jersite,T5 2dr Hatchback (2.5L 5cyl Turbo 6M),11 of 11 people found this review helpful,Great Car!,4.75,Have had my 2011 Volvo C30 for a little over a...,1
3,C30,2011,4.8,2010-06-04,Robert James G,T5 2dr Hatchback (2.5L 5cyl Turbo 6M),10 of 10 people found this review helpful,Get it for what it is: a premium hatch.,4.75,I've read plenty of reviews stating how this c...,1
4,C30,2011,4.8,2011-08-18,rsoldier97,R-Design 2dr Hatchback (2.5L 5cyl Turbo 6M),10 of 10 people found this review helpful,HOT!! Volvo C30 R-Design,4.5,I test drove several cars to include the Volks...,1


In [201]:
# Complaints by company & date
fig = px.histogram(complaint_df, x='Review_Date', template='plotly_white', title='Complaint counts by date'
                   , color='Vehicle_model', nbins=6, log_y=True, barmode='group')
fig.update_xaxes(categoryorder='category descending', title='Date').update_yaxes(title='Number of complaints')
fig.show()

In [284]:
eng_stopwords = set(stopwords.words("english"))
eng_stopwords.add("volvo")

In [309]:
ps = PorterStemmer()
wnl = WordNetLemmatizer()
eng_stopwords = set(stopwords.words("english"))
for e in ["volvo","car","xc","vehicle","bmw","audi"]:
    eng_stopwords.add(e)


def review_cleaner(review, lemmatize=True, stem=False):
    if lemmatize == True and stem == True:
        raise RuntimeError("May not pass both lemmatize and stem flags")

    #1. Remove HTML tags
    review = bs.BeautifulSoup(review).text    

    #2. Use regex to find emoticons
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', review)

    #3. Remove punctuation
    review = re.sub("[^a-zA-Z]", " ",review)

    #4. Tokenize into words (all lower case)
    review = review.lower().split()

    #5. Remove stopwords, Lemmatize, Stem
    clean_review=[]
    for word in review:
        if word not in eng_stopwords:
            if lemmatize is True:
                word=wnl.lemmatize(word)
            elif stem is True:
                if word == 'oed':
                    continue
                word=ps.stem(word)
            clean_review.append(word)
            

    #6. Join the review to one sentence
    review_processed = ' '.join(clean_review)
    
    return review_processed

In [310]:
complaint_df["Review"] = complaint_df["Review"].apply(review_cleaner)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [305]:
df["Review"][0]

"In evaluating a car, first you define the mission.\nNo car can be all things.\nIf you want a Corvette, you understand that it can't have the gas mileage of a Prius. \n\nThis is the T5 C30 with 17 in wheels:\n\nA small, nimble car with an appealing and distinctive aesthetic that coddles the driver with excellent ride dynamics and great interior design.\n \nIt is not trying to be a track car.\nIt is 2+2 GT road car.\nIt gets you through your driving day with a minimum of fatigue and a maximum of bliss.\nIt has the quickness and agility to squeeze through traffic, yet still has a cabin that is quiet and comfortable.\nPlus, it feels like quality, it is nice to look at, and it is just fun to drive. "

In [300]:
complaint_df["Review"][2]

'c little month overall really love car styling amazing handling great seat comfortable ever sat heated seat huge bonus great fun drive high point bluetooth work amazingly well ipod integration great visibility drawback minor best huge oversized key mileage difficulty accessing rear seat said would recommend c everyone'

In [312]:
##############

In [313]:
from nltk.corpus import stopwords
stoplist = stopwords.words('english') + ['though']
from sklearn.feature_extraction.text import CountVectorizer
c_vec = CountVectorizer(stop_words=stoplist, ngram_range=(2,3))
# matrix of ngrams
ngrams = c_vec.fit_transform(complaint_df['Review'])
# count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)
# list of ngrams
vocab = c_vec.vocabulary_
df_ngram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'bigram/trigram'})

In [314]:
'''
Remove very rare and very common words:
- words appearing less than 15 times
- words appearing in more than 10% of all documents
'''
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)

In [315]:
df_ngram

Unnamed: 0,frequency,bigram/trigram
0,106,safety feature
1,93,gas mileage
2,70,fun drive
3,54,back seat
4,50,front seat
...,...,...
92646,1,ability change color
92647,1,abandoned leasing done
92648,1,abandoned leasing
92649,1,aaa rating hudson


In [316]:
'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [317]:
from textblob import TextBlob
df_ngram['polarity'] = df_ngram['bigram/trigram'].apply(lambda x: TextBlob(x).polarity)
df_ngram['subjective'] = df_ngram['bigram/trigram'].apply(lambda x: TextBlob(x).subjectivity)

In [318]:
df_ngram

Unnamed: 0,frequency,bigram/trigram,polarity,subjective
0,106,safety feature,0.0,0.0
1,93,gas mileage,0.0,0.0
2,70,fun drive,0.3,0.2
3,54,back seat,0.0,0.0
4,50,front seat,0.0,0.0
...,...,...,...,...
92646,1,ability change color,0.0,0.0
92647,1,abandoned leasing done,0.0,0.0
92648,1,abandoned leasing,0.0,0.0
92649,1,aaa rating hudson,0.0,0.0


In [319]:
df_ngram[df_ngram["subjective"]!=0]

Unnamed: 0,frequency,bigram/trigram,polarity,subjective
2,70,fun drive,0.3,0.200000
6,46,sound system,0.4,0.400000
10,40,much better,0.5,0.500000
13,38,blind spot,-0.5,0.666667
18,30,year old,0.1,0.200000
...,...,...,...,...
92620,1,able balanced,0.5,0.625000
92631,1,ability remote start,-0.1,0.200000
92632,1,ability remote,-0.1,0.200000
92635,1,ability really customize,0.2,0.200000


## Topic modeling
We can also do some topic modeling with text data. There are two ways to do this: NMF models and LDA models. We will show examples using both methods next.

### NMF models

Non-Negative Matrix Factorization (NMF) is a matrix decomposition method, which decomposes a matrix into the product of W and H of non-negative elements. The default method optimizes the distance between the original matrix and WH, i.e., the Frobenius norm. Below is an example where we use NMF to produce 3 topics and we showed 3 bigrams/trigrams in each topic.


In [306]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline
tfidf_vectorizer = TfidfVectorizer(stop_words=stoplist, ngram_range=(2,3))
nmf = NMF(n_components=3)
pipe = make_pipeline(tfidf_vectorizer, nmf)
pipe.fit(complaint_df['Review'])
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += ", ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
print_top_words(nmf, tfidf_vectorizer.get_feature_names(), n_top_words=3)

Topic #0: best ever owned, ever owned, best ever
Topic #1: steering wheel, perfect steering, perfect steering wheel
Topic #2: safety feature, look great, update mile




Maximum number of iterations 200 reached. Increase it to improve convergence.



### LDA models
> Latent Dirichlet Allocation is a generative probabilistic model for collections of discrete dataset such as text corpora. It is also a topic model that is used for discovering abstract topics from a collection of documents.

Here in our example, we use the function LatentDirichletAllocation, which “implements the online variational Bayes algorithm and supports both online and batch update methods”. Here we show an example where the learning method is set to the default value “online”.

In [307]:
from sklearn.decomposition import LatentDirichletAllocation
tfidf_vectorizer = TfidfVectorizer(stop_words=stoplist, ngram_range=(2,3))
lda = LatentDirichletAllocation(n_components=3)
pipe = make_pipeline(tfidf_vectorizer, lda)
pipe.fit(complaint_df['Review'])
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += ", ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
print_top_words(lda, tfidf_vectorizer.get_feature_names(), n_top_words=3)

Topic #0: gas mileage, safety feature, great comfortable
Topic #1: safe fun, ever owned, best ever owned
Topic #2: test drive, fun drive, fuel economy



In [311]:

number_topics = 10
number_words = 5

corpus = complaint_df['Review'].tolist()
    # print(corpus)
tf_vectorizer = CountVectorizer(max_df=0.9, min_df=0.00, stop_words="english", tokenizer=tokenize_only) # Use tf (raw term count) features for LDA.
tf = tf_vectorizer.fit_transform(corpus)

# Create and fit the LDA model
model = LDA(n_components=number_topics, n_jobs=-1)
id_topic = model.fit(tf)
# Print the topics found by the LDA model
print("Topics found via LDA:")
topic_keywords = show_topics(vectorizer=tf_vectorizer, lda_model=model, n_words=number_words)        
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]

df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]

df_topic_keywords = df_topic_keywords.reset_index()
df_topic_keywords['topic_index'] = df_topic_keywords['index'].str.split(' ', n = 1, expand = True)[[1]].astype('int')
print(df_topic_keywords)
    
############ get the dominat topic for each document in a data frame ###############
# Create Document — Topic Matrix
lda_output = model.transform(tf)
# column names
topicnames = ["Topic" + str(i) for i in range(model.n_components)]
# index names
docnames = ["Doc" + str(i) for i in range(len(corpus))]
    
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic   
df_document_topic = df_document_topic.reset_index()
        
df_document_topic

Topics found via LDA:
     index    Word 0  Word 1   Word 2       Word 3   Word 4  topic_index
0  Topic 0      love    year    drive         seat     like            0
1  Topic 1      like   drive     seat      driving  feature            1
2  Topic 2  interior    love     like          row    think            2
3  Topic 3     great    good     seat         tire     mile            3
4  Topic 4      seat     new   design         good   little            4
5  Topic 5     drive    time     work         seat   engine            5
6  Topic 6      good    feel  driving        brand     love            6
7  Topic 7     drive    seat    great  comfortable     like            7
8  Topic 8     drive    mile    great         love   design            8
9  Topic 9     great  safety    drive         seat  feature            9


Unnamed: 0,index,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
0,Doc0,0.00,0.98,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1
1,Doc1,0.00,0.00,0.00,0.00,0.98,0.00,0.00,0.00,0.00,0.00,4
2,Doc2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.98,0.00,0.00,7
3,Doc3,0.00,0.98,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1
4,Doc4,0.00,0.00,0.00,0.00,0.98,0.00,0.00,0.00,0.00,0.00,4
...,...,...,...,...,...,...,...,...,...,...,...,...
827,Doc827,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.85,9
828,Doc828,0.19,0.42,0.00,0.00,0.00,0.00,0.00,0.24,0.14,0.00,1
829,Doc829,0.00,0.00,0.98,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2
830,Doc830,0.98,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0
