In [3]:
import pandas as pd
import re
import spacy
# Load the preprocessed reviews table (comma-separated) into a DataFrame.
reviews = pd.read_csv("reviews_preprocessed.csv", sep=',')



In [4]:
# Load the small English spaCy pipeline with the parser and NER components
# switched off; the cells below only read token lemmas.
disabled_components = ['parser', 'ner']
nlp = spacy.load('en_core_web_sm', disable=disabled_components)

In [5]:
#lemmatization
# Stream every review through spaCy with nlp.pipe, which processes texts in
# batches and is considerably faster than one nlp(text) call per row on a
# corpus of this size. Each review becomes a list of its tokens' lemmas.
# The Series is built with the frame's own index so rows stay aligned.
reviews['reviews_text'] = pd.Series(
    [[token.lemma_ for token in doc] for doc in nlp.pipe(reviews['review_text'])],
    index=reviews.index,
)

In [6]:
# Remove capitalised words: inside each token, delete every run that starts
# with an uppercase ASCII letter (together with any surrounding whitespace).
# Tokens that consist only of such a run become empty strings.
capitalised_word = re.compile(r'\s*[A-Z]\w*\s*')
reviews['reviews_text'] = reviews['reviews_text'].apply(
    lambda tokens: [capitalised_word.sub('', token) for token in tokens]
)

In [7]:
# Remove very short words: inside each token, delete every standalone word of
# one or two characters, along with any non-word characters directly before it.
short_word = re.compile(r'\W*\b\w{1,2}\b')
reviews['reviews_text'] = reviews['reviews_text'].apply(
    lambda tokens: [short_word.sub('', token) for token in tokens]
)

In [8]:
# Drop the tokens that are now empty strings (filter(None, ...) keeps only
# truthy items, i.e. non-empty strings).
reviews['reviews_text'] = reviews['reviews_text'].apply(
    lambda tokens: list(filter(None, tokens))
)

In [9]:
# Preview the cleaned per-review token lists.
reviews['reviews_text']

0         [good, place, business, friendly, courteous, k...
1         [just, purchase, new, purchase, quick, painles...
2           [they, friendly, helpful, year, great, service]
3         [salesman, extremely, nice, patient, answer, q...
4         [not, pleased, visit, major, concern, address,...
                                ...                        
370012    [great, place, purchase, vehicle, rest, team, ...
370013    [receive, great, customer, service, recent, pu...
370014    [great, even, not, buy, buy, nice, lot, give, ...
370015    [salesman, want, sell, helpful, like, friend, ...
370016    [big, thank, rest, staff, help, choose, purcha...
Name: reviews_text, Length: 370017, dtype: object

Lemmatization has split each review into a list of separate words, but VADER scores a piece of text rather than a list of tokens, so we join the words of each review back into a single space-separated string.

Sentiment polarity

In [43]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [41]:
# Rebuild one space-separated string per review from its token list.
reviews['sentence'] = reviews['reviews_text'].map(' '.join)

In [42]:
# Preview the joined strings that will be scored.
reviews['sentence']

0         good place business friendly courteous knowled...
1         just purchase new purchase quick painless ever...
2                  they friendly helpful year great service
3         salesman extremely nice patient answer questio...
4         not pleased visit major concern address all it...
                                ...                        
370012    great place purchase vehicle rest team wonderf...
370013    receive great customer service recent purchase...
370014    great even not buy buy nice lot give treatment...
370015    salesman want sell helpful like friend favor t...
370016    big thank rest staff help choose purchase new ...
Name: sentence, Length: 370017, dtype: object

In [44]:
# Score every joined review with VADER and keep only the 'compound' value
# from the polarity dictionary.
# The analyzer is created once, outside the iteration: constructing it loads
# the VADER lexicon, so building a new instance for each review is wasted work.
sid = SentimentIntensityAnalyzer()
scores = [sid.polarity_scores(sentence)['compound'] for sentence in reviews['sentence']]

In [45]:
# Sanity check: one score per review, so this should equal the number of rows.
len(scores)

370017

In [46]:
# Inspect the raw compound scores.
# NOTE(review): this displays the whole list; a slice such as scores[:10]
# would keep the notebook output short.
scores

[0.8979,
 0.9246,
 0.8779,
 0.8777,
 -0.3412,
 0.763,
 0.8442,
 0.9287,
 0.7841,
 0.9153,
 0.9743,
 0.4939,
 0.0,
 0.6249,
 0.4404,
 0.9501,
 0.4588,
 0.0382,
 0.7845,
 0.714,
 0.5106,
 0.0,
 0.91,
 0.8519,
 0.8074,
 0.891,
 0.7096,
 0.2755,
 0.8316,
 0.8074,
 0.6597,
 0.8176,
 0.8481,
 -0.0258,
 0.9201,
 0.4939,
 0.802,
 0.9042,
 -0.2732,
 0.7003,
 0.8934,
 0.9705,
 -0.4763,
 -0.6705,
 0.9352,
 0.6369,
 0.9828,
 0.9255,
 0.978,
 0.9509,
 -0.4767,
 0.8849,
 0.765,
 0.7351,
 0.128,
 0.836,
 0.7906,
 -0.7845,
 -0.8161,
 0.8519,
 0.6908,
 0.6249,
 0.9081,
 0.8126,
 -0.1531,
 0.6597,
 0.7845,
 0.4404,
 0.8201,
 0.7269,
 0.6908,
 0.9081,
 0.765,
 0.8689,
 0.8834,
 0.7783,
 0.3975,
 0.7171,
 0.9666,
 0.9216,
 -0.4404,
 0.7269,
 0.8885,
 0.8074,
 0.891,
 0.7845,
 -0.0516,
 0.7579,
 0.6908,
 0.4404,
 0.7027,
 0.6597,
 0.7351,
 0.5859,
 0.6369,
 0.8625,
 0.8689,
 0.4019,
 0.9022,
 0.9603,
 0.8481,
 0.9337,
 0.8885,
 0.8934,
 0.9022,
 0.3182,
 0.8225,
 0.9246,
 0.8834,
 0.8316,
 0.7906,
 0.9231,

In [47]:
# Attach the compound scores as a new column; `scores` was built by iterating
# reviews['sentence'] in order, so positions line up row for row.
reviews['sentiment_score'] = scores

In [48]:
# Display the frame with the added reviews_text, sentence and sentiment_score columns.
reviews

Unnamed: 0,dealer_id,review_score,review_title,review_date,review_text,cust_service,buying_process,repair_quality,facilities,experience,recommend,purchase_type,purchase,reviews_text,sentence,sentiment_score
0,924b6a44-24bb-4640-844d-2ace5ea8f188,5.0,"Good place to do business with, friendly, cour...",2023-02-15,Good place business friendly courteous knowled...,,,,,5.0,yes,new,no purchase,"[good, place, business, friendly, courteous, k...",good place business friendly courteous knowled...,0.8979
1,924b6a44-24bb-4640-844d-2ace5ea8f188,5.0,Happy Customer,2023-02-14,Just purchased new Telluride James Myrick purc...,,5.0,,5.0,5.0,yes,new,yes,"[just, purchase, new, purchase, quick, painles...",just purchase new purchase quick painless ever...,0.9246
2,924b6a44-24bb-4640-844d-2ace5ea8f188,5.0,They have always been friendly and helpful.,2023-02-14,They friendly helpful Ive years great service,,,,,5.0,yes,repair,no purchase,"[they, friendly, helpful, year, great, service]",they friendly helpful year great service,0.8779
3,924b6a44-24bb-4640-844d-2ace5ea8f188,5.0,My salesman was Ghina!,2023-02-13,My salesman Ghina He extremely nice patient an...,,,,,5.0,yes,used,no purchase,"[salesman, extremely, nice, patient, answer, q...",salesman extremely nice patient answer questio...,0.8777
4,924b6a44-24bb-4640-844d-2ace5ea8f188,2.0,Not pleased this visit.,2023-02-13,Not pleased visit Major concern addressed All ...,,,,,2.0,no,repair,no purchase,"[not, pleased, visit, major, concern, address,...",not pleased visit major concern address all it...,-0.3412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370012,593465de-0226-4a67-8d89-320ae6bee4f6,5.0,Mercedes of Loveland was a great place to purc...,2020-01-18,Mercedes Loveland great place purchase vehicle...,,,,,5.0,yes,used,no purchase,"[great, place, purchase, vehicle, rest, team, ...",great place purchase vehicle rest team wonderf...,0.9442
370013,593465de-0226-4a67-8d89-320ae6bee4f6,5.0,We received great customer service during the ...,2020-01-17,We received great customer service recent purc...,,,,,5.0,yes,new,no purchase,"[receive, great, customer, service, recent, pu...",receive great customer service recent purchase...,0.7269
370014,593465de-0226-4a67-8d89-320ae6bee4f6,5.0,Ken was great! Even though I didn’t buy a Merc...,2020-01-17,Ken great Even I didnt buy Mercedes I bought n...,,,,,5.0,yes,used,no purchase,"[great, even, not, buy, buy, nice, lot, give, ...",great even not buy buy nice lot give treatment...,0.7998
370015,593465de-0226-4a67-8d89-320ae6bee4f6,5.0,Great experience,2020-01-15,My salesman want sell He helpful like friend f...,5.0,5.0,,,5.0,yes,used,yes,"[salesman, want, sell, helpful, like, friend, ...",salesman want sell helpful like friend favor t...,0.9349


In [51]:
#take only relevant columns
# Columns kept for the analysis: identifiers, ratings, purchase info and the
# sentiment score. The text columns are left out.
relevant_columns = [
    'dealer_id', 'review_score', 'cust_service', 'buying_process',
    'repair_quality', 'facilities', 'experience', 'recommend',
    'purchase_type', 'purchase', 'sentiment_score',
]
# .copy() makes analysis_dataset independent of `reviews`.
analysis_dataset = reviews.loc[:, relevant_columns].copy()

In [52]:
# Display the reduced dataset.
analysis_dataset

Unnamed: 0,dealer_id,review_score,cust_service,buying_process,repair_quality,facilities,experience,recommend,purchase_type,purchase,sentiment_score
0,924b6a44-24bb-4640-844d-2ace5ea8f188,5.0,,,,,5.0,yes,new,no purchase,0.8979
1,924b6a44-24bb-4640-844d-2ace5ea8f188,5.0,,5.0,,5.0,5.0,yes,new,yes,0.9246
2,924b6a44-24bb-4640-844d-2ace5ea8f188,5.0,,,,,5.0,yes,repair,no purchase,0.8779
3,924b6a44-24bb-4640-844d-2ace5ea8f188,5.0,,,,,5.0,yes,used,no purchase,0.8777
4,924b6a44-24bb-4640-844d-2ace5ea8f188,2.0,,,,,2.0,no,repair,no purchase,-0.3412
...,...,...,...,...,...,...,...,...,...,...,...
370012,593465de-0226-4a67-8d89-320ae6bee4f6,5.0,,,,,5.0,yes,used,no purchase,0.9442
370013,593465de-0226-4a67-8d89-320ae6bee4f6,5.0,,,,,5.0,yes,new,no purchase,0.7269
370014,593465de-0226-4a67-8d89-320ae6bee4f6,5.0,,,,,5.0,yes,used,no purchase,0.7998
370015,593465de-0226-4a67-8d89-320ae6bee4f6,5.0,5.0,5.0,,,5.0,yes,used,yes,0.9349


In [53]:
# Persist the reduced dataset without writing the row index.
analysis_dataset.to_csv(path_or_buf="analysis_dataset.csv", index=False)

In [50]:
# Write the full frame (including the new columns) back to disk without the row index.
# NOTE(review): this is the same path the notebook reads at the top, so the
# input file is overwritten and a re-run starts from the augmented file.
reviews.to_csv(path_or_buf="reviews_preprocessed.csv", index=False)

In [49]:
#save sentiment scores separately
# Write one score per line. The file is opened write-only ('w'), since nothing
# is read back, and the `with` block closes it on exit, so no explicit close()
# call is needed.
with open('sentiment_scores.txt', 'w') as f:
    f.writelines('%s\n' % score for score in scores)