In [1]:
import pandas as pd
import numpy as np
import re
import spacy
import contractions
from spellchecker import SpellChecker

In [2]:
# Load the scraped reviews. Positional column 0 of the CSV is dropped
# (presumably an index written by an earlier to_csv -- confirm) and the three
# remaining columns are given explicit names.
data = pd.read_csv('trustpilot_reviews.csv').iloc[:, 1:]
data.columns = ['review_title', 'review_body', 'company_source']
# Replace embedded line breaks with a space rather than deleting them:
# deleting fuses the last word of one line with the first word of the next,
# which later reaches the spell checker as a single unknown word.
data = data.replace(r'\n', ' ', regex=True)
data.head()

Unnamed: 0,review_title,review_body,company_source
0,I ordered an item and had a little…,I ordered an item and had a li...,zalando
1,I ordered few items for 800 euro,I ordered few items for 800 eu...,zalando
2,Always the first choice for apparel shopping.,Always the first choice for ap...,zalando
3,"Support is bad, return items and don't get mon...",Ordered items for 1200€. Got t...,zalando
4,Utterley useless in every resepct,"2 items ordered, the first cam...",zalando


In [3]:
# Expand English contractions (e.g. "don't" -> "do not") in both text columns.
# contractions.fix is passed directly; no wrapper lambda is needed.
data['review_title'] = data['review_title'].apply(contractions.fix)
data['review_body'] = data['review_body'].apply(contractions.fix)

In [4]:
# Preview the first rows after contraction expansion.
data.head()

Unnamed: 0,review_title,review_body,company_source
0,I ordered an item and had a little…,I ordered an item and had a li...,zalando
1,I ordered few items for 800 euro,I ordered few items for 800 eu...,zalando
2,Always the first choice for apparel shopping.,Always the first choice for ap...,zalando
3,"Support is bad, return items and do not get mo...",Ordered items for 1200€. Got t...,zalando
4,Utterley useless in every resepct,"2 items ordered, the first cam...",zalando


In [None]:
# Strip the retailer names from the review text only. Running the replacement
# over the whole frame would also blank the `company_source` label column,
# whose values (e.g. 'zalando') match the pattern.
review_text_columns = ['review_title', 'review_body']
data[review_text_columns] = data[review_text_columns].replace(
    '(Amazon|amazon|Outfittery|outfittery|zalando|Zalando)', '', regex=True
)

In [5]:
# spaCy pipeline used by text_preprocessing for tokenising and lemmatising.
nlp = spacy.load("en_core_web_sm")
# Notebook-level spell checker used by correct_spelling.
# NOTE(review): pyspellchecker documents `case_sensitive` as only taking effect
# when no language dictionary is loaded -- confirm it does anything here.
spell = SpellChecker(case_sensitive=True)

In [6]:
# Pull the two text columns out as plain Python lists for the spell checker,
# then show how many reviews there are.
review_title = data['review_title'].tolist()
review_body = data['review_body'].tolist()
len(review_title)

653

In [7]:
def correct_spelling(review_list, checker=None):
    """Spell-correct every text in ``review_list``.

    Each text is tokenised with the checker's ``split_words``; every token is
    replaced by the checker's ``correction`` and the tokens are re-joined with
    single spaces, so only what ``split_words`` returns survives in the output.

    Args:
        review_list: Iterable of review strings.
        checker: Object exposing ``split_words(text)`` and ``correction(word)``.
            Defaults to the notebook-level ``spell`` instance.

    Returns:
        A list with one corrected string per input text, in input order.
    """
    if checker is None:
        checker = spell

    # Reviews repeat the same words constantly and a correction lookup is
    # expensive, so each distinct token is corrected only once.
    corrections = {}

    def _corrected(word):
        if word not in corrections:
            # Newer pyspellchecker releases return None when no candidate is
            # found; keep the original token instead of letting ' '.join fail.
            corrections[word] = checker.correction(word) or word
        return corrections[word]

    return [
        ' '.join(_corrected(word) for word in checker.split_words(sentence))
        for sentence in review_list
    ]
    

In [8]:
# Spell-correct titles and bodies. The list names are rebound to the results,
# so re-running this cell feeds already-corrected text through the checker again.
review_title = correct_spelling(review_title)
review_body = correct_spelling(review_body)

In [9]:
# Spot-check one title whose raw text contains misspellings.
review_title[4]

'utterly useless in every respect'

In [10]:
# Attach the spell-corrected text as new columns alongside the existing ones.
data['correct_title'] = review_title
data['correct_body'] = review_body

In [11]:
# Preview the frame including the two corrected-text columns.
data.head()

Unnamed: 0,review_title,review_body,company_source,correct_title,correct_body
0,I ordered an item and had a little…,I ordered an item and had a li...,zalando,i ordered an item and had a little,i ordered an item and had a little payment mis...
1,I ordered few items for 800 euro,I ordered few items for 800 eu...,zalando,i ordered few items for 800 euro,i ordered few items for 800 euro came only che...
2,Always the first choice for apparel shopping.,Always the first choice for ap...,zalando,always the first choice for apparel shopping,always the first choice for apparel shopping i...
3,"Support is bad, return items and do not get mo...",Ordered items for 1200€. Got t...,zalando,support is bad return items and do not get mon...,ordered items for 1200 got them all but in the...
4,Utterley useless in every resepct,"2 items ordered, the first cam...",zalando,utterly useless in every respect,2 items ordered the first came the next day dh...


In [12]:
# Export the spell-corrected review bodies as a single-column CSV.
sentiments = data['correct_body'].to_frame()
sentiments.to_csv('sentiments.csv')

In [9]:
def text_preprocessing(sentence):
    """Return the lemmas of the informative tokens in ``sentence``.

    The text is run through the notebook-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words, or that are not purely alphabetic, are discarded;
    the ``lemma_`` of every remaining token is collected in document order.
    """
    return [
        token.lemma_
        for token in nlp(sentence)
        if not token.is_stop and token.is_alpha
    ]

In [10]:
# lemma_words_lyrics = []
# for words in review_title:
#     new = text_preprocessing(words.lower())
#     lemma_words_lyrics.append(new)

In [11]:
# sentence_lyrics = []
# for n in lemma_words_lyrics:
#     new_word = ' '.join(n)
#     sentence_lyrics.append(new_word)

In [12]:
# sentence_lyrics[3]

In [13]:
def get_lemma_sentences(review_list):
    """Lemmatise each review and return the results as space-joined strings.

    Every text is lower-cased, passed through ``text_preprocessing`` and the
    resulting lemmas are joined with single spaces. The output list has one
    string per input text, in input order.
    """
    return [
        ' '.join(text_preprocessing(review.lower()))
        for review in review_list
    ]
    
    

In [14]:
# Lemmatise the current contents of the review_title / review_body lists.
lemma_title = get_lemma_sentences(review_title)
lemma_body = get_lemma_sentences(review_body)

In [15]:
# Overwrites the review_title / review_body columns with the lemmatised text;
# the previous contents of these columns are no longer available from `data`.
data['review_title'] = lemma_title
data['review_body'] = lemma_body

In [16]:
# Preview the frame after the text columns were replaced by their lemmatised form.
data.head()

Unnamed: 0,review_title,review_body,company_source
0,order item little,order item little payment misunderstanding zea...,zalando
1,order item euro,order item euro come cheap item euro order del...,zalando
2,choice apparel shopping,choice apparel shopping customer year sure mis...,zalando
3,support bad return item money,order item get end lot item fit want return it...,zalando
4,utterly useless respect,item order come day dhl electrictronically inf...,zalando


In [17]:
# Single-column frame holding the lemmatised review bodies.
all_review_body = data['review_body'].to_frame()

In [18]:
# Write the single-column frame to review_body.csv in the working directory
# (to_csv also writes the row index unless index=False is passed).
all_review_body.to_csv('review_body.csv')