In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import spacy
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import time
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from matplotlib import pyplot as plt

In [2]:
#Completed by: Zongpu Liu

In [3]:
# Input data
# Read the raw review export into `data` and preview its first rows.
# NOTE(review): Python registers 'ansi' only as a Windows alias of the 'mbcs' codec,
# so this call presumably raises LookupError on Linux/macOS. The "Dona??t" text in
# the preview also suggests the file is not being decoded with its real encoding —
# confirm the actual encoding and name it explicitly.
data = pd.read_csv("Original data.csv",encoding='ANSI')
data.head()

Unnamed: 0,Pro_name,Pro_color,Cus_name,Rating,Title,Date,Review
0,iphone 11,Black,brenda,1,Not FULLY Unlocked,"Reviewed in the United States on February 13, ...",Purchased this product advertised as fully unl...
1,iphone 11,Black,Katherine,4,"NOT EXPECTED, GREAT PURCHASE!","Reviewed in the United States on August 18, 2020",I was feeling a bit skeptical after I placed m...
2,iphone 11,Black,Katherine,1,Phone was NOT unlocked,"Reviewed in the United States on November 27, ...",Phone was not unlocked could it use it
3,iphone 11,Black,brenda,1,Terribly Flawed. Dona??t waste your time.,"Reviewed in the United States on October 16, 2020",The screen came cracked and popped out of the ...
4,iphone 11,Black,James Roth,3,Be careful used phones,"Reviewed in the United States on January 29, 2020",Be careful mine had a scratch on screen very s...


In [4]:
# Describe the selected original data set
data.describe()

Unnamed: 0,Pro_name,Pro_color,Cus_name,Rating,Title,Date,Review
count,10617,10617,10548,10595,10604,10600,10571
unique,5,4,7174,8,5693,2066,7755
top,iphone XR,Black,Amazon Customer,5,Good,3-Nov-19,Good
freq,5010,7030,800,7282,247,107,170


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10618 entries, 0 to 10617
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Pro_name   10617 non-null  object
 1   Pro_color  10617 non-null  object
 2   Cus_name   10548 non-null  object
 3   Rating     10595 non-null  object
 4   Title      10604 non-null  object
 5   Date       10600 non-null  object
 6   Review     10571 non-null  object
dtypes: object(7)
memory usage: 580.8+ KB


Clean data

In [6]:
# check duplicated data
data.duplicated(subset=['Cus_name','Title','Review']).value_counts()

False    10560
True        58
dtype: int64

In [7]:
# 58 rows repeat an earlier (Cus_name, Title, Review) combination; drop the repeats.
# The stable (mergesort) sort by product name fixes which copy counts as "first":
# the one whose product name sorts lowest, ties keeping their original file order.
sorted_data = data.sort_values(by=["Pro_name"], kind='mergesort', na_position='last')

# `subset` is documented as a column label or a sequence of labels, so pass a list
# (matching the duplicate check above) rather than an unordered set.
clean1_data = sorted_data.drop_duplicates(subset=['Cus_name', 'Title', 'Review'], keep='first')
clean1_data.shape

(10560, 7)

In [8]:
# Keep only the reviewer, rating and text columns.
data = clean1_data.drop(columns=['Pro_name', 'Pro_color', 'Date'])

# Discard every row that has a missing value in any of the remaining columns.
data1 = data.dropna()
data1.head()

Unnamed: 0,Cus_name,Rating,Title,Review
4767,Liane,1,Not Unlocked,There is an outstanding balance due on this ph...
4768,Amazon Customer,1,SCAM,not iphone 11 pro max. got box that belongs to...
4769,Marat,1,Dona??t recommend it,Be carefully! It is not new. Check serial numb...
4770,willis,5,Great buy and great value,This is my review of the 512GB iPhone 11 Pro M...
4771,Riddhi,1,Not functioning as advertised! Do not buy!,The phone doesna??t not work. The phone turn...


In [9]:
data1.shape

(10464, 4)

In [10]:
# Lower-case every string cell; non-string cells (e.g. numbers) pass through unchanged.
# `isinstance` is the idiomatic type test and also accepts str subclasses.
data1 = data1.applymap(lambda s: s.lower() if isinstance(s, str) else s)
data1.head()

Unnamed: 0,Cus_name,Rating,Title,Review
4767,liane,1,not unlocked,there is an outstanding balance due on this ph...
4768,amazon customer,1,scam,not iphone 11 pro max. got box that belongs to...
4769,marat,1,dona??t recommend it,be carefully! it is not new. check serial numb...
4770,willis,5,great buy and great value,this is my review of the 512gb iphone 11 pro m...
4771,riddhi,1,not functioning as advertised! do not buy!,the phone doesna??t not work. the phone turn...


In [11]:
# Prepare to remove stopwords.
# Fetch NLTK's English stop-word list and print it for inspection.
# NOTE(review): this assignment rebinds the name `stopwords`, which the import cell
# bound to the NLTK corpus reader; from here on `stopwords` is a plain list.
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [12]:
# Negations carry sentiment, so this stop-word list deliberately leaves out
# 'no', 'nor' and 'not'; every other entry is listed below, one token per word.
stopwords = """
    i me my myself we our ours ourselves you you're you've you'll
    you'd your yours yourself yourselves he him his himself she she's
    her hers herself it it's its itself they them their theirs themselves
    what which who whom this that that'll these those am is are was
    were be been being have has had having do does did doing a an the
    and but if or because as until while of at by for with about
    against between into through during before after above below to from
    up down in out on off over under again further then once here
    there when where why how all any both each few more most other
    some such only own same so than too very s t can will just
    don don't should should've now d ll m o re ve y ain aren aren't
    couldn couldn't didn didn't doesn doesn't hadn hadn't hasn hasn't haven
    haven't isn isn't ma mightn mightn't mustn mustn't needn needn't shan shan't
    shouldn shouldn't wasn wasn't weren weren't won won't wouldn wouldn't
""".split()

In [13]:
# Lookup table mapping lower-case English contractions (written with a straight
# apostrophe) to their expanded forms. `clean_text` replaces any
# whitespace-delimited token that exactly matches one of these keys.
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [14]:
# Define a function that expands contractions, strips URLs/markup/punctuation
# and optionally removes stopwords.
def clean_text(text, remove_stopwords = True):
    """Normalise one piece of review text.

    Steps, in order: expand known contractions, remove URLs and HTML
    remnants, replace punctuation with spaces and, optionally, drop stop
    words. The text is not lower-cased here, and contraction / stop-word
    matching is case-sensitive, so pass text that is already lower-case.

    Args:
        text: The string to clean.
        remove_stopwords: When True, drop every token found in the
            module-level `stopwords` collection; the surviving tokens are
            re-joined with single spaces.

    Returns:
        The cleaned string.
    """
    # Expand contractions token by token; tokens are whitespace-delimited, so a
    # contraction with punctuation attached (e.g. "don't,") is left as it is.
    text = " ".join(contractions.get(word, word) for word in text.split())

    # Remove only the URL itself. The previous pattern (`https?://.*`) also
    # deleted every word that followed the URL on the same line.
    text = re.sub(r'https?://\S+', '', text)
    # Strip <br> tags BEFORE punctuation removal: once '/' has been replaced by
    # a space the tag can no longer be recognised (the old pattern never matched).
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stopwords (set lookup instead of a list scan per token)
    if remove_stopwords:
        stop_words = set(stopwords)
        text = " ".join(w for w in text.split() if w not in stop_words)

    return text

In [15]:
# Clean the review bodies (stop words kept) and the titles (stop words removed).
# The progress messages now name the column that was actually processed.
clean_Review = [clean_text(review, remove_stopwords=False) for review in data1.Review]
print("Reviews are complete.")

clean_Title = [clean_text(title) for title in data1.Title]
print("Titles are complete.")

Summaries are complete.
Texts are complete.


In [16]:
# Attach both cleaned text columns to the working frame and preview the result.
data1 = data1.assign(clean_Review=clean_Review, clean_Title=clean_Title)
data1.head()

Unnamed: 0,Cus_name,Rating,Title,Review,clean_Review,clean_Title
4767,liane,1,not unlocked,there is an outstanding balance due on this ph...,there is an outstanding balance due on this ph...,not unlocked
4768,amazon customer,1,scam,not iphone 11 pro max. got box that belongs to...,not iphone 11 pro max got box that belongs to...,scam
4769,marat,1,dona??t recommend it,be carefully! it is not new. check serial numb...,be carefully it is not new check serial numb...,dona recommend
4770,willis,5,great buy and great value,this is my review of the 512gb iphone 11 pro m...,this is my review of the 512gb iphone 11 pro m...,great buy great value
4771,riddhi,1,not functioning as advertised! do not buy!,the phone doesna??t not work. the phone turn...,the phone doesna t not work the phone turned...,not functioning advertised not buy


In [17]:
data1.sample(n=5).style

Unnamed: 0,Cus_name,Rating,Title,Review,clean_Review,clean_Title
213,champagne,5,bad dona??t buy it!,i usually buy all my phones from this seller. but this one came with a defective microphone. however appearance and functionality was good as new. my biggest issue is my microphone wona??t work at all unless i use a headset. dona??t waste your money! y,i usually buy all my phones from this seller but this one came with a defective microphone however appearance and functionality was good as new my biggest issue is my microphone wona t work at all unless i use a headset dona t waste your money y,bad dona buy
5083,lual from facebook,5,all features about this beautiful iphone are truly,my phone opens up immediately soon as i lifted towards my face and i love the quickness.,my phone opens up immediately soon as i lifted towards my face and i love the quickness,features beautiful iphone truly
2549,dayvia morris,1,damaged display,"display defect on black screen as shown in the photos also phone powered off initially when i launced the camera, tried using camera after reboot it worked and then shuts off tring to switch from the face cam to the rear cam i need my money back",display defect on black screen as shown in the photos also phone powered off initially when i launced the camera tried using camera after reboot it worked and then shuts off tring to switch from the face cam to the rear cam i need my money back,damaged display
164,makayla,5,phone was like brand new,i purchased the black iphone 11 and it came in with no scratches or dents in the phone. the screen was also flawless. the battery percent was at 100%. i am satisfied with my purchase and would definitely buy from buyspry again!! also this is my second am,i purchased the black iphone 11 and it came in with no scratches or dents in the phone the screen was also flawless the battery percent was at 100 i am satisfied with my purchase and would definitely buy from buyspry again also this is my second am,phone like brand new
6440,"bidyut das, mumbai",1,not worthy of buy,its very bulky and photo quality is not good.,its very bulky and photo quality is not good,not worthy buy


In [19]:
data1.to_csv('Updated data.csv', index = False)

EDA

In [None]:
data1.describe()

In [None]:
# Print the first five cleaned reviews (not a random sample) to check that the
# cleaning step worked. Slicing keeps this safe when fewer than five exist.
for position, review in enumerate(clean_Review[:5], start=1):
    print('Review ', position)
    print(review)
    print()

In [None]:
# Share of reviews at each rating value. `normalize=True` yields proportions
# rather than raw counts, so the y-axis is labelled accordingly.
data1['Rating'].value_counts(normalize=True).plot(kind='bar')
plt.xlabel('Score')
plt.ylabel('Proportion of reviews')
plt.show()

In [None]:
pip install -U textblob

In [None]:
# Use TextBlob to score sentiment: polarity lies in the range [-1, 1], where 1 is
# the most positive sentiment and -1 the most negative.
from textblob import TextBlob

data1['polarity'] = data1['clean_Review'].map(lambda text: TextBlob(text).sentiment.polarity)

# Review length in characters
data1['review_len'] = data1['clean_Review'].astype(str).str.len()
# Number of whitespace-separated words in each review
data1['word_count'] = data1['clean_Review'].astype(str).str.split().str.len()
data1.head()

In [None]:
import pandas as pd
import seaborn as sns
import datetime
import time

#Statistical review sentiment polarity score distribution
plt.rcParams["font.sans-serif"]='SimHei'
plt.rcParams['axes.unicode_minus']=False
%config InlineBackend.figure_format='svg'

plt.hist(x=data1[['polarity']],bins=50,
        color='steelblue',
        edgecolor='black')
# Add x-axis label and y-axis label
plt.xlabel('Polarity')
plt.ylabel('Number')
# Add tittle
plt.title('Distribution of polarity')
# Show chart
plt.show()

In [None]:
# Histogram of review lengths (characters per cleaned review)
fig, ax = plt.subplots()
ax.hist(x=data1[['review_len']], bins=100, color='steelblue', edgecolor='black')
# Axis labels and title
ax.set_xlabel('review_len')
ax.set_ylabel('Number')
ax.set_title('Distribution of review_len')
plt.show()

In [None]:
!pip install plotly
!pip install cufflinks

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px

In [None]:
# Pairwise plots of the three derived review metrics, coloured by rating.
# The trailing semicolon suppresses the PairGrid repr in the cell output.
sns.pairplot(
    data=data1,
    x_vars=['polarity', 'review_len', 'word_count'],
    y_vars=['polarity', 'review_len', 'word_count'],
    hue='Rating',
);

In [None]:
(lambda x:(x + 3)*5/2)(3)


In [None]:
x = 2

print(((x + 3)*5/2))

In [None]:
# A lambda body is a single expression whose value is returned implicitly;
# writing `return` inside it is a SyntaxError.
func = lambda x: x
print(func(2))

In [None]:
print(func(2))

In [None]:
my_list = ['Real', 'Python'] 
def func(x):
    """Print the strings in `x` concatenated with no separator."""
    joined = ''.join(x)
    print(joined)




In [None]:
# `func` joins its argument, so it needs an iterable of strings; the integer 2
# raised TypeError. Call it with the list defined alongside it instead.
func(my_list)

In [None]:
fname = "Blessy"
lname = "Vincent"

f = lambda fname,lname: (' ').join([fname,lname])
f(fname,lname)

In [None]:
nameL = ["Blessy","Vincent"]

# `nameL` is already a list of strings, so hand it to join directly; wrapping it
# in another list made join see a list element and raise TypeError.
f = lambda nameL: ' '.join(nameL)
f(nameL)

In [None]:
my_list = ['Real', 'Python'] 
def func(x):
    """Return the items of `x` joined with the separator '09'."""
    separator = '09'
    return separator.join(x)

In [None]:
func("me too")

In [None]:
b="blessy"
c = ['b','l']



In [None]:
# A lambda is anonymous: it takes a bare parameter list and a single expression
# (no function name, no `return`). Bind it to `func` to give it a name.
func = lambda x: ''.join(x)

In [None]:
f = lambda x: 'a'.join(x)
f('mou')

In [None]:
# A lambda is anonymous: it takes a bare parameter list and a single expression
# (no function name, no `return`). Bind it to `func` to give it a name.
func = lambda x: ' '.join(x)

In [None]:
# Three power functions: square, cube and fourth power.
L = [
    lambda x: x ** 2,
    lambda x: x ** 3,
    lambda x: x ** 4,
]

# Apply each function to 2 and print one result per line.
print(*(power_fn(2) for power_fn in L), sep='\n')

In [None]:
my_list = [1, 5, 4, 6, 8, 11, 3, 12]
# Keep only the even numbers, preserving their original order.
new_list = [number for number in my_list if number % 2 == 0]

print(new_list)

In [None]:
my_list = ['Real', 'Python'] 
def ls(x):
    """Return the items of `x` concatenated with no separator."""
    glue = ''
    return glue.join(x)

In [None]:
ls('me')

ls('me')

In [None]:
my_list = ['Real', 'Python'] 
f = lambda x:''.join(x)

In [None]:
f('me')

In [None]:
my_list = ['Real', 'Python'] 
# A lambda is anonymous: it takes a bare parameter list and a single expression
# (no function name, no `return`). Bind it to `func` to give it a name.
func = lambda x: ' '.join(x)


In [None]:
# The value of a lambda's single expression is returned implicitly;
# `return` is not allowed inside it.
lambda x: ' '.join(x)

In [None]:
# A lambda body is a single expression whose value is returned implicitly;
# writing `return` inside it is a SyntaxError.
func = lambda x: x
print(func(2))

In [None]:
lambda x:''.join(x)

In [None]:
f("hello")

In [None]:
my_list = [1, 5, 4, 6, 8, 11, 3, 12]
# Select the values divisible by two, in their original order.
new_list = [value for value in my_list if value % 2 == 0]

print(new_list)

In [None]:
func = lambda x :  x 


In [None]:
(lambda x: (x + 3)*5/2)(3)

In [None]:
class Sales:
    """Minimal record that stores a sale identifier."""

    def __init__(self, id):
        """Store `id` on the instance as `self.id`.

        The parameter keeps its name `id` so keyword callers keep working; it
        shadows the builtin `id` only inside this method.
        """
        # The former trailing `id = 100` only rebound the local name after the
        # value had already been stored, so it never changed the instance.
        self.id = id

var = Sales(123)
print(var.id)