# Sentiment Analysis on the Extracted Tweets (CeraVe)

## 1 - Import the libraries

In [1]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from textblob import TextBlob

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline

import pandas as pd
import numpy as np
import re  
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Angela\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Angela\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2 - Import the tweets

In [2]:
# This are tweets mentioning the brand together with other key words in the last five years
# key words: "review, quality, packaging, scam, works, animal, testing, cruelty, free, skin, recommend, scam"
decleor = pd.read_csv("tweets_decleor.csv")
decleor = decleor.drop('Unnamed: 0', axis=1) # dropping a useless column

In [3]:
## NLP TRIGRAMS AND BIGRAMS
## BUSQUEDA DE PALABRAS CLAVE RELACIONADAS CON EL MUNDO DE LA COSMETICA 
## Y DE LA EMPRESA -- QUE PORCENTAJE SON MALOS? HAY ALGUNA ESPECIALMENTE MALA?
## Usar tokenizer 

## 3 - Conducting Sentiment Analysis

### 3.1 - Prepare Textual Data for Sentiment Analysis

In [4]:
# 1 - Text cleaning
def clean_up(s):
    clean = re.sub(r'http\S+', '', s)
    return re.sub('[^A-Za-z ]+', ' ', clean).lower().strip()

def clean_up2(s):
    clean2 = re.sub(r'\W*\b\w{1,3}\b', '', s)
    return re.sub('[^A-Za-z ]+', ' ', clean2).lower().strip()

# 2 - Tokenization
def tokenize(s):
    return nltk.word_tokenize(s)

# 3 - Stem and Lemmatize
def stem_and_lemmatize(l):
    ps = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    list_words = []
    
    for w in l:
        s = ps.stem(w)
        s = lemmatizer.lemmatize(s)
        list_words += [s] 
    return list_words

# 4 - Stop words removal
def remove_stopwords(l):
    stop_words = stopwords.words('english')
    return[w for w in l if w not in stop_words]

In [5]:
# In order to avoid errors, change type to string
decleor['Tweet'] = decleor['Tweet'].apply(str)  # change to string for avoiding errors

In [6]:
decleor['Tweet'] = decleor['Tweet'].apply(clean_up).apply(clean_up2).apply(tokenize).apply(remove_stopwords)
decleor

Unnamed: 0,Date,User,Tweet
0,2022-12-31 23:20:17+00:00,dickinsonhqddd,"[decleor, cleansing, hydrating, ritual, piece,..."
1,2022-12-30 18:49:38+00:00,nsdbeauty,"[decleor, hydra, floral, anti, pollution, hydr..."
2,2022-12-30 07:11:10+00:00,sengerdvspz,"[decleor, cleansing, hydrating, ritual, piece]"
3,2022-12-29 12:45:57+00:00,Gretchen62H,"[decleor, aroma, cleanse, make, remover, unise..."
4,2022-12-29 08:21:25+00:00,Waldo42M,"[decleor, cleansing, hydrating, ritual, piece,..."
...,...,...,...
1727,2020-01-03 17:30:09+00:00,Strand0nBeauty,"[affiliate, link, needing, decleoruk, decleor,..."
1728,2020-01-03 11:44:54+00:00,sarah_berryman1,"[onset, autumn, dull, glow, check, fabulous, m..."
1729,2020-01-03 06:33:08+00:00,delooxnederland,"[dagaanbieding, verzorging, decleor, aromessen..."
1730,2020-01-02 16:18:26+00:00,hannah_summer,"[collagemw, tkmaxx, lots, decleor, quinn, witc..."


### 3.2 - Get Polarity and Subjectivity of Tweets

In [7]:
# To avoid problems
decleor['Tweet'] = decleor['Tweet'].apply(str)
tweet = decleor['Tweet']

In [8]:
# Create a function to get subjectivity
def getSubjectivity(tweet):
    return TextBlob(tweet).sentiment.subjectivity
# Create a function to get polarity
def getPolarity(tweet):
    return TextBlob(tweet).sentiment.polarity

# Create two new columns called 'Subjectivity' and 'Polarity'
decleor['Subjectivity'] = decleor['Tweet'].apply(getSubjectivity)
decleor['Polarity'] = decleor['Tweet'].apply(getPolarity)

In [9]:
decleor

Unnamed: 0,Date,User,Tweet,Subjectivity,Polarity
0,2022-12-31 23:20:17+00:00,dickinsonhqddd,"['decleor', 'cleansing', 'hydrating', 'ritual'...",0.00,0.000000
1,2022-12-30 18:49:38+00:00,nsdbeauty,"['decleor', 'hydra', 'floral', 'anti', 'pollut...",0.60,-0.133333
2,2022-12-30 07:11:10+00:00,sengerdvspz,"['decleor', 'cleansing', 'hydrating', 'ritual'...",0.00,0.000000
3,2022-12-29 12:45:57+00:00,Gretchen62H,"['decleor', 'aroma', 'cleanse', 'make', 'remov...",0.00,0.000000
4,2022-12-29 08:21:25+00:00,Waldo42M,"['decleor', 'cleansing', 'hydrating', 'ritual'...",0.00,0.000000
...,...,...,...,...,...
1727,2020-01-03 17:30:09+00:00,Strand0nBeauty,"['affiliate', 'link', 'needing', 'decleoruk', ...",0.00,0.000000
1728,2020-01-03 11:44:54+00:00,sarah_berryman1,"['onset', 'autumn', 'dull', 'glow', 'check', '...",0.75,0.054167
1729,2020-01-03 06:33:08+00:00,delooxnederland,"['dagaanbieding', 'verzorging', 'decleor', 'ar...",0.95,0.600000
1730,2020-01-02 16:18:26+00:00,hannah_summer,"['collagemw', 'tkmaxx', 'lots', 'decleor', 'qu...",0.95,0.600000


In [10]:
# Create a function to get the sentiment text
def getSentiment(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

In [11]:
# Create a column to store the text sentiment
decleor['Sentiment'] = decleor['Polarity'].apply(getSentiment)
# Show the data
decleor

Unnamed: 0,Date,User,Tweet,Subjectivity,Polarity,Sentiment
0,2022-12-31 23:20:17+00:00,dickinsonhqddd,"['decleor', 'cleansing', 'hydrating', 'ritual'...",0.00,0.000000,Neutral
1,2022-12-30 18:49:38+00:00,nsdbeauty,"['decleor', 'hydra', 'floral', 'anti', 'pollut...",0.60,-0.133333,Negative
2,2022-12-30 07:11:10+00:00,sengerdvspz,"['decleor', 'cleansing', 'hydrating', 'ritual'...",0.00,0.000000,Neutral
3,2022-12-29 12:45:57+00:00,Gretchen62H,"['decleor', 'aroma', 'cleanse', 'make', 'remov...",0.00,0.000000,Neutral
4,2022-12-29 08:21:25+00:00,Waldo42M,"['decleor', 'cleansing', 'hydrating', 'ritual'...",0.00,0.000000,Neutral
...,...,...,...,...,...,...
1727,2020-01-03 17:30:09+00:00,Strand0nBeauty,"['affiliate', 'link', 'needing', 'decleoruk', ...",0.00,0.000000,Neutral
1728,2020-01-03 11:44:54+00:00,sarah_berryman1,"['onset', 'autumn', 'dull', 'glow', 'check', '...",0.75,0.054167,Positive
1729,2020-01-03 06:33:08+00:00,delooxnederland,"['dagaanbieding', 'verzorging', 'decleor', 'ar...",0.95,0.600000,Positive
1730,2020-01-02 16:18:26+00:00,hannah_summer,"['collagemw', 'tkmaxx', 'lots', 'decleor', 'qu...",0.95,0.600000,Positive


In [12]:
decleor['Sentiment'].value_counts()

Positive    894
Neutral     649
Negative    189
Name: Sentiment, dtype: int64

In [13]:
decleor.to_excel('decleor_clean.xlsx')

### 3.3 - Plot the results

It will take a HUUUGE time to lost and it is time consuming. We have no time to loose so we will visualize the Data with Tableau

## 4 - Sentiment Analysis of Bigram/Trigram

In [14]:
# We will explore word associations
# N-Grams analysis are often used to see which words often show up together
# N-Gram -- Contiguous sequence of n items from a given sample of text

In [15]:
stoplist = stopwords.words('english')

In [16]:
tweet = decleor['Tweet']

In [17]:
c_vec = CountVectorizer(stop_words=stoplist, ngram_range=(2,3))
# Matrix of ngrams
ngrams = c_vec.fit_transform(tweet)
# Count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)
# List of NGrams
vocab = c_vec.vocabulary_
df_ngram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'bigram/trigram'})

In [18]:
# Similar to the sentiment analysis before, we can calculate the polarity and subjectivity for each bigram/trigram
df_ngram['polarity'] = df_ngram['bigram/trigram'].apply(lambda x: TextBlob(x).polarity)
df_ngram['subjective'] = df_ngram['bigram/trigram'].apply(lambda x: TextBlob(x).subjectivity)

In [19]:
df_ngram

Unnamed: 0,frequency,bigram/trigram,polarity,subjective
0,92,green mandarin,-0.20,0.3000
1,64,essential oils,0.00,0.3000
2,63,decleor aroma,0.00,0.0000
3,57,decleor skincare,0.00,0.0000
4,56,anti ageing,0.00,0.0000
...,...,...,...,...
29680,1,able free,0.45,0.7125
29681,1,able afford collections,0.50,0.6250
29682,1,able afford,0.50,0.6250
29683,1,abandong decleor heavy,-0.20,0.5000


In [20]:
# Create a function to get the sentiment text
def getSentiment(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

In [21]:
# Create a column to store the text sentiment
df_ngram['Sentiment'] = df_ngram['polarity'].apply(getSentiment)
# Show the data
df_ngram

Unnamed: 0,frequency,bigram/trigram,polarity,subjective,Sentiment
0,92,green mandarin,-0.20,0.3000,Negative
1,64,essential oils,0.00,0.3000,Neutral
2,63,decleor aroma,0.00,0.0000,Neutral
3,57,decleor skincare,0.00,0.0000,Neutral
4,56,anti ageing,0.00,0.0000,Neutral
...,...,...,...,...,...
29680,1,able free,0.45,0.7125,Positive
29681,1,able afford collections,0.50,0.6250,Positive
29682,1,able afford,0.50,0.6250,Positive
29683,1,abandong decleor heavy,-0.20,0.5000,Negative


In [22]:
df_ngram['Sentiment'].value_counts()

Neutral     22698
Positive     5572
Negative     1415
Name: Sentiment, dtype: int64

In [23]:
df_ngram.to_excel("decleor_ngram.xlsx")