# Sentiment Analysis on the Extracted Tweets (CeraVe)

## 1 - Import the libraries

In [9]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from textblob import TextBlob

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline

import pandas as pd
import numpy as np
import re  
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Angela\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Angela\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2 - Import the tweets

In [10]:
# This are tweets mentioning the brand together with other key words in the last five years
# key words: "review, quality, packaging, scam, works, animal, testing, cruelty, free, skin, recommend, scam"
cerave = pd.read_csv("tweets_cerave.csv")
cerave = cerave.drop('Unnamed: 0', axis=1) # dropping a useless column

In [11]:
## NLP TRIGRAMS AND BIGRAMS
## BUSQUEDA DE PALABRAS CLAVE RELACIONADAS CON EL MUNDO DE LA COSMETICA 
## Y DE LA EMPRESA -- QUE PORCENTAJE SON MALOS? HAY ALGUNA ESPECIALMENTE MALA?
## Usar tokenizer 

## 3 - Conducting Sentiment Analysis

### 3.1 - Prepare Textual Data for Sentiment Analysis

In [12]:
# 1 - Text cleaning
def clean_up(s):
    clean = re.sub(r'http\S+', '', s)
    return re.sub('[^A-Za-z ]+', ' ', clean).lower().strip()

def clean_up2(s):
    clean2 = re.sub(r'\W*\b\w{1,3}\b', '', s)
    return re.sub('[^A-Za-z ]+', ' ', clean2).lower().strip()

# 2 - Tokenization
def tokenize(s):
    return nltk.word_tokenize(s)

# 3 - Stem and Lemmatize
def stem_and_lemmatize(l):
    ps = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    list_words = []
    
    for w in l:
        s = ps.stem(w)
        s = lemmatizer.lemmatize(s)
        list_words += [s] 
    return list_words

# 4 - Stop words removal
def remove_stopwords(l):
    stop_words = stopwords.words('english')
    return[w for w in l if w not in stop_words]

In [13]:
# In order to avoid errors, change type to string
cerave['Tweet'] = cerave['Tweet'].apply(str)  # change to string for avoiding errors

In [14]:
cerave['Tweet'] = cerave['Tweet'].apply(clean_up).apply(clean_up2).apply(tokenize).apply(remove_stopwords)
cerave

Unnamed: 0,Date,User,Tweet
0,2022-12-31 23:36:46+00:00,lifeasa_HotMess,"[cerave, repair, cream, cream, dark, circles, ..."
1,2022-12-31 23:04:26+00:00,maylyncoc,"[illuminateaest, thank, much, seeing, used, va..."
2,2022-12-31 22:48:22+00:00,PS5Restocks,"[cerave, cleanser]"
3,2022-12-31 22:48:06+00:00,pluggrr,"[cerave, cleanser]"
4,2022-12-31 21:54:02+00:00,FatKidDeals,"[cerave, cleanser]"
...,...,...,...
19995,2022-08-20 13:42:34+00:00,sarcastamy,"[kendrawrites, passing, judgment, singular, bo..."
19996,2022-08-20 13:30:03+00:00,ChefMimiSanchez,"[cerave, sure, actually, restock]"
19997,2022-08-20 13:29:25+00:00,cerave,"[chefmimisanchez, glad, could, help, keep, hea..."
19998,2022-08-20 13:26:27+00:00,cerave,"[melanneyd, love]"


### 3.2 - Get Polarity and Subjectivity of Tweets

In [15]:
# To avoid problems
cerave['Tweet'] = cerave['Tweet'].apply(str)
tweet = cerave['Tweet']

In [16]:
# Create a function to get subjectivity
def getSubjectivity(tweet):
    return TextBlob(tweet).sentiment.subjectivity
# Create a function to get polarity
def getPolarity(tweet):
    return TextBlob(tweet).sentiment.polarity

# Create two new columns called 'Subjectivity' and 'Polarity'
cerave['Subjectivity'] = cerave['Tweet'].apply(getSubjectivity)
cerave['Polarity'] = cerave['Tweet'].apply(getPolarity)

In [17]:
cerave

Unnamed: 0,Date,User,Tweet,Subjectivity,Polarity
0,2022-12-31 23:36:46+00:00,lifeasa_HotMess,"['cerave', 'repair', 'cream', 'cream', 'dark',...",0.683333,0.033333
1,2022-12-31 23:04:26+00:00,maylyncoc,"['illuminateaest', 'thank', 'much', 'seeing', ...",0.350000,0.100000
2,2022-12-31 22:48:22+00:00,PS5Restocks,"['cerave', 'cleanser']",0.000000,0.000000
3,2022-12-31 22:48:06+00:00,pluggrr,"['cerave', 'cleanser']",0.000000,0.000000
4,2022-12-31 21:54:02+00:00,FatKidDeals,"['cerave', 'cleanser']",0.000000,0.000000
...,...,...,...,...,...
19995,2022-08-20 13:42:34+00:00,sarcastamy,"['kendrawrites', 'passing', 'judgment', 'singu...",0.625000,0.000000
19996,2022-08-20 13:30:03+00:00,ChefMimiSanchez,"['cerave', 'sure', 'actually', 'restock']",0.494444,0.250000
19997,2022-08-20 13:29:25+00:00,cerave,"['chefmimisanchez', 'glad', 'could', 'help', '...",0.750000,0.500000
19998,2022-08-20 13:26:27+00:00,cerave,"['melanneyd', 'love']",0.600000,0.500000


In [18]:
# Create a function to get the sentiment text
def getSentiment(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

In [19]:
# Create a column to store the text sentiment
cerave['Sentiment'] = cerave['Polarity'].apply(getSentiment)
# Show the data
cerave

Unnamed: 0,Date,User,Tweet,Subjectivity,Polarity,Sentiment
0,2022-12-31 23:36:46+00:00,lifeasa_HotMess,"['cerave', 'repair', 'cream', 'cream', 'dark',...",0.683333,0.033333,Positive
1,2022-12-31 23:04:26+00:00,maylyncoc,"['illuminateaest', 'thank', 'much', 'seeing', ...",0.350000,0.100000,Positive
2,2022-12-31 22:48:22+00:00,PS5Restocks,"['cerave', 'cleanser']",0.000000,0.000000,Neutral
3,2022-12-31 22:48:06+00:00,pluggrr,"['cerave', 'cleanser']",0.000000,0.000000,Neutral
4,2022-12-31 21:54:02+00:00,FatKidDeals,"['cerave', 'cleanser']",0.000000,0.000000,Neutral
...,...,...,...,...,...,...
19995,2022-08-20 13:42:34+00:00,sarcastamy,"['kendrawrites', 'passing', 'judgment', 'singu...",0.625000,0.000000,Neutral
19996,2022-08-20 13:30:03+00:00,ChefMimiSanchez,"['cerave', 'sure', 'actually', 'restock']",0.494444,0.250000,Positive
19997,2022-08-20 13:29:25+00:00,cerave,"['chefmimisanchez', 'glad', 'could', 'help', '...",0.750000,0.500000,Positive
19998,2022-08-20 13:26:27+00:00,cerave,"['melanneyd', 'love']",0.600000,0.500000,Positive


In [20]:
cerave['Sentiment'].value_counts()

Neutral     9058
Positive    8574
Negative    2368
Name: Sentiment, dtype: int64

In [21]:
cerave.to_excel('cerave_clean.xlsx')

### 3.3 - Plot the results

It will take a HUUUGE time to lost and it is time consuming. We have no time to loose so we will visualize the Data with Tableau

## 4 - Sentiment Analysis of Bigram/Trigram

In [22]:
# We will explore word associations
# N-Grams analysis are often used to see which words often show up together
# N-Gram -- Contiguous sequence of n items from a given sample of text

In [23]:
stoplist = stopwords.words('english')

In [34]:
tweet = cerave['Tweet'].sample(7000)

In [35]:
c_vec = CountVectorizer(stop_words=stoplist, ngram_range=(2,6))
# Matrix of ngrams
ngrams = c_vec.fit_transform(tweet)
# Count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)
# List of NGrams
vocab = c_vec.vocabulary_
df_ngram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'bigram/trigram'})

In [36]:
# Similar to the sentiment analysis before, we can calculate the polarity and subjectivity for each bigram/trigram
df_ngram['polarity'] = df_ngram['bigram/trigram'].apply(lambda x: TextBlob(x).polarity)
df_ngram['subjective'] = df_ngram['bigram/trigram'].apply(lambda x: TextBlob(x).subjectivity)

In [37]:
df_ngram

Unnamed: 0,frequency,bigram/trigram,polarity,subjective
0,544,hyaluronic acid,0.000000,0.000000
1,435,face wash,0.000000,0.000000
2,350,cerave cleanser,0.000000,0.000000
3,342,cerave hydrating,0.000000,0.000000
4,336,facial cleanser,0.000000,0.000000
...,...,...,...,...
204729,1,aaaah skin super sensitive avoid,0.216667,0.783333
204730,1,aaaah skin super sensitive,0.216667,0.783333
204731,1,aaaah skin super,0.333333,0.666667
204732,1,aaaah skin,0.000000,0.000000


In [38]:
# Create a function to get the sentiment text
def getSentiment(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

In [39]:
# Create a column to store the text sentiment
df_ngram['Sentiment'] = df_ngram['polarity'].apply(getSentiment)
# Show the data
df_ngram

Unnamed: 0,frequency,bigram/trigram,polarity,subjective,Sentiment
0,544,hyaluronic acid,0.000000,0.000000,Neutral
1,435,face wash,0.000000,0.000000,Neutral
2,350,cerave cleanser,0.000000,0.000000,Neutral
3,342,cerave hydrating,0.000000,0.000000,Neutral
4,336,facial cleanser,0.000000,0.000000,Neutral
...,...,...,...,...,...
204729,1,aaaah skin super sensitive avoid,0.216667,0.783333,Positive
204730,1,aaaah skin super sensitive,0.216667,0.783333,Positive
204731,1,aaaah skin super,0.333333,0.666667,Positive
204732,1,aaaah skin,0.000000,0.000000,Neutral


In [40]:
df_ngram['Sentiment'].value_counts()

Neutral     135721
Positive     49910
Negative     19103
Name: Sentiment, dtype: int64

In [41]:
# save to df this too
df_ngram.to_excel("cerave_ngram.xlsx")