# Sentiment Analysis on the Extracted Tweets (CeraVe)

## 1 - Import the libraries

In [1]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from textblob import TextBlob

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline

import pandas as pd
import numpy as np
import re  
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Angela\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Angela\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2 - Import the tweets

In [2]:
# This are tweets mentioning the brand together with other key words in the last five years
# key words: "review, quality, packaging, scam, works, animal, testing, cruelty, free, skin, recommend, scam"
vichy = pd.read_csv("tweets_vichy.csv")
vichy = vichy.drop('Unnamed: 0', axis=1) # dropping a useless column

In [3]:
## NLP TRIGRAMS AND BIGRAMS
## BUSQUEDA DE PALABRAS CLAVE RELACIONADAS CON EL MUNDO DE LA COSMETICA 
## Y DE LA EMPRESA -- QUE PORCENTAJE SON MALOS? HAY ALGUNA ESPECIALMENTE MALA?
## Usar tokenizer 

## 3 - Conducting Sentiment Analysis

### 3.1 - Prepare Textual Data for Sentiment Analysis

In [4]:
# 1 - Text cleaning
def clean_up(s):
    clean = re.sub(r'http\S+', '', s)
    return re.sub('[^A-Za-z ]+', ' ', clean).lower().strip()

def clean_up2(s):
    clean2 = re.sub(r'\W*\b\w{1,3}\b', '', s)
    return re.sub('[^A-Za-z ]+', ' ', clean2).lower().strip()

# 2 - Tokenization
def tokenize(s):
    return nltk.word_tokenize(s)

# 3 - Stem and Lemmatize
def stem_and_lemmatize(l):
    ps = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    list_words = []
    
    for w in l:
        s = ps.stem(w)
        s = lemmatizer.lemmatize(s)
        list_words += [s] 
    return list_words

# 4 - Stop words removal
def remove_stopwords(l):
    stop_words = stopwords.words('english')
    return[w for w in l if w not in stop_words]

In [5]:
# In order to avoid errors, change type to string
vichy['Tweet'] = vichy['Tweet'].apply(str)  # change to string for avoiding errors

In [6]:
vichy['Tweet'] = vichy['Tweet'].apply(clean_up).apply(clean_up2).apply(tokenize).apply(remove_stopwords)
vichy

Unnamed: 0,Date,User,Tweet
0,2022-12-30 22:50:24+00:00,eseead_ca,"[vichy, mineral, hyaluronic, acid, serum, face..."
1,2022-12-29 18:56:09+00:00,Earthlite,"[vichy, showers, great, induce, relaxation, be..."
2,2022-12-29 07:10:34+00:00,EmoKidAtHeart,"[update, vichy, serum, broke, even, sensitive,..."
3,2022-12-27 17:00:01+00:00,Vichy_USA,"[stress, skin, need, address, essential, needs..."
4,2022-12-26 16:00:26+00:00,BestPharmacyGr,"[vichy, super, healthy, skin, vichylaboratoire..."
...,...,...,...
1726,2020-01-03 22:59:33+00:00,Vichy_USA,"[sensitive, skin, requires, extra, hydration, ..."
1727,2020-01-03 17:39:25+00:00,_Guchu,"[vichy, mineral, water, awesome, packaging, ni..."
1728,2020-01-03 14:57:11+00:00,flankericeman,"[howthox, vichy, nutrilogie, skin]"
1729,2020-01-02 18:17:32+00:00,SashaHopeWrites,"[mikkuchan, clinique, vichy, best, expensive, ..."


### 3.2 - Get Polarity and Subjectivity of Tweets

In [7]:
# To avoid problems
vichy['Tweet'] = vichy['Tweet'].apply(str)
tweet = vichy['Tweet']

In [8]:
# Create a function to get subjectivity
def getSubjectivity(tweet):
    return TextBlob(tweet).sentiment.subjectivity
# Create a function to get polarity
def getPolarity(tweet):
    return TextBlob(tweet).sentiment.polarity

# Create two new columns called 'Subjectivity' and 'Polarity'
vichy['Subjectivity'] = vichy['Tweet'].apply(getSubjectivity)
vichy['Polarity'] = vichy['Tweet'].apply(getPolarity)

In [9]:
vichy

Unnamed: 0,Date,User,Tweet,Subjectivity,Polarity
0,2022-12-30 22:50:24+00:00,eseead_ca,"['vichy', 'mineral', 'hyaluronic', 'acid', 'se...",0.433333,0.066667
1,2022-12-29 18:56:09+00:00,Earthlite,"['vichy', 'showers', 'great', 'induce', 'relax...",0.517857,0.310714
2,2022-12-29 07:10:34+00:00,EmoKidAtHeart,"['update', 'vichy', 'serum', 'broke', 'even', ...",0.900000,0.100000
3,2022-12-27 17:00:01+00:00,Vichy_USA,"['stress', 'skin', 'need', 'address', 'essenti...",0.400000,0.208333
4,2022-12-26 16:00:26+00:00,BestPharmacyGr,"['vichy', 'super', 'healthy', 'skin', 'vichyla...",0.583333,0.416667
...,...,...,...,...,...
1726,2020-01-03 22:59:33+00:00,Vichy_USA,"['sensitive', 'skin', 'requires', 'extra', 'hy...",0.460000,0.030000
1727,2020-01-03 17:39:25+00:00,_Guchu,"['vichy', 'mineral', 'water', 'awesome', 'pack...",0.750000,0.608333
1728,2020-01-03 14:57:11+00:00,flankericeman,"['howthox', 'vichy', 'nutrilogie', 'skin']",0.000000,0.000000
1729,2020-01-02 18:17:32+00:00,SashaHopeWrites,"['mikkuchan', 'clinique', 'vichy', 'best', 'ex...",0.613333,0.270000


In [10]:
# Create a function to get the sentiment text
def getSentiment(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

In [11]:
# Create a column to store the text sentiment
vichy['Sentiment'] = vichy['Polarity'].apply(getSentiment)
# Show the data
vichy

Unnamed: 0,Date,User,Tweet,Subjectivity,Polarity,Sentiment
0,2022-12-30 22:50:24+00:00,eseead_ca,"['vichy', 'mineral', 'hyaluronic', 'acid', 'se...",0.433333,0.066667,Positive
1,2022-12-29 18:56:09+00:00,Earthlite,"['vichy', 'showers', 'great', 'induce', 'relax...",0.517857,0.310714,Positive
2,2022-12-29 07:10:34+00:00,EmoKidAtHeart,"['update', 'vichy', 'serum', 'broke', 'even', ...",0.900000,0.100000,Positive
3,2022-12-27 17:00:01+00:00,Vichy_USA,"['stress', 'skin', 'need', 'address', 'essenti...",0.400000,0.208333,Positive
4,2022-12-26 16:00:26+00:00,BestPharmacyGr,"['vichy', 'super', 'healthy', 'skin', 'vichyla...",0.583333,0.416667,Positive
...,...,...,...,...,...,...
1726,2020-01-03 22:59:33+00:00,Vichy_USA,"['sensitive', 'skin', 'requires', 'extra', 'hy...",0.460000,0.030000,Positive
1727,2020-01-03 17:39:25+00:00,_Guchu,"['vichy', 'mineral', 'water', 'awesome', 'pack...",0.750000,0.608333,Positive
1728,2020-01-03 14:57:11+00:00,flankericeman,"['howthox', 'vichy', 'nutrilogie', 'skin']",0.000000,0.000000,Neutral
1729,2020-01-02 18:17:32+00:00,SashaHopeWrites,"['mikkuchan', 'clinique', 'vichy', 'best', 'ex...",0.613333,0.270000,Positive


In [12]:
vichy['Sentiment'].value_counts()

Positive    1161
Neutral      417
Negative     153
Name: Sentiment, dtype: int64

In [13]:
vichy.to_excel('vichy_clean.xlsx')

### 3.3 - Plot the results

It will take a HUUUGE time to lost and it is time consuming. We have no time to loose so we will visualize the Data with Tableau

## 4 - Sentiment Analysis of Bigram/Trigram

In [14]:
# We will explore word associations
# N-Grams analysis are often used to see which words often show up together
# N-Gram -- Contiguous sequence of n items from a given sample of text

In [15]:
stoplist = stopwords.words('english')

In [17]:
tweet = vichy['Tweet']

In [18]:
c_vec = CountVectorizer(stop_words=stoplist, ngram_range=(2,6))
# Matrix of ngrams
ngrams = c_vec.fit_transform(tweet)
# Count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)
# List of NGrams
vocab = c_vec.vocabulary_
df_ngram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'bigram/trigram'})

In [19]:
# Similar to the sentiment analysis before, we can calculate the polarity and subjectivity for each bigram/trigram
df_ngram['polarity'] = df_ngram['bigram/trigram'].apply(lambda x: TextBlob(x).polarity)
df_ngram['subjective'] = df_ngram['bigram/trigram'].apply(lambda x: TextBlob(x).subjectivity)

In [20]:
df_ngram

Unnamed: 0,frequency,bigram/trigram,polarity,subjective
0,213,vichyusa vichylover,0.0,0.0
1,181,hyaluronic acid,0.0,0.0
2,176,sensitive skin,0.1,0.9
3,162,vichy mineral,0.0,0.0
4,129,skin care,0.0,0.0
...,...,...,...,...
100757,1,aadvmx dermatology vichyataad,0.0,0.0
100758,1,aadvmx bookmark link dermatology vichyscience,0.0,0.0
100759,1,aadvmx bookmark link dermatology,0.0,0.0
100760,1,aadvmx bookmark link,0.0,0.0


In [21]:
# Create a function to get the sentiment text
def getSentiment(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

In [22]:
# Create a column to store the text sentiment
df_ngram['Sentiment'] = df_ngram['polarity'].apply(getSentiment)
# Show the data
df_ngram

Unnamed: 0,frequency,bigram/trigram,polarity,subjective,Sentiment
0,213,vichyusa vichylover,0.0,0.0,Neutral
1,181,hyaluronic acid,0.0,0.0,Neutral
2,176,sensitive skin,0.1,0.9,Positive
3,162,vichy mineral,0.0,0.0,Neutral
4,129,skin care,0.0,0.0,Neutral
...,...,...,...,...,...
100757,1,aadvmx dermatology vichyataad,0.0,0.0,Neutral
100758,1,aadvmx bookmark link dermatology vichyscience,0.0,0.0,Neutral
100759,1,aadvmx bookmark link dermatology,0.0,0.0,Neutral
100760,1,aadvmx bookmark link,0.0,0.0,Neutral


In [23]:
df_ngram['Sentiment'].value_counts()

Neutral     68387
Positive    26853
Negative     5522
Name: Sentiment, dtype: int64

In [24]:
df_ngram.to_excel("vichy_ngram.xlsx")