# Sentiment Analysis on the Extracted Tweets (CeraVe)

## 1 - Import the libraries

In [1]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from textblob import TextBlob

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline

import pandas as pd
import numpy as np
import re  
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Angela\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Angela\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2 - Import the tweets

In [2]:
# This are tweets mentioning the brand together with other key words in the last five years
# key words: "review, quality, packaging, scam, works, animal, testing, cruelty, free, skin, recommend, scam"
skinceuticals = pd.read_csv("tweets_skinceuticals.csv")
skinceuticals = skinceuticals.drop('Unnamed: 0', axis=1) # dropping a useless column

In [3]:
## NLP TRIGRAMS AND BIGRAMS
## BUSQUEDA DE PALABRAS CLAVE RELACIONADAS CON EL MUNDO DE LA COSMETICA 
## Y DE LA EMPRESA -- QUE PORCENTAJE SON MALOS? HAY ALGUNA ESPECIALMENTE MALA?
## Usar tokenizer 

## 3 - Conducting Sentiment Analysis

### 3.1 - Prepare Textual Data for Sentiment Analysis

In [4]:
# 1 - Text cleaning
def clean_up(s):
    clean = re.sub(r'http\S+', '', s)
    return re.sub('[^A-Za-z ]+', ' ', clean).lower().strip()

def clean_up2(s):
    clean2 = re.sub(r'\W*\b\w{1,3}\b', '', s)
    return re.sub('[^A-Za-z ]+', ' ', clean2).lower().strip()

# 2 - Tokenization
def tokenize(s):
    return nltk.word_tokenize(s)

# 3 - Stem and Lemmatize
def stem_and_lemmatize(l):
    ps = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    list_words = []
    
    for w in l:
        s = ps.stem(w)
        s = lemmatizer.lemmatize(s)
        list_words += [s] 
    return list_words

# 4 - Stop words removal
def remove_stopwords(l):
    stop_words = stopwords.words('english')
    return[w for w in l if w not in stop_words]

In [5]:
# In order to avoid errors, change type to string
skinceuticals['Tweet'] = skinceuticals['Tweet'].apply(str)  # change to string for avoiding errors

In [6]:
skinceuticals['Tweet'] = skinceuticals['Tweet'].apply(clean_up).apply(clean_up2).apply(tokenize).apply(remove_stopwords)
skinceuticals

Unnamed: 0,Date,User,Tweet
0,2022-12-31 16:02:10+00:00,BestMRDeals,"[skinceuticals, biocellulose, restorative, mas..."
1,2022-12-31 16:00:54+00:00,BestMRDeals,"[skinceuticals, clarifying, clay, mask, price,..."
2,2022-12-31 15:11:45+00:00,BestMRDeals,"[skinceuticals, phyto, corrective, mask, price..."
3,2022-12-31 15:10:58+00:00,dickensjfgqa,"[skinceuticals, hydrating, thkbedo]"
4,2022-12-31 14:56:08+00:00,BestMRDeals,"[skinceuticals, hydrating, mask, price, dermst..."
...,...,...,...
15586,2020-01-01 02:31:50+00:00,BlakeMcCoyTV,"[axfrommn, skinceuticals, glass, champagne, sh..."
15587,2020-01-01 02:28:12+00:00,BlakeMcCoyTV,"[bowlermikey, skinceuticals, think, young, goo..."
15588,2020-01-01 02:28:07+00:00,AXfromMN,"[blakemccoydc, skinceuticals, never, skincare,..."
15589,2020-01-01 02:24:52+00:00,bowlermikey,"[blakemccoydc, skinceuticals, young, stuff, an..."


### 3.2 - Get Polarity and Subjectivity of Tweets

In [7]:
# To avoid problems
skinceuticals['Tweet'] = skinceuticals['Tweet'].apply(str)
tweet = skinceuticals['Tweet']

In [8]:
# Create a function to get subjectivity
def getSubjectivity(tweet):
    return TextBlob(tweet).sentiment.subjectivity
# Create a function to get polarity
def getPolarity(tweet):
    return TextBlob(tweet).sentiment.polarity

# Create two new columns called 'Subjectivity' and 'Polarity'
skinceuticals['Subjectivity'] = skinceuticals['Tweet'].apply(getSubjectivity)
skinceuticals['Polarity'] = skinceuticals['Tweet'].apply(getPolarity)

In [9]:
skinceuticals

Unnamed: 0,Date,User,Tweet,Subjectivity,Polarity
0,2022-12-31 16:02:10+00:00,BestMRDeals,"['skinceuticals', 'biocellulose', 'restorative...",0.000000,0.000000
1,2022-12-31 16:00:54+00:00,BestMRDeals,"['skinceuticals', 'clarifying', 'clay', 'mask'...",0.000000,0.000000
2,2022-12-31 15:11:45+00:00,BestMRDeals,"['skinceuticals', 'phyto', 'corrective', 'mask...",0.000000,0.000000
3,2022-12-31 15:10:58+00:00,dickensjfgqa,"['skinceuticals', 'hydrating', 'thkbedo']",0.000000,0.000000
4,2022-12-31 14:56:08+00:00,BestMRDeals,"['skinceuticals', 'hydrating', 'mask', 'price'...",0.000000,0.000000
...,...,...,...,...,...
15586,2020-01-01 02:31:50+00:00,BlakeMcCoyTV,"['axfrommn', 'skinceuticals', 'glass', 'champa...",0.000000,0.000000
15587,2020-01-01 02:28:12+00:00,BlakeMcCoyTV,"['bowlermikey', 'skinceuticals', 'think', 'you...",0.500000,0.400000
15588,2020-01-01 02:28:07+00:00,AXfromMN,"['blakemccoydc', 'skinceuticals', 'never', 'sk...",0.000000,0.000000
15589,2020-01-01 02:24:52+00:00,bowlermikey,"['blakemccoydc', 'skinceuticals', 'young', 'st...",0.400000,0.100000


In [10]:
# Create a function to get the sentiment text
def getSentiment(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

In [11]:
# Create a column to store the text sentiment
skinceuticals['Sentiment'] = skinceuticals['Polarity'].apply(getSentiment)
# Show the data
skinceuticals

Unnamed: 0,Date,User,Tweet,Subjectivity,Polarity,Sentiment
0,2022-12-31 16:02:10+00:00,BestMRDeals,"['skinceuticals', 'biocellulose', 'restorative...",0.000000,0.000000,Neutral
1,2022-12-31 16:00:54+00:00,BestMRDeals,"['skinceuticals', 'clarifying', 'clay', 'mask'...",0.000000,0.000000,Neutral
2,2022-12-31 15:11:45+00:00,BestMRDeals,"['skinceuticals', 'phyto', 'corrective', 'mask...",0.000000,0.000000,Neutral
3,2022-12-31 15:10:58+00:00,dickensjfgqa,"['skinceuticals', 'hydrating', 'thkbedo']",0.000000,0.000000,Neutral
4,2022-12-31 14:56:08+00:00,BestMRDeals,"['skinceuticals', 'hydrating', 'mask', 'price'...",0.000000,0.000000,Neutral
...,...,...,...,...,...,...
15586,2020-01-01 02:31:50+00:00,BlakeMcCoyTV,"['axfrommn', 'skinceuticals', 'glass', 'champa...",0.000000,0.000000,Neutral
15587,2020-01-01 02:28:12+00:00,BlakeMcCoyTV,"['bowlermikey', 'skinceuticals', 'think', 'you...",0.500000,0.400000,Positive
15588,2020-01-01 02:28:07+00:00,AXfromMN,"['blakemccoydc', 'skinceuticals', 'never', 'sk...",0.000000,0.000000,Neutral
15589,2020-01-01 02:24:52+00:00,bowlermikey,"['blakemccoydc', 'skinceuticals', 'young', 'st...",0.400000,0.100000,Positive


In [12]:
skinceuticals['Sentiment'].value_counts()

Positive    8679
Neutral     5707
Negative    1205
Name: Sentiment, dtype: int64

In [13]:
skinceuticals.to_excel('skinceuticals_clean.xlsx')

### 3.3 - Plot the results

It will take a HUUUGE time to lost and it is time consuming. We have no time to loose so we will visualize the Data with Tableau

## 4 - Sentiment Analysis of Bigram/Trigram

In [14]:
# We will explore word associations
# N-Grams analysis are often used to see which words often show up together
# N-Gram -- Contiguous sequence of n items from a given sample of text

In [15]:
stoplist = stopwords.words('english')

In [16]:
tweet = skinceuticals['Tweet'].sample(10000)

In [17]:
c_vec = CountVectorizer(stop_words=stoplist, ngram_range=(2,3))
# Matrix of ngrams
ngrams = c_vec.fit_transform(tweet)
# Count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)
# List of NGrams
vocab = c_vec.vocabulary_
df_ngram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'bigram/trigram'})

In [18]:
# Similar to the sentiment analysis before, we can calculate the polarity and subjectivity for each bigram/trigram
df_ngram['polarity'] = df_ngram['bigram/trigram'].apply(lambda x: TextBlob(x).polarity)
df_ngram['subjective'] = df_ngram['bigram/trigram'].apply(lambda x: TextBlob(x).subjectivity)

In [19]:
df_ngram

Unnamed: 0,frequency,bigram/trigram,polarity,subjective
0,639,skinceuticals ferulic,0.00,0.000000
1,332,vitamin serum,0.00,0.000000
2,305,free skinceuticals,0.40,0.800000
3,265,triple lipid,0.00,0.000000
4,255,skinceuticals products,0.00,0.000000
...,...,...,...,...
130568,1,aadvmx first,0.25,0.333333
130569,1,aadmember asdsskinmd skinceuticals,0.00,0.000000
130570,1,aadmember asdsskinmd,0.00,0.000000
130571,1,aaahhh hyunson binjin,0.00,0.000000


In [20]:
# Create a function to get the sentiment text
def getSentiment(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

In [21]:
# Create a column to store the text sentiment
df_ngram['Sentiment'] = df_ngram['polarity'].apply(getSentiment)
# Show the data
df_ngram

Unnamed: 0,frequency,bigram/trigram,polarity,subjective,Sentiment
0,639,skinceuticals ferulic,0.00,0.000000,Neutral
1,332,vitamin serum,0.00,0.000000,Neutral
2,305,free skinceuticals,0.40,0.800000,Positive
3,265,triple lipid,0.00,0.000000,Neutral
4,255,skinceuticals products,0.00,0.000000,Neutral
...,...,...,...,...,...
130568,1,aadvmx first,0.25,0.333333,Positive
130569,1,aadmember asdsskinmd skinceuticals,0.00,0.000000,Neutral
130570,1,aadmember asdsskinmd,0.00,0.000000,Neutral
130571,1,aaahhh hyunson binjin,0.00,0.000000,Neutral


In [22]:
df_ngram['Sentiment'].value_counts()

Neutral     100435
Positive     24191
Negative      5947
Name: Sentiment, dtype: int64

In [24]:
df_ngram.to_excel("skinceuticals_ngram.xlsx")