# Sentiment Analysis on the Extracted Tweets (CeraVe)

## 1 - Import the libraries

In [24]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from textblob import TextBlob

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline

import pandas as pd
import numpy as np
import re  
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Angela\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Angela\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2 - Import the tweets

In [25]:
# This are tweets mentioning the brand together with other key words in the last five years
# key words: "review, quality, packaging, scam, works, animal, testing, cruelty, free, skin, recommend, scam"
larocheposay = pd.read_csv("tweets_Larocheposay.csv")
larocheposay = larocheposay.drop('Unnamed: 0', axis=1) # dropping a useless column

In [26]:
## NLP TRIGRAMS AND BIGRAMS
## BUSQUEDA DE PALABRAS CLAVE RELACIONADAS CON EL MUNDO DE LA COSMETICA 
## Y DE LA EMPRESA -- QUE PORCENTAJE SON MALOS? HAY ALGUNA ESPECIALMENTE MALA?
## Usar tokenizer 

## 3 - Conducting Sentiment Analysis

### 3.1 - Prepare Textual Data for Sentiment Analysis

In [27]:
# 1 - Text cleaning
def clean_up(s):
    clean = re.sub(r'http\S+', '', s)
    return re.sub('[^A-Za-z ]+', ' ', clean).lower().strip()

def clean_up2(s):
    clean2 = re.sub(r'\W*\b\w{1,3}\b', '', s)
    return re.sub('[^A-Za-z ]+', ' ', clean2).lower().strip()

# 2 - Tokenization
def tokenize(s):
    return nltk.word_tokenize(s)

# 3 - Stem and Lemmatize
def stem_and_lemmatize(l):
    ps = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    list_words = []
    
    for w in l:
        s = ps.stem(w)
        s = lemmatizer.lemmatize(s)
        list_words += [s] 
    return list_words

# 4 - Stop words removal
def remove_stopwords(l):
    stop_words = stopwords.words('english')
    return[w for w in l if w not in stop_words]

In [28]:
# In order to avoid errors, change type to string
larocheposay['Tweet'] = larocheposay['Tweet'].apply(str)  # change to string for avoiding errors

In [29]:
larocheposay['Tweet'] = larocheposay['Tweet'].apply(clean_up).apply(clean_up2).apply(tokenize).apply(remove_stopwords)
larocheposay

Unnamed: 0,Date,User,Tweet
0,2022-12-31 23:45:55+00:00,AomLeland88,"[roche, posay, toleriane, ultra, face, makeup,..."
1,2022-12-31 23:45:07+00:00,SAlertPro,"[roche, posay, substiane, riche, face, moistur..."
2,2022-12-31 23:07:37+00:00,zizzycarter,"[care, much, scented, skincare, products, quit..."
3,2022-12-31 22:18:24+00:00,lemkenxfhc,"[roche, posay, toleriane, purifying, foaming, ..."
4,2022-12-31 21:38:21+00:00,_titilay0_,"[need, roche, posay, skin, starting, crack]"
...,...,...,...
19995,2021-02-19 17:39:28+00:00,sevenjetc,"[dannypsavage, five, minutes, need, save, mone..."
19996,2021-02-19 16:51:02+00:00,izzatchubbz,"[roche, posay, baume, whitecast, sikit, tekeju..."
19997,2021-02-19 16:20:58+00:00,fritkis,"[imspeaking, thinking, cucumber, rosemary, roc..."
19998,2021-02-19 15:47:16+00:00,tyler_steele,"[larocheposayusa, launched, save, skin, annual..."


### 3.2 - Get Polarity and Subjectivity of Tweets

In [30]:
# To avoid problems
larocheposay['Tweet'] = larocheposay['Tweet'].apply(str)
tweet = larocheposay['Tweet']

In [31]:
# Create a function to get subjectivity
def getSubjectivity(tweet):
    return TextBlob(tweet).sentiment.subjectivity
# Create a function to get polarity
def getPolarity(tweet):
    return TextBlob(tweet).sentiment.polarity

# Create two new columns called 'Subjectivity' and 'Polarity'
larocheposay['Subjectivity'] = larocheposay['Tweet'].apply(getSubjectivity)
larocheposay['Polarity'] = larocheposay['Tweet'].apply(getPolarity)

In [32]:
larocheposay

Unnamed: 0,Date,User,Tweet,Subjectivity,Polarity
0,2022-12-31 23:45:55+00:00,AomLeland88,"['roche', 'posay', 'toleriane', 'ultra', 'face...",0.000000,0.000000
1,2022-12-31 23:45:07+00:00,SAlertPro,"['roche', 'posay', 'substiane', 'riche', 'face...",0.000000,0.000000
2,2022-12-31 23:07:37+00:00,zizzycarter,"['care', 'much', 'scented', 'skincare', 'produ...",0.200000,0.200000
3,2022-12-31 22:18:24+00:00,lemkenxfhc,"['roche', 'posay', 'toleriane', 'purifying', '...",0.566667,0.166667
4,2022-12-31 21:38:21+00:00,_titilay0_,"['need', 'roche', 'posay', 'skin', 'starting',...",0.100000,0.000000
...,...,...,...,...,...
19995,2021-02-19 17:39:28+00:00,sevenjetc,"['dannypsavage', 'five', 'minutes', 'need', 's...",0.667857,0.342857
19996,2021-02-19 16:51:02+00:00,izzatchubbz,"['roche', 'posay', 'baume', 'whitecast', 'siki...",0.000000,0.000000
19997,2021-02-19 16:20:58+00:00,fritkis,"['imspeaking', 'thinking', 'cucumber', 'rosema...",0.500000,0.250000
19998,2021-02-19 15:47:16+00:00,tyler_steele,"['larocheposayusa', 'launched', 'save', 'skin'...",0.066667,0.000000


In [33]:
# Create a function to get the sentiment text
def getSentiment(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

In [34]:
# Create a column to store the text sentiment
larocheposay['Sentiment'] = larocheposay['Polarity'].apply(getSentiment)
# Show the data
larocheposay

Unnamed: 0,Date,User,Tweet,Subjectivity,Polarity,Sentiment
0,2022-12-31 23:45:55+00:00,AomLeland88,"['roche', 'posay', 'toleriane', 'ultra', 'face...",0.000000,0.000000,Neutral
1,2022-12-31 23:45:07+00:00,SAlertPro,"['roche', 'posay', 'substiane', 'riche', 'face...",0.000000,0.000000,Neutral
2,2022-12-31 23:07:37+00:00,zizzycarter,"['care', 'much', 'scented', 'skincare', 'produ...",0.200000,0.200000,Positive
3,2022-12-31 22:18:24+00:00,lemkenxfhc,"['roche', 'posay', 'toleriane', 'purifying', '...",0.566667,0.166667,Positive
4,2022-12-31 21:38:21+00:00,_titilay0_,"['need', 'roche', 'posay', 'skin', 'starting',...",0.100000,0.000000,Neutral
...,...,...,...,...,...,...
19995,2021-02-19 17:39:28+00:00,sevenjetc,"['dannypsavage', 'five', 'minutes', 'need', 's...",0.667857,0.342857,Positive
19996,2021-02-19 16:51:02+00:00,izzatchubbz,"['roche', 'posay', 'baume', 'whitecast', 'siki...",0.000000,0.000000,Neutral
19997,2021-02-19 16:20:58+00:00,fritkis,"['imspeaking', 'thinking', 'cucumber', 'rosema...",0.500000,0.250000,Positive
19998,2021-02-19 15:47:16+00:00,tyler_steele,"['larocheposayusa', 'launched', 'save', 'skin'...",0.066667,0.000000,Neutral


In [35]:
larocheposay['Sentiment'].value_counts()

Positive    9874
Neutral     8177
Negative    1949
Name: Sentiment, dtype: int64

In [36]:
larocheposay.to_excel('larocheposay_clean.xlsx')

### 3.3 - Plot the results

It will take a HUUUGE time to lost and it is time consuming. We have no time to loose so we will visualize the Data with Tableau

## 4 - Sentiment Analysis of Bigram/Trigram

In [37]:
# We will explore word associations
# N-Grams analysis are often used to see which words often show up together
# N-Gram -- Contiguous sequence of n items from a given sample of text

In [38]:
stoplist = stopwords.words('english')

In [39]:
tweet = larocheposay['Tweet'].sample(10000)

In [40]:
c_vec = CountVectorizer(stop_words=stoplist, ngram_range=(2,3))
# Matrix of ngrams
ngrams = c_vec.fit_transform(tweet)
# Count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)
# List of NGrams
vocab = c_vec.vocabulary_
df_ngram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'bigram/trigram'})

In [41]:
# Similar to the sentiment analysis before, we can calculate the polarity and subjectivity for each bigram/trigram
df_ngram['polarity'] = df_ngram['bigram/trigram'].apply(lambda x: TextBlob(x).polarity)
df_ngram['subjective'] = df_ngram['bigram/trigram'].apply(lambda x: TextBlob(x).subjectivity)

In [42]:
df_ngram

Unnamed: 0,frequency,bigram/trigram,polarity,subjective
0,9717,roche posay,0.0,0.0
1,744,roche posay anthelios,0.0,0.0
2,744,posay anthelios,0.0,0.0
3,694,posay effaclar,0.0,0.0
4,678,roche posay effaclar,0.0,0.0
...,...,...,...,...
129446,1,aaaah skin,0.0,0.0
129447,1,aaaaaahhhhhhh roche posay,0.0,0.0
129448,1,aaaaaahhhhhhh roche,0.0,0.0
129449,1,aaaaaaaaaaaaa wait thnk,0.0,0.0


In [43]:
# Create a function to get the sentiment text
def getSentiment(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

In [44]:
# Create a column to store the text sentiment
df_ngram['Sentiment'] = df_ngram['polarity'].apply(getSentiment)
# Show the data
df_ngram

Unnamed: 0,frequency,bigram/trigram,polarity,subjective,Sentiment
0,9717,roche posay,0.0,0.0,Neutral
1,744,roche posay anthelios,0.0,0.0,Neutral
2,744,posay anthelios,0.0,0.0,Neutral
3,694,posay effaclar,0.0,0.0,Neutral
4,678,roche posay effaclar,0.0,0.0,Neutral
...,...,...,...,...,...
129446,1,aaaah skin,0.0,0.0,Neutral
129447,1,aaaaaahhhhhhh roche posay,0.0,0.0,Neutral
129448,1,aaaaaahhhhhhh roche,0.0,0.0,Neutral
129449,1,aaaaaaaaaaaaa wait thnk,0.0,0.0,Neutral


In [45]:
df_ngram['Sentiment'].value_counts()

Neutral     97047
Positive    24663
Negative     7741
Name: Sentiment, dtype: int64

In [46]:
df_ngram.to_excel('laroche_ngram.xlsx')