# ANALYSIS OF USER SENTIMENT ON TWITTER IN SPANISH ABOUT TECHNOLOGIES APPLYING TEXT MINING

# INSTALLATION OF LIBRARIES WITH PIP INSTALL

In [None]:
# Installing the Twint library from GitHub
#!pip install --user --upgrade -e git+https://github.com/twintproject/twint.git#egg=twint
!pip install --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint
#!pip uninstall twint
#!pip install git+git://github.com/ajctrl/twint@patch-1


In [None]:
!pip install torch

In [None]:
!pip install tensorflow

In [None]:
# No separate "pipeline" package is installed: the only `pipeline` this
# notebook imports comes from the Transformers library
# (`from transformers import pipeline`), which is installed in the next cell.
#!pip install pipeline

In [None]:
# Installing the Transformers library
!pip install transformers

# IMPORTING LIBRARIES

In [None]:
#For handling DataFrames
import pandas as pd

#For matrix management
import numpy as np

#To draw graphs
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

#For handling dates and times
from datetime import date, timedelta

#For Twitter data extraction
import twint

#When the Twint library is executed in Jupyter, the following error appears
# -> RuntimeError: This event loop is already running
#To solve this problem we apply the nest_asyncio library
import nest_asyncio
nest_asyncio.apply()

#For natural language processing
import nltk
nltk.download('popular')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

#To match text with regular expressions
#(for example special or alphabetic characters)
import re

#To build natural language processing (NLP) applications
#(used here for lemmatization)
import spacy

#For sentiment analysis
from transformers import pipeline

import collections
import itertools

# CONFIGURATION AND APPLICATION OF FILTERS IN TWINT FOR SEARCHING ON TWITTER

In [None]:
# Settings: build a twint configuration object and run the search with it
c = twint.Config()

c.Search = "Technology" #Extract only the tweets that contain this word
c.Limit = 100000 #Maximum number of tweets to collect
c.Since = "2023-01-01" #Extract the tweets published since this date
c.Pandas = True #Keep the results so they can be read back as a Pandas DataFrame
#c.Elasticsearch = "http://localhost:9200"
twint.run.Search(c) #Start searching

#Parameters taken from the documentation: https://github.com/twintproject/twint/wiki/Configuration

## Put Tweets in a Dataframe with Pandas

In [None]:
def column_names():
    """Return the column labels of the DataFrame held by twint's pandas output."""
    collected = twint.output.panda.Tweets_df
    return collected.columns
def twint_to_pd(columns):
    """Return the selection `columns` of the DataFrame held by twint's pandas output.

    `columns` is used directly as the indexer of that DataFrame.
    """
    collected = twint.output.panda.Tweets_df
    return collected[columns]

In [None]:
#see all columns
print(column_names())

In [None]:
# We store all the data in a DataFrame
tweet_df = twint_to_pd(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
       'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
       'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
       'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
       'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest'])

In [None]:
#Print the DataFrame
tweet_df

In [None]:
#See the total number of rows (and columns) in the data lake
tweet_df.shape

In [None]:
#Save the data lake to a csv file (without the index column)
tweet_df.to_csv("DataLake.csv", index=False)

## Using Required Columns

In [None]:
# We use specific columns
tweet_df = twint_to_pd(["date","username","language","tweet","nlikes"])

In [None]:
#Print the DataFrame
tweet_df

In [None]:
#We save only the columns that we are going to use in a csv
tweet_df.to_csv("DataTwitter.csv", index=False)

# WE LIMIT THE DATALAKE TO SPANISH

In [None]:
#Keep only the tweets whose language code is Spanish ("es").
#The variable keeps the name `tweet_eng` because the following cells read it
#under that name; it holds the Spanish subset.
#.copy() yields an independent DataFrame, so adding columns to it later does
#not raise pandas' SettingWithCopyWarning.
tweet_eng = tweet_df[tweet_df['language']=='es'].copy()

In [None]:
#Print the DataFrame with the language filter applied
tweet_eng

In [None]:
#We save with the Tweets filter in Spanish
tweet_eng.to_csv("DataTwitterES.csv", index=False)

## EDA of Dates

In [None]:
# Convert the "Date" column to a list
dates_list = tweet_eng['date'].to_list()

In [None]:

def _to_date(stamp):
    """Build a date from the year-month-day text found before the first space of `stamp`."""
    year, month, day = map(int, stamp.split(' ')[0].split('-'))
    return date(year, month, day)

# every entry converted to a date object, in chronological order
dates = sorted(map(_to_date, dates_list))

# earliest and latest day present in the data
min_date, max_date = dates[0], dates[-1]

# number of calendar days covered, both ends included
length = (max_date - min_date).days + 1

# histogram of the tweet dates
plt.figure(figsize=(12,8))
plt.hist(dates)
plt.show()

# PREPROCESSING

# Deleting Special Characters and StopWords

In [None]:
# Stop-word sets already read from NLTK, keyed by language, so the list is
# loaded once instead of once per tweet
_STOP_WORDS_CACHE = {}


def _spanish_stop_words():
    """Return the NLTK Spanish stop words as a set, loading them on first use."""
    if 'spanish' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['spanish'] = frozenset(stopwords.words('spanish'))
    return _STOP_WORDS_CACHE['spanish']


# Function for data cleaning
def cleanCharacters(tweet):
    """Clean one tweet and return its remaining words as a list of tokens.

    Removed, in this order: hashtags, @mentions, punctuation (every character
    that is neither a word character nor whitespace), links and numbers.
    The remaining text is tokenized with NLTK's `word_tokenize`, and tokens
    found in NLTK's Spanish stop-word list are dropped (the comparison is
    case-sensitive).
    """
    hash_text = re.sub(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", '', tweet) #Delete words with hashtag
    at_text = re.sub(r'(?:@[\w_]+)', '', hash_text) #Delete users with at
    sp_text = re.sub(r'[^\w\s]', '', at_text) #Delete punctuation marks
    # Links are removed from the punctuation-free text: a link has by now been
    # squeezed into a single run of characters that still starts with "http".
    text_sinlink = re.sub(r'http\S+', '', sp_text) #Delete http(s) links
    text_num = re.sub(r'(?:(?:\d+,?)+(?:\.?\d+)?)', '', text_sinlink) #Delete numbers
    word_tokens = word_tokenize(text_num)

    # Delete stop words (connectors such as "de", "la", "que", ...)
    stop_words = _spanish_stop_words()
    return [w for w in word_tokens if w not in stop_words]

# Example:
# cleanCharacters('''a #word I use @piero 955584741 😂 hi hi papa mama 😁 news 😁 (such as "the", "a", "a", "in") 😂😁😁 from a search engine such as http://saturdays.ai/2022/04/07/hapyness-analisis-de-sentimientos-en-la-poblacion-de-aragon/''')

In [None]:
#Apply the function "cleanCharacters" to every tweet to get its list of clean words,
#then join each list back into a single space-separated string.
#Everything is saved in the new column "tweets_transform"
tweet_eng["tweets_transform"] = tweet_eng["tweet"].apply(cleanCharacters).apply(lambda x:" ".join(x))

In [None]:
# Print the DataFrame with the new column "tweets_transform"
tweet_eng

In [None]:
#We save the Dataframe with the tweets without StopWords and Special Characters
tweet_eng.to_csv("DataTwitterStopWords.csv", index=False)

## We use only the column with the Transformed Tweets

In [None]:
#Drop the raw "tweet" column, keeping only the transformed text
tweet_limpio = tweet_eng.drop(['tweet'], axis=1)

In [None]:
#We print the new DataFrame
tweet_limpio

In [None]:
#We Save Data with Transformed Tweets
tweet_limpio.to_csv("DataTwitterTransform.csv", index=False)

# Delete Rows with Special Characters

In [None]:
# Substrings that mark a tweet for removal (underscore, the "一" stroke and
# several non-Latin / decorative character sequences)
special_patterns = ["_", "一", "세기 소녀", "ᔕᗴᗰᗩᑎᗩ ᗪᗴ ᒪᗩ ᑕᎥᗴᑎᑕᎥᗩ", "ღ"]

test = tweet_limpio.dropna(axis=0) # first remove nulls so str.contains only yields True/False

# True for every row of `test` whose transformed tweet contains any of the patterns
has_special = pd.Series(False, index=test.index)
for pattern in special_patterns:
    has_special |= test["tweets_transform"].str.contains(pattern, regex=False)

total = test[has_special] # all rows to discard
# Drop the flagged rows from the full frame; the result is named `clean_tweet2`
# because that is the name the following cells read.
clean_tweet2 = tweet_limpio.drop(index=total.index)

In [None]:
clean_tweet2

In [None]:
#Save the data without the rows that contain special characters
clean_tweet2.to_csv("DataTwitterSinEspecial.csv", index=False)

# lemmatization

In [None]:
#Divide the DataFrame into parts of 10800 index labels each
#(labels 0-10799, 10800-21599, ...). The parts are stored in `lista`, the
#name the next cell reads, and the builtin `list` is left untouched.
#NOTE(review): range(0, 54001, 10800) also yields a sixth slice starting at
#label 54000; only the first five are used below - confirm this is intended.
lista = [clean_tweet2.loc[i:i+10799,:] for i in range(0, 54001, 10800)]

In [None]:
# Turn the first five slices into stand-alone DataFrames
data1, data2, data3, data4, data5 = (pd.DataFrame(lista[k]) for k in range(5))

In [None]:
#Save each of the 5 parts to its own csv so they can be lemmatized part by part
for filename, part in (
    ("data1.csv", data1),
    ("data2.csv", data2),
    ("data3.csv", data3),
    ("data4.csv", data4),
    ("data5.csv", data5),
):
    part.to_csv(filename, index=False)


In [None]:
# We print the first 10800 data
data1

In [None]:
# spaCy pipelines already loaded, keyed by model name, so the model is loaded
# once instead of once per tweet
_SPACY_MODELS = {}


#Function to convert a tweet into its lemmas
def lemmatization(tweet):
    """Return `tweet` with every token replaced by its lower-cased lemma.

    The text is processed with spaCy's 'es_core_news_sm' model and the lemmas
    are joined with single spaces into one string.
    """
    model_name = 'es_core_news_sm'
    nlp = _SPACY_MODELS.get(model_name)
    if nlp is None:
        nlp = _SPACY_MODELS[model_name] = spacy.load(model_name)
    doc = nlp(tweet)
    return " ".join(tok.lemma_.lower() for tok in doc)

# Lemmatization = reducing each word to its dictionary form, e.g. (asked -> ask) or (am -> be)
# # Example
#lemmatization('I am a text that is crying out to be processed. hahahahahah')

In [None]:
#First 10 tweets before lemmatization
# tweet_limpio["tweets_transform"][:10]

In [None]:
##First 10 tweets after lemmatization
# tweet_limpio["tweets_transform"][:10].apply(lemmatization)

### First Lematized Data Division

In [None]:
# Lemmatize the first part (values are cast to str so missing entries do not fail)
data1["tweets_transform"] = data1["tweets_transform"].astype(str).apply(lemmatization)

In [None]:
data1

In [None]:
data1.to_csv("data1process.csv", index=False)

### Second Lematized Data Division

In [None]:
# Lemmatize the second part (values are cast to str so missing entries do not fail)
data2["tweets_transform"] = data2["tweets_transform"].astype(str).apply(lemmatization)

In [None]:
data2.to_csv("data2process.csv", index=False)

In [None]:
data2

### Third Lematized Data Division

In [None]:
# Lemmatize the third part (values are cast to str so missing entries do not fail)
data3["tweets_transform"] = data3["tweets_transform"].astype(str).apply(lemmatization)

In [None]:
data3.to_csv("data3process.csv", index=False)

In [None]:
data3

### Fourth Division of Lematized Data

In [None]:
# Lemmatize the fourth part (values are cast to str so missing entries do not fail)
data4["tweets_transform"] = data4["tweets_transform"].astype(str).apply(lemmatization)

In [None]:
data4.to_csv("data4process.csv", index=False)

In [None]:
data4

### Fifth Lemmatized Data Division

In [None]:
# Lemmatize the fifth part (values are cast to str so missing entries do not fail)
data5["tweets_transform"] = data5["tweets_transform"].astype(str).apply(lemmatization)

In [None]:
data5

In [None]:
data5.to_csv("data5process.csv", index=False)

## Joining the data division in a DataFrame

In [None]:
#Join the five lemmatized parts back into a single DataFrame.
#The result is named `tweet_procesado`, the name the following cells read.
#ignore_index=True renumbers the rows 0..n-1, so looking a row up by its
#position number is valid on the joined frame.
tweet_procesado = pd.concat([data1, data2, data3, data4, data5], ignore_index=True)

In [None]:
tweet_procesado

In [None]:
#Saving the Processed Data
tweet_procesado.to_csv("DataLematizada.csv", index=False)

# Sentiment Analysis

In [None]:
# Build the Transformers "sentiment-analysis" pipeline used to label the tweets.
# NOTE(review): no model is named, so the library chooses its default for this
# task - presumably an English model; confirm it is suitable for Spanish text.
clasificacion = pipeline("sentiment-analysis")

In [None]:
#Function for the sentiment analysis
def analisis(data):
    """Run the `clasificacion` pipeline on `data` and return its output unchanged."""
    return clasificacion(data)

#Examples
# analisis("ser triste")

In [None]:
results = tweet_procesado["tweets_transform"].apply(analisis)
results

In [None]:
sentimientos = results.copy()

In [None]:
# Three lists aligned by position, one entry per tweet
comentario = []  # predicted label
positivo = []    # tweet text (for every label, despite the name)
scores = []      # score of the prediction, rounded to 4 decimals

# Tweets and predictions are paired by position with zip(); looking tweets up
# with a running counter would treat the counter as an index label and fail
# or misalign whenever the index is not exactly 0..n-1.
for tweet_text, result in zip(tweet_procesado['tweets_transform'], sentimientos):
    best = result[0]  # each prediction is a list whose first item holds label and score
    comentario.append(best['label'])
    positivo.append(tweet_text)
    scores.append(round(best['score'], 4))

In [None]:
dataSentimientos = pd.DataFrame({"Comentario":comentario,"Score":scores,"Tweet":positivo})

In [None]:
dataSentimientos

In [None]:
dataSentimientos.to_csv("DataTwitterSentimientos.csv", index=False)

In [None]:
def tokenization(text):
    """Lower-case `text` and return the tokens produced by NLTK's `word_tokenize`."""
    lowered = text.lower()
    return word_tokenize(lowered)

In [None]:
nube = dataSentimientos.copy()
nube["Lista"] = nube['Tweet'].apply(tokenization)

In [None]:
nube

In [None]:
nube.to_csv("DataTwitterNube.csv", index=False)

In [None]:
nubePalabras = nube.explode("Lista")
nubePalabras

In [None]:
nubePalabras.to_csv("DataTwitterNubeExplode.csv", index=False)

In [None]:
# Count how often each token occurs. After explode() the "Lista" column holds
# one token per row; rows that came from an empty token list hold NaN and are
# left out of the count.
words = nubePalabras['Lista'].dropna().tolist()
wf = collections.Counter(words)
wf.most_common(20)

In [None]:
words = ' '.join(nube['Tweet'])

In [None]:
wordcloud = WordCloud(background_color="white",
                      # stopwords = stopwords,
                    #   colormap = "icefire",
                      scale = 2).generate(words)
# Display the generated image:
plt.figure(figsize = (15, 15), dpi = 300, facecolor = None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad = 0)

In [None]:
# Horizontal bar chart of the 16 most frequent tokens
a = nubePalabras['Lista'].value_counts().sort_values(ascending = True).tail(16)
x = a.index
# The seaborn style sheets were renamed with a "seaborn-v0_8-" prefix in newer
# Matplotlib releases; use whichever name this installation provides. The style
# is selected before the figure is created so that it applies to the figure.
for style_name in ('seaborn-v0_8-white', 'seaborn-white'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
plt.figure(figsize = (8,5))
plt.barh(x,a, color = "firebrick")
plt.xlabel("Counts", fontsize = 30)
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)

In [None]:
dataSentimientos["Comentario"].value_counts()

In [None]:
dataSentimientos["target"]=dataSentimientos["Comentario"].map({'NEGATIVE': 0, 'POSITIVE':1})

In [None]:
dataSentimientos

In [None]:
dataSentimientos.to_csv("DataTwitterSentimientoTarget.csv", index=False)

In [None]:
dataSentimientos["target"].value_counts()

In [None]:
plt.hist(dataSentimientos['target'],bins=4)

# Machine Learning Models

## Data Split

In [None]:
from collections import defaultdict

import pickle
from sklearn import preprocessing
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier

In [None]:
# Work on a random sample of at most 10000 labelled tweets.
# min() keeps sample() from raising when fewer rows are available, and the
# fixed seed makes the sample (and every result derived from it) repeatable.
sample_size = min(10000, len(dataSentimientos))
datos = dataSentimientos.sample(n=sample_size, random_state=123)
datos.reset_index(inplace=True, drop=True)
datos

In [None]:
X = datos['Tweet']
X


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# Fit a TF-IDF vectorizer on the sampled tweets and keep the learned terms
tfidfvectorizer = TfidfVectorizer()
tfidf_wm = tfidfvectorizer.fit_transform(X)
tfidf_tokens = tfidfvectorizer.get_feature_names_out()

# One column per term; toarray() converts the TF-IDF matrix to a dense array
ml = pd.DataFrame(data = tfidf_wm.toarray(),columns = tfidf_tokens)
# Place the label columns next to the TF-IDF features (column-wise concat).
# NOTE(review): if a learned term equals one of the label column names
# (e.g. "target"), the result has duplicate column names - confirm.
contarVec = pd.concat((datos[["Comentario", 'Score',"target"]], ml), axis = 1)
contarVec

In [None]:
contarVec.to_csv("DataTwitterCountVectorizer.csv", index=False)

In [None]:
contarVec2 = contarVec.copy()
datos_X = contarVec2.drop(["Comentario","Score","target"], axis = 1)
datos_y = contarVec2["target"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(datos_X,datos_y,test_size = 0.2,random_state = 123)

In [None]:
print("Total de datos en X Entrenamiento: ",X_train.shape[0])
print("Total de datos en X Test: ",X_test.shape[0])
print("Total de datos en Y Entrenamiento: ",y_train.shape[0])
print("Total de datos en Y Test: ",y_test.shape[0])

## Naive Bayes

### Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
modelGNB = GaussianNB()
modelGNB.fit(X_train, y_train)
modelGNB

In [None]:
# Evaluate Gaussian Naive Bayes on the test split (weighted averages over the classes)
prediction_gnb = modelGNB.predict(X_test)
conf_gnb       = confusion_matrix(y_test, prediction_gnb)
acc_gnb        = accuracy_score(y_test, prediction_gnb)
prec_gnb       = precision_score(y_test, prediction_gnb, average="weighted")
rec_gnb        = recall_score(y_test, prediction_gnb, average="weighted")
f1_gnb         = f1_score(y_test, prediction_gnb, average="weighted")

# Each caption is printed next to its own metric (prec_* = precision, rec_* = recall)
print("Confusion Matrix: \n", conf_gnb, '\n')
print("Accuracy    : ", acc_gnb)
print("Precision   : ", prec_gnb)
print("Recall      : ", rec_gnb)
print("F1 Score    : ", f1_gnb)

In [None]:
print(classification_report(y_test,prediction_gnb))

### Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
modelMNB = MultinomialNB()
modelMNB.fit(X_train, y_train)
modelMNB

In [None]:
# Evaluate Multinomial Naive Bayes on the test split (weighted averages over the classes)
prediction_MNB = modelMNB.predict(X_test)
conf_mnb       = confusion_matrix(y_test, prediction_MNB)
acc_mnb        = accuracy_score(y_test, prediction_MNB)
prec_mnb       = precision_score(y_test, prediction_MNB, average="weighted")
rec_mnb        = recall_score(y_test, prediction_MNB, average="weighted")
f1_mnb         = f1_score(y_test, prediction_MNB, average="weighted")

# Each caption is printed next to its own metric (prec_* = precision, rec_* = recall)
print("Confusion Matrix: \n", conf_mnb, '\n')
print("Accuracy    : ", acc_mnb)
print("Precision   : ", prec_mnb)
print("Recall      : ", rec_mnb)
print("F1 Score    : ", f1_mnb)

In [None]:
print(classification_report(y_test,prediction_MNB))

## Random Forest Classifier

In [None]:
modelRF = RandomForestClassifier(random_state = 123) #default values, fixed random seed
modelRF = modelRF.fit(X_train, y_train)
modelRF

In [None]:
# Evaluate the Random Forest on the test split (weighted averages over the classes)
prediction_rf = modelRF.predict(X_test)
conf_rf       = confusion_matrix(y_test, prediction_rf)
acc_rf        = accuracy_score(y_test, prediction_rf)
prec_rf       = precision_score(y_test, prediction_rf, average="weighted")
rec_rf        = recall_score(y_test, prediction_rf, average="weighted")
f1_rf         = f1_score(y_test, prediction_rf, average="weighted")

# Each caption is printed next to its own metric (prec_* = precision, rec_* = recall)
print("Confusion Matrix: \n", conf_rf, '\n')
print("Accuracy    : ", acc_rf)
print("Precision   : ", prec_rf)
print("Recall      : ", rec_rf)
print("F1 Score    : ", f1_rf)

In [None]:
print(classification_report(y_test,prediction_rf))

## SVM

In [None]:
modelSVM = svm.SVC(random_state = 1) #default values, fixed random seed
modelSVM = modelSVM.fit(X_train, y_train)
modelSVM

In [None]:
# Evaluate the SVM on the test split (weighted averages over the classes)
prediction_svm = modelSVM.predict(X_test)
conf_svm       = confusion_matrix(y_test, prediction_svm)
acc_svm        = accuracy_score(y_test, prediction_svm)
prec_svm       = precision_score(y_test, prediction_svm, average="weighted")
rec_svm        = recall_score(y_test, prediction_svm, average="weighted")
f1_svm         = f1_score(y_test, prediction_svm, average="weighted")

# Each caption is printed next to its own metric (prec_* = precision, rec_* = recall)
print("Confusion Matrix: \n", conf_svm, '\n')
print("Accuracy    : ", acc_svm)
print("Precision   : ", prec_svm)
print("Recall      : ", rec_svm)
print("F1 Score    : ", f1_svm)

In [None]:
print(classification_report(y_test,prediction_svm))

## Model Comparison

In [None]:
def _metric_frame(model_name, accuracy, precision, recall, f1):
    """Return a frame with one row per metric for a single model.

    Columns: "Parametros" (metric name), "Valor" (metric value) and
    "Modelo" (the model name, repeated on every row). The values are listed
    in the same order as the names, so Recall and Precision each carry
    their own value.
    """
    return pd.DataFrame({"Parametros":["Accuracy", "Recall", "Precision", "F1_score"],
                         "Valor":[accuracy, recall, precision, f1],
                         "Modelo":model_name})


gnb = _metric_frame("GNB", acc_gnb, prec_gnb, rec_gnb, f1_gnb)
mnb = _metric_frame("MNB", acc_mnb, prec_mnb, rec_mnb, f1_mnb)
rf = _metric_frame("RF", acc_rf, prec_rf, rec_rf, f1_rf)
# Named `svm_df` so the imported sklearn `svm` module is not overwritten and
# the SVM training cell can still be re-run after this one.
svm_df = _metric_frame("SVM", acc_svm, prec_svm, rec_svm, f1_svm)

# Stack the per-model frames into one long table for plotting
comparacion = pd.concat((gnb, mnb, rf, svm_df), axis = 0).reset_index(drop = True)
comparacion["Valorador"] = "Grupo1"

In [None]:
sns.set_context("notebook", font_scale = 1.7)
# sns.set_theme(style="ticks")
sns.set_palette("pastel")
a = sns.barplot(data = comparacion, x = "Parametros", y = "Valor", hue = "Modelo")
# sns.move_legend(a, "lower center", bbox_to_anchor=(1.2, 0.6), title="Model", frameon=False)
plt.legend(bbox_to_anchor=(1.1, 1), loc='upper left', borderaxespad=0)