In [1]:
import pandas as pd
import spacy
import nltk
from nltk.stem import SnowballStemmer
import gensim
from gensim import corpora
import pyLDAvis.gensim
import pickle


In [2]:
from spacy.lang.es import Spanish
parser = Spanish()
nltk.download('stopwords')
es_stop = set(nltk.corpus.stopwords.words('spanish'))
stemmer = SnowballStemmer('spanish')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abel1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
es_stop |= {"https","hola","gracias"}

In [4]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            continue
            #lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens


In [5]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in es_stop]
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens



In [6]:
path = r'C:\Users\abel1\Jupyter Notebooks\DH\DESAFIO 3\\'
claro_tweets = pd.read_excel(path + 'claro_prev.xlsx')
claro_tweets['company'] = 'claro'
movistar_tweets = pd.read_excel(path + 'movistar_prev.xlsx')
movistar_tweets['company'] = 'movistar'
personal_tweets = pd.read_excel(path + 'personal_prev.xlsx')
personal_tweets['company'] = 'personal'


In [7]:
#eliminamos todos los tweets de claro asociados a publicidad de eventos de gaming
claro_tweets=claro_tweets[claro_tweets.author_name != '9zTeam']
claro_tweets=claro_tweets[~claro_tweets.text.str.contains('9zteam',case=True)]


In [8]:
#claro_tweets[claro_tweets.text.str.contains('brasil',case=True)]

In [9]:
tweets_df = pd.concat([claro_tweets, movistar_tweets, personal_tweets]).reset_index()
tweets_df['tokens'] = tweets_df.apply(lambda x: prepare_text_for_lda(x.text), axis=1)



In [29]:
tweets_df.company.value_counts()

movistar    1075
claro        984
personal     906
Name: company, dtype: int64

In [10]:
tweets_df['bigrams'] = tweets_df['tokens'].apply(lambda row: list(nltk.ngrams(row, 2)))

In [11]:
tweets_df['bigrams'] = tweets_df.apply(lambda x: ["_".join(tokens) for tokens in x['bigrams']], axis=1)

In [12]:
a = tweets_df[['company','bigrams']].set_index('company').explode('bigrams').reset_index().groupby(['company','bigrams']).size().reset_index()
a.sort_values(0, ascending=False)#.pivot_table(tweets_df,aggfunc='sum',index='company',,fill_value=True)
b = a.groupby('bigrams').sum().reset_index()
b['company'] = 'Total'
bigrams_counts_data = pd.concat([a,b]).reset_index(drop=True)
bigrams_counts_data.columns = ['company', 'ngram', 'value']
bigram_data  = bigrams_counts_data.pivot_table(index='ngram',columns='company', aggfunc='sum',fill_value=0)
bigram_data = bigram_data.droplevel(0,axis=1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """


In [47]:
top_ngrams = bigrams_counts_data[bigrams_counts_data.company=='Total'].sort_values('value', ascending=False).head(20).ngram.unique()
top_ngrams_70 = bigrams_counts_data[bigrams_counts_data.company=='Total'].sort_values('value', ascending=False).head(1000).ngram.unique()

In [48]:
tweets_df.columns

Index(['index', 'text', 'author_name', 'created_at', 'id', 'text_clean',
       'company', 'tokens', 'bigrams'],
      dtype='object')

In [49]:
lookup =tweets_df[['id','company','bigrams']].set_index(['id','company']).explode('bigrams').reset_index().groupby(['id','company','bigrams']).size()
lookup = lookup.astype('bool').reset_index()
lookup = lookup[lookup['bigrams'].isin(top_ngrams_70)].set_index(['id','company','bigrams']).unstack(fill_value=False)
lookup = lookup.droplevel(0,axis=1).reset_index()

In [50]:
lookup = pd.merge(lookup,tweets_df[['id','created_at','tokens']])
lookup['text_clean'] = lookup.tokens.apply(" ".join)

In [53]:
lookup['Words'] = lookup.tokens.apply(len)

In [55]:
lookup['Words_clipped'] = lookup['Words']

In [56]:
lookup.company.value_counts()

movistar    719
claro       646
personal    552
Name: company, dtype: int64

In [57]:
lookup.to_csv('lookup_tweets.csv')

In [100]:
lookup

Unnamed: 0,id,company,20:30hs_celebr,amig_esper,amig_famili,amig_mañan,atencion_client,buen_tard,celebr_amig,compr_celul,...,solucion_problem,stream_manag,suficient_phishing,telecom_suficient,tir_felic,victori_venc,vuelv_tir,vuelv_vuelv,created_at,text_clean
0,1283458861769335040,personal,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,2020-07-15 17:50:12,@sifueraunaflor @personalar buenas tardes grac...
1,1283458902886174976,personal,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,2020-07-15 17:50:22,@sifueraunaflor @personalar buenas tardes grac...
2,1283459034268478976,personal,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,2020-07-15 17:50:53,@sifueraunaflor @personalar buenas tardes grac...
3,1283459335776023040,personal,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,2020-07-15 17:52:05,@sifueraunaflor @personalar @defensoriacaba bu...
4,1283472382691290880,personal,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,2020-07-15 18:43:56,@personalar buenas tardes abone mi factura el ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717,1286167261749957120,claro,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,2020-07-23 05:12:25,rt @filonewsok una entrevista entre @frankkast...
718,1286169897270152960,claro,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,2020-07-23 05:22:54,rt @filonewsok una entrevista entre @frankkast...
719,1286189353727939072,claro,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2020-07-23 06:40:12,rt @9zteam locker room los esperamos a todos e...
720,1286195740130370048,claro,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,2020-07-23 07:05:35,rt @filonewsok una entrevista entre @frankkast...


In [61]:
top_bigrams_counts_data = bigrams_counts_data[bigrams_counts_data.ngram.isin(top_ngrams)].pivot_table(index='company',columns='ngram',fill_value=0).stack().reset_index()

In [58]:
bigram_data  = top_bigrams_counts_data.pivot_table(index='ngram',columns='company', aggfunc='sum',fill_value=0)
bigram_data = bigram_data.droplevel(0,axis=1)

NameError: name 'top_bigrams_counts_data' is not defined

In [59]:
bigram_data

company,Total,claro,movistar,personal
ngram,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
+543517247001_lueg,984,984,0,0
......_.....,1075,0,1075,0
......_sig,906,0,0,906
....._nadi,906,0,0,906
....._sig,906,0,0,906
...,...,...,...,...
yyyyyyyyyy_......,906,0,0,906
zap_sumat,984,984,0,0
zon_madr,984,984,0,0
zulem_marit,906,0,0,906


In [62]:
top_bigrams_counts_data.to_csv('bigrams_counts_data.csv', index=False)
bigram_data.reset_index().to_csv('bigram_data.csv', index=False)


In [14]:
dictionary = corpora.Dictionary(tweets_df['bigrams'].values)
tweets_df['bow'] = tweets_df.apply(lambda x: dictionary.doc2bow(x.bigrams), axis=1)
corpus = tweets_df['bow'].values

In [37]:
NUM_TOPICS = 4
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)

In [38]:
def apply_topics(grp):
    topics = ldamodel.get_document_topics(grp['bow'])
    for i in range(len(topics)):
        grp[f'topic_{i}'] = topics[i][1]
    return grp
tweets_df = tweets_df.apply(apply_topics,axis=1)



In [39]:
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.004*"buen_tard" + 0.004*"recomend_realiz" + 0.003*"realiz_denunci" + 0.003*"dqoz7jjtyn_salud" + 0.003*"denunci_ingres" + 0.003*"ingres_dqoz7jjtyn"')
(1, '0.003*"esper_respuest" + 0.002*"cobr_factur" + 0.002*"sig_darm" + 0.001*"aument_mes" + 0.001*"piens_reembols" + 0.001*"reembols_diner"')
(2, '0.011*"movistargarc_movistargarc" + 0.006*"ransomwar_telecom" + 0.006*"cuid_ransomwar" + 0.006*"suficient_phishing" + 0.006*"telecom_suficient" + 0.004*"atencion_client"')
(3, '0.004*"internet_hog" + 0.004*"contrat_internet" + 0.003*"hog_contrat" + 0.002*"consult_pued" + 0.002*"pued_mejor" + 0.002*"mejor_ofert"')


In [40]:
bigram_vectors=tweets_df[['bigrams','topic_0','topic_1','topic_2','topic_3']]

In [43]:
bigram_vectors.to_csv('bigram_vectors.csv', index=False)

In [41]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [42]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [21]:
pyLDAvis.save_html(lda_display,'lda_report.html')

# EXPLORACION DE BIGRAMAS SOBRE TODO EL UNIVERSO

In [22]:
txt=[" ".join(tokens) for tokens in tweets_df['tokens'].values]

In [23]:
from string import punctuation
# remove punctuation and make lower case
txt = ''.join(c for c in txt if c not in punctuation).lower()

In [24]:
print(txt)

consult pued mejor ofert silenci negat abandon port numer estan notific y3iiqnqhlsacost despu veng pib arregl famos fibr optic pus l0sebrwoe7conmig esper actu vecinbuen respuest reclam pued sab cabl expliquconmig algun orientacion municbclar celebr cumplean 20aniversari podri favor respond tweet felicitacion muchbuen respuest reclam pued sab cabl expliquaverigu tuentiseman aprox problem recepcion llam entra direct casill pued comunic senal siempr equip modern prob reinici algun problem servicicort carg text tweets veloc speedtest lleg test mejor bastant complet carg relat rap algui ficfstcji7excelent servici clar senal mejor promocion mism atencion client resuelv rap vent telefon atencion personaliz acompan usuari recomaveriguconsult pued mejor ofert silencimati canal telefon fibr optic excelent servici client trabaj pag sufrconsult pued mejor ofert silenciconsult pued mejor ofert silenciacab dat pospag pued chat whatapp pued hac llam vide llam pandemi tenazzzzzzzzcomentari compart exp

In [25]:
import nltk
import pandas as pd
from nltk.probability import FreqDist
from IPython.core.display import HTML

# We need the punkt library to tokenize the text
nltk.download("punkt")

# Tokenize the text into individual words
words = txt.split()

# Get the frequency distribution of the words into a data frame
fdist = FreqDist(words)
count_frame = pd.DataFrame(fdist, index =[0]).T
count_frame.columns = ['Count']
count_frame = count_frame.sort_values('Count', ascending=False)

# Display the dataframe as HTML (so it's not truncated)
display(HTML(count_frame.to_html()))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mportuese\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Count
servici,318
pued,207
internet,193
hac,187
llam,183
line,158
esper,139
mism,137
client,136
solucion,131


In [26]:
# Get the frequency distribution of the remaining words

fdist = FreqDist(words)
count_frame = pd.DataFrame(fdist, index =[0]).T
count_frame.columns = ['Count']

# Plot the frequency of the top 60 words
counts = count_frame.sort_values('Count', ascending = False)
fig = plt.figure(figsize=(16, 9))
ax = fig.gca()    
counts['Count'][:60].plot(kind = 'bar', ax = ax, color='teal')
ax.set_title('Frecuencia de los tokens más habituales')
ax.set_ylabel('Frecuencia de tokens')
ax.set_xlabel('token')
plt.show()

NameError: name 'plt' is not defined

In [None]:
#Generamos los bigramas

from nltk import ngrams
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

# Get n-grams where n = 2
n = 2
nGramsInDoc = []
nGrams = ngrams(words, n)
for grams in nGrams:
    nWords = ' '.join(g for g in grams)
    nGramsInDoc.append(nWords)

# Count the frequency of each n-gram
fdist = FreqDist(nGramsInDoc)
count_frame = pd.DataFrame(fdist, index =[0]).T
count_frame.columns = ['Count']

# Plot the frequency of the top 60 bigrams
counts = count_frame.sort_values('Count', ascending = False)
fig = plt.figure(figsize=(16, 9))
ax = fig.gca()    
counts['Count'][:60].plot(kind = 'bar', ax = ax, color='teal')
ax.set_title('Frequency of the most common n-grams')
ax.set_ylabel('Frequency of n-gram')
ax.set_xlabel('n-gram')
plt.show()


In [None]:
count_frame.head()