In [None]:
import pandas as pd
import nltk as nltk
import credentials
from pymongo import MongoClient
from wordcloud import WordCloud 
import matplotlib.pyplot as plt
from utils_functions import transform_text, columns_to_datetime, set_sentiment
import re
import string
from nltk import FreqDist
import seaborn as sns

In [None]:
### Mongo connection
# All connection settings come from the local `credentials` module.
conn = MongoClient(credentials.DB_URI)
db = conn[credentials.DB_NAME]
# Collection name = stripped prefix + stripped query, with spaces turned into
# underscores and everything lower-cased.
collection_name = (
    credentials.PREFIX_COLLECTION.strip() + credentials.QUERY.strip()
).replace(' ', '_').lower()
collection = db[collection_name]


In [None]:
# Fetch a single document and show its field names.
sample_doc = collection.find_one()
sample_doc.keys()

In [None]:
# Query every document (empty filter) but request only the fields used below,
# then materialize the cursor into a DataFrame.
projection = {
    'stored_at': 1,
    'created_at_yyyymmdd': 1,
    'retweet_count': 1,
    'truncated': 1,
    'full_text': 1,
    'geo': 1,
}
data_tweets = collection.find({}, projection)
df = pd.DataFrame(list(data_tweets))

In [None]:
# NOTE(review): `columns_to_datetime` and `transform_text` live in
# utils_functions and are not visible here; judging by their names they
# convert the listed columns to datetimes and normalize the tweet text --
# confirm against that module.
datetime_columns = ['stored_at', 'created_at_yyyymmdd']
df = columns_to_datetime(df, datetime_columns)
# Keep the raw text in `full_text`; the processed version goes to `clean_text`.
df['clean_text'] = df['full_text'].apply(transform_text)

In [None]:
from textblob import TextBlob
def set_sentiment2(text):
    """Return the TextBlob polarity of ``text``, or 0 when it is too short.

    The input is converted with ``str()`` first. An empty string, or a text
    whose TextBlob length is 3 or less, yields 0; anything longer yields
    ``TextBlob(text).polarity``.

    Args:
        text: Value to score; any object accepted by ``str()``.

    Returns:
        The polarity reported by TextBlob, or 0 for empty/short input.
    """
    content = str(text)
    if content == '':
        return 0
    blob = TextBlob(content)
    # Polarity is only read for texts longer than three characters.
    return blob.polarity if len(blob) > 3 else 0

In [None]:
# Score each cleaned tweet and store the result in a new `polarity` column.
polarity_scores = df['clean_text'].apply(set_sentiment2)
df['polarity'] = polarity_scores

In [None]:
# Summary statistics of the polarity scores.
polarity_summary = df['polarity'].describe()
polarity_summary


In [None]:
# Merge every cleaned tweet into a single lower-case string.
text = ' '.join(df.clean_text).lower()
# Remove the search term from the corpus (presumably so the query itself does
# not dominate the word cloud and n-gram counts). The corpus is lower-case, so
# the query must be normalized the same way before matching; a raw query with
# capital letters or surrounding whitespace would never be found.
text = text.replace(credentials.QUERY.strip().lower(), '')

In [None]:
# Configure the word cloud (1024x720 px, black background, smallest font 14)
# and build it from the combined tweet text.
cloud_builder = WordCloud(
    width=1024,
    height=720,
    background_color='black',
    min_font_size=14,
)
wordcloud = cloud_builder.generate(text)

In [None]:
# Draw the word cloud on a 16x9-inch figure with the axes hidden, write it to
# disk, then display it.
fig = plt.figure(figsize=(16, 9), facecolor=None)
ax = fig.gca()
ax.imshow(wordcloud)
ax.axis('off')
fig.tight_layout(pad=0)
fig.savefig('images/a_wordcloud.png')
plt.show()

In [None]:
def tokenice_ngrams(text, ngrams):
    """Tokenize ``text`` into words and return its n-grams.

    Every character in ``string.punctuation`` is replaced by a space and the
    result is split on runs of whitespace. Words separated only by
    punctuation therefore stay separate, and no empty-string tokens are
    produced.

    Args:
        text: The string to tokenize.
        ngrams: Number of consecutive tokens in each n-gram (2 for bigrams).

    Returns:
        A list of tuples, one per n-gram, in order of appearance. The list is
        empty when ``text`` has fewer than ``ngrams`` tokens.
    """
    without_punct = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # split() with no separator collapses whitespace runs and drops empty
    # tokens. Deleting double spaces instead (replace('  ', '')) would glue
    # together the words on either side of a removed punctuation mark, and
    # split(" ") would emit '' tokens for leading/trailing spaces.
    tokens = without_punct.split()
    return list(nltk.ngrams(tokens, ngrams))

In [None]:
# Count how often each pair of adjacent tokens occurs in the combined text.
bigram_tokens = tokenice_ngrams(text, 2)
fdist_a = FreqDist(bigram_tokens)

# Prepare a 16x9-inch figure with title and axis labels, let the frequency
# distribution draw onto it, then write the figure to disk.
fig = plt.figure(figsize=(16, 9), facecolor=None)
ax = fig.gca()
ax.set_title(f'Bi-Gramas mas usuales en tuits con el topico = {credentials.QUERY}', fontsize=25)
ax.set_xlabel('Bigrams', fontsize=16)
ax.set_ylabel('Counts', fontsize=16)
# NOTE(review): per NLTK's FreqDist.plot, 20 limits the plot to the 20 most
# frequent samples -- confirm against the installed NLTK version.
fdist_a.plot(20)
fig.savefig('images/a_bigrams_more.png', dpi=fig.dpi)

In [None]:
# Scatter of tweet polarity against retweet count on a 16x9-inch figure,
# saved to disk.
fig, ax = plt.subplots(figsize=(16, 9))
ax.set_title(f'Polarity tuits of {credentials.QUERY} vs retweet')
# Fixed view window: x is limited to 0-50 retweets, y to -1.2..1.2.
ax.set_xlim(0, 50)
ax.set_ylim(-1.2, 1.2)
sns.scatterplot(
    data=df,
    x='retweet_count',
    y="polarity",
    hue="polarity",
    legend=False,
    ax=ax,
)
fig.savefig('images/a_polarity.png')