# Este notebook realiza uma análise exploratória em notícias dos anos 2015, 2016 e 2017

<br><br><br>
Iniciamos com o download e extração do arquivo "articles.csv". Esta abordagem é usada para reduzir o tamanho do projeto no repositório.

In [None]:
import requests
from io import BytesIO
from pathlib import Path
import zipfile

# Pre-signed download link for the dataset archive. The query string carries
# X-Goog-Date=20210317T183726Z and X-Goog-Expires=259199 (seconds), so the
# signature is only valid for a limited window after that timestamp.
url = "https://storage.googleapis.com/kaggle-data-sets/3660/471747/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20210317%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20210317T183726Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=9d910df0dfe6b50f40ab098821fe213af251543b0a2a48752769fad945d6186202c897c33397c2ff6b0e2521bf1fbae16e062415ab8997078f537a9e19d6a1601d3c2d59ac0a8a2fcb98c38bd98fc36f5672df425541a6a066d6e7ea5f0de7c13a67188490a5db174a6de41c54e63da1ad331b6d716ce47f5bf2bce12c100b3bba9c8e31c323f7908ba390cf3e3e636ae8017076062c35b60428cc101a5ccc2755db9917f9d4be51f4b661a57943603cc702d89fff87c540f0c87e7265fa1cc532019c8075af010630a787a69ab3327c12bc77f8d38ff56e88ecc75229b4e6d5ea8dc82567e146f048ffeeb443cb63010b033343961e4776ae26fbd69637cefd"

# Skip the download when the extracted CSV is already on disk, so the notebook
# can be re-run without fetching the archive again (or after the link expired).
if not Path("articles.csv").exists():
    resposta = requests.get(url, timeout=60)
    # Fail with a clear HTTP error (e.g. an expired signature) instead of
    # handing an error page to zipfile and getting an opaque BadZipFile.
    resposta.raise_for_status()

    # The archive is kept in memory only; the context manager closes it
    # once everything has been extracted to the working directory.
    with zipfile.ZipFile(BytesIO(resposta.content)) as myzip:
        myzip.extractall("./")

<br><br><br>
Bibliotecas usadas são importadas a seguir:

In [None]:
import pandas
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib

<br><br><br>
As notícias são carregadas e definimos um novo dataframe com as colunas que serão utilizadas.

In [None]:
# Load the raw articles and keep only the columns used by the analysis.
df = pandas.read_csv("articles.csv")

df = df.loc[:, ["date", "title", "text", "category"]].copy()

# Parse the publication date. "datetime64[D]" is not a resolution pandas
# supports for astype on recent versions, so the values are parsed with
# to_datetime and truncated to midnight to keep day granularity.
df["date"] = pandas.to_datetime(df["date"]).dt.normalize()

In [None]:
# Date range covered by the dataset. Series.max()/min() skip missing dates,
# whereas the builtin max()/min() give order-dependent results when NaT is
# present (every comparison against NaT is False).
dateMax = df["date"].max()
dateMin = df["date"].min()
yearMax = dateMax.year
yearMin = dateMin.year

# Distinct categories, plus a zero-initialised counter keyed by category.
listCategory = set(df["category"])
dictionaryCategory = dict.fromkeys(listCategory, 0)

# Distinct publication years (kept as a set; callers sort it when needed).
listYears = set(pandas.DatetimeIndex(df["date"]).year)

In [None]:
# Articles per year, ordered chronologically (value_counts alone orders by
# count, which would shuffle the years on the x axis).
numberArticlesYears = (
    pandas.DatetimeIndex(df["date"]).year.value_counts().sort_index().to_dict()
)
anos = [str(ano) for ano in numberArticlesYears]
totais = list(numberArticlesYears.values())

plt.figure(figsize=(16, 8), dpi=350)
plt.bar(anos, totais)
plt.ylabel("Quantidade de notícias", fontsize=14)
plt.xlabel("Ano", fontsize=14)
plt.title("Total de notícias por ano \n {} - {}".format(dateMin.strftime("%d-%m-%Y"), dateMax.strftime("%d-%m-%Y")), fontsize=18)

# Write each total above its bar. Looping over the bars works for any number
# of years; the previous three hard-coded labels raised IndexError with fewer
# than three years and left extra years unlabelled.
for posicao, total in enumerate(totais):
    plt.text(posicao, total, str(total), fontsize=12, ha="center", va="bottom")

plt.show()



In [None]:
from matplotlib.font_manager import FontProperties

# Small font for the legend, which lists every category.
fontP = FontProperties()
fontP.set_size('xx-small')

# One (initially empty) monthly series per category.
dictTodasCategorias = {chave: [] for chave in listCategory}

# Walk the months in chronological order (year by year, then month within the
# year) and record how many articles each category had in that month.
labelsX = []
anoDosArtigos = pandas.DatetimeIndex(df["date"]).year
for ano in sorted(listYears):
    dfYear = df.loc[anoDosArtigos == ano].copy()
    mesesDoAno = pandas.DatetimeIndex(dfYear["date"]).strftime("%m-%Y")
    for mes in sorted(set(mesesDoAno)):
        labelsX.append(mes)
        numberArticlesYears2 = dfYear.loc[mesesDoAno == mes, "category"].value_counts().to_dict()
        # Categories with no article in this month get an explicit 0 so that
        # every series has one value per entry of labelsX.
        for chave, serie in dictTodasCategorias.items():
            serie.append(numberArticlesYears2.get(chave, 0))

labels = list(dictTodasCategorias.keys())

plt.figure(figsize=(16, 8), dpi=350)
plt.grid(True)
# Each line carries its own category as label; previously every line received
# the whole list of labels and the legend only matched by drawing order.
for categoria, linha in dictTodasCategorias.items():
    plt.plot(labelsX, linha, label=categoria)
plt.xticks(rotation=90)

plt.legend(loc="upper right", bbox_to_anchor=(1.12, 1), prop=fontP)

plt.ylabel("Quantidade das notícias por mês", fontsize=14)
plt.xlabel("Meses", fontsize=14)
plt.title("Frequencia de notícias de todas categorias", fontsize=18)
plt.show()

In [None]:
# plt.figure(figsize=(25,17))
# figure, axis = plt.subplots(2, 2) 
# axis[0, 0].bar(list(numberArticlesYears.keys())[0:mostFrequently], list(numberArticlesYears.values())[0:mostFrequently])
# plt.savefig("teste.png")
# plt.show()

In [None]:
# Year analysed in this cell.
dateSelect = 2015

# Articles published in the selected year and their totals per category
# (value_counts lists the categories from most to least frequent).
anoDosArtigos = pandas.DatetimeIndex(df["date"]).year
dfYear = df.loc[anoDosArtigos == dateSelect].copy()
numberArticlesYears = dfYear["category"].value_counts().to_dict()

# Cut-off rule: keep categories from the top until the count drops by more
# than 3000 articles relative to the previous category.
coefAngul = -3000
contagens = list(numberArticlesYears.values())
mostFrequently = 0
for posicao, atual in enumerate(contagens):
    if posicao > 0 and (atual - contagens[posicao - 1]) < coefAngul:
        break
    mostFrequently += 1

categoriasTop = list(numberArticlesYears.keys())[0:mostFrequently]
contagensTop = contagens[0:mostFrequently]

print("Categorias mais frequentes em {}:".format(dateSelect))
print(categoriasTop)
print(contagensTop)

plt.figure(figsize=(16, 8), dpi=350)
plt.bar(categoriasTop, contagensTop)
plt.ylabel("Quantidade de notícias", fontsize=14)
plt.xlabel("Categorias", fontsize=14)
plt.title("Categorias mais frequentes em {}".format(dateSelect), fontsize=18)

# Print each total slightly above its bar; bars sit at x = 0, 1, 2, ...
coordX = -0.35
for total in contagensTop:
    plt.text(coordX, (total + 50), total, fontsize=12)
    coordX += 1.

In [None]:

# Start the collection of "important" categories with the most frequent ones
# of the year analysed in the previous cell; each maps to an empty list.
dictCategoriasImportantes = {
    categoria: []
    for categoria in list(numberArticlesYears.keys())[0:mostFrequently]
}


In [None]:
# Year analysed in this cell.
dateSelect = 2016

# Restrict the articles to the selected year and count them per category
# (most frequent category first).
mascaraAno = pandas.DatetimeIndex(df["date"]).year == dateSelect
dfYear = df.loc[mascaraAno].copy()
numberArticlesYears = dfYear["category"].value_counts().to_dict()

# Walk consecutive pairs of counts and stop at the first drop larger than
# 3000 articles; everything before that drop is a "most frequent" category.
coefAngul = -3000
contagens = list(numberArticlesYears.values())
mostFrequently = 1 if contagens else 0
for anterior, atual in zip(contagens, contagens[1:]):
    if (atual - anterior) < coefAngul:
        break
    mostFrequently += 1

nomesSelecionados = list(numberArticlesYears.keys())[0:mostFrequently]
totaisSelecionados = contagens[0:mostFrequently]

print("Categorias mais frequentes em {}:".format(dateSelect))
print(nomesSelecionados)
print(totaisSelecionados)

plt.figure(figsize=(16, 8), dpi=350)
plt.bar(nomesSelecionados, totaisSelecionados)
plt.ylabel("Quantidade de notícias", fontsize=14)
plt.xlabel("Categorias", fontsize=14)
plt.title("Categorias mais frequentes em {}".format(dateSelect), fontsize=18)

# Label every bar with its total, a little above the top of the bar.
coordX = -0.35
for valor in totaisSelecionados:
    plt.text(coordX, (valor + 50), valor, fontsize=12)
    coordX += 1.


In [None]:

# Add this year's most frequent categories to the collection, leaving the
# entries that earlier cells already created untouched.
for categoria in list(numberArticlesYears.keys())[0:mostFrequently]:
    dictCategoriasImportantes.setdefault(categoria, [])


In [None]:
# Year analysed in this cell.
dateSelect = 2017

# Articles of the selected year and their totals per category, ordered from
# the most to the least frequent category.
dfYear = df.loc[pandas.DatetimeIndex(df["date"]).year == dateSelect].copy()
numberArticlesYears = dfYear["category"].value_counts().to_dict()

# This cell uses a smaller cut-off than the other years: the selection stops
# at the first drop of more than 1290 articles between neighbouring categories.
coefAngul = -1290
contagens = list(numberArticlesYears.values())
mostFrequently = 0
while mostFrequently < len(contagens):
    houveQueda = (
        mostFrequently > 0
        and (contagens[mostFrequently] - contagens[mostFrequently - 1]) < coefAngul
    )
    if houveQueda:
        break
    mostFrequently += 1

principaisNomes = list(numberArticlesYears.keys())[0:mostFrequently]
principaisTotais = contagens[0:mostFrequently]

print("Categorias mais frequentes em {}:".format(dateSelect))
print(principaisNomes)
print(principaisTotais)

plt.figure(figsize=(16, 8), dpi=350)
plt.bar(principaisNomes, principaisTotais)
plt.ylabel("Quantidade de notícias", fontsize=14)
plt.xlabel("Categorias", fontsize=14)
plt.title("Categorias mais frequentes em {}".format(dateSelect), fontsize=18)

# Annotate each bar with its total just above the bar.
coordX = -0.35
for quantidade in principaisTotais:
    plt.text(coordX, (quantidade + 50), quantidade, fontsize=12)
    coordX += 1.


In [None]:
# Register the most frequent categories of the last analysed year that are
# not in the collection yet; existing entries are kept as they are.
categoriasNovas = [
    categoria
    for categoria in list(numberArticlesYears.keys())[0:mostFrequently]
    if categoria not in dictCategoriasImportantes
]
dictCategoriasImportantes.update((categoria, []) for categoria in categoriasNovas)


In [None]:
# Rebuild every monthly series from scratch so that re-running this cell does
# not append a second copy of each month to the lists.
for chave in dictCategoriasImportantes:
    dictCategoriasImportantes[chave] = []

# Walk the months in chronological order and store, for each important
# category, the number of articles published in that month.
labelsX = []
anoDosArtigos = pandas.DatetimeIndex(df["date"]).year
for ano in sorted(listYears):
    dfYear = df.loc[anoDosArtigos == ano].copy()
    mesesDoAno = pandas.DatetimeIndex(dfYear["date"]).strftime("%m-%Y")
    for mes in sorted(set(mesesDoAno)):
        labelsX.append(mes)
        numberArticlesYears2 = dfYear.loc[mesesDoAno == mes, "category"].value_counts().to_dict()
        # A category without articles in the month counts as 0. The previous
        # `else` was attached to the `for`, so a missing category raised
        # KeyError and a stray 0 was appended to dictTodasCategorias instead.
        for chave in dictCategoriasImportantes:
            dictCategoriasImportantes[chave].append(numberArticlesYears2.get(chave, 0))

# Kept as a list because a later cell reuses `labels` for its legend.
labels = list(dictCategoriasImportantes.keys())

plt.figure(figsize=(16, 8), dpi=350)
plt.grid(True)
# One line per category, labelled with its own name; the legend is built once
# from those labels instead of being recreated on every iteration.
for categoria, linha in dictCategoriasImportantes.items():
    plt.plot(labelsX, linha, label=categoria)
plt.xticks(rotation=90)
plt.legend(loc=3)

plt.ylabel("Quantidade de notícias por mês", fontsize=14)
plt.xlabel("Meses", fontsize=14)
plt.title("Frequencia de notícias das principais categorias", fontsize=18)
plt.show()

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords as stopwordsNLTK
from string import punctuation

# Month/category pair under analysis: approval of Dilma's impeachment.
mesAnoInteresse="04-2016"
categoriaInteresse="poder"

# Other month/category pairs of interest; uncomment one pair to analyse it.
# Leak of the recording between Lula and Dilma
# mesAnoInteresse="03-2016"
# categoriaInteresse="poder"

# Olympic Games in Brazil
# mesAnoInteresse="08-2016"
# categoriaInteresse="esporte"

# Pan American Games in Toronto
# mesAnoInteresse="07-2015"
# categoriaInteresse="esporte"

# End of the state championships and start of the Libertadores
# mesAnoInteresse="05-2015"
# categoriaInteresse="esporte"

# Articles of the chosen month and category that have a title.
dfAnal = df.loc[pandas.DatetimeIndex(df["date"]).strftime("%m-%Y") == mesAnoInteresse].copy()
dfInteresse = dfAnal.loc[dfAnal["category"] == categoriaInteresse].dropna(subset=["title"])

# All titles joined into one lower-cased text, the input of the word cloud.
titulos = " ".join(dfInteresse["title"]).lower()

# Stop words: the wordcloud defaults, a hand-picked Portuguese list, the NLTK
# Portuguese list and punctuation characters.
stopwords = set(STOPWORDS)

stopwords.update(["da", "meu", "em", "você", "de", "ao", "os", "é", "para", "se", "terá", "na", "são", "fazem",
                 "à", "ma", "seu", "dos", "um", "até", "deve", "após", "faz", "foi", "dizem", "quer", "que",
                 "por", "conta", "vão", "tem", "ser", "pedir", "chama", "antes", "e", "diz", "sobre"])

stopwords.update(set(stopwordsNLTK.words('portuguese') + list(punctuation)))

wordcloud = WordCloud(stopwords=stopwords,
                      background_color='black', width=1600,
                      height=800).generate(titulos)

# Draw the cloud once. The former extra plt.imshow(wordcloud) painted the same
# image again on these axes without the bilinear interpolation.
fig, ax = plt.subplots(figsize=(16,8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_axis_off()
plt.show()

In [None]:
# Monthly series of the main categories, annotated with the events that
# coincide with the peaks. Relies on labelsX and dictCategoriasImportantes
# filled by an earlier cell.
plt.figure(figsize=(16, 8), dpi=350)
plt.grid(True)
# Each line is labelled with its own category, so the legend no longer
# depends on a `labels` variable left behind by a previous cell.
for categoria, linha in dictCategoriasImportantes.items():
    plt.plot(labelsX, linha, label=categoria)
plt.xticks(rotation=90)
plt.legend(loc=3)

# NOTE(review): shrink=2 is kept from the original; confirm it is intended,
# since the matplotlib docs describe it as a fraction of the arrow length.
plt.annotate("Olimpíadas no Brasil", xy=("09-2016", 1100), xytext=("12-2016", 1000),arrowprops=dict(facecolor='black', shrink=2),)

plt.annotate("Vazamento de conversa \nLula e Dilma", xy=("03-2016", 1080), xytext=("03-2016", 300),arrowprops=dict(facecolor='black', shrink=2),)

plt.annotate("Aprovação de impeachment \nde Dilma", xy=("04-2016", 1080), xytext=("09-2016", 900),arrowprops=dict(facecolor='black', shrink=2),)

plt.annotate("Pan-Americano", xy=("07-2015", 950), xytext=("07-2015", 1100),arrowprops=dict(facecolor='black', shrink=2),)

plt.annotate("Fim e início de\ncampeonatos\n(Estaduais e Libertadores)", xy=("05-2015", 820), xytext=("05-2015", 300),arrowprops=dict(facecolor='black', shrink=2),)

plt.ylabel("Quantidade de notícias por mês", fontsize=14)
plt.xlabel("Meses", fontsize=14)
plt.title("Frequencia de notícias das principais categorias", fontsize=18)
plt.show()

In [None]:
import nltk
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from collections import Counter

import numpy as np

# Tweets used to train the classifier further down in this cell; its 'Text'
# and 'Classificacao' columns are read below as the inputs and the labels.
dataset = pd.read_csv('Tweets_Mg.csv',encoding='utf-8')


def PreprocessamentoSemStopWords(instancia, stopwords=None):
    """Normalise a text and drop its stop words.

    Links (``http...`` up to the next whitespace) are removed, the text is
    lower-cased, and commas, periods, semicolons and hyphens are deleted.
    The remaining whitespace-separated words that are not stop words are
    joined back with single spaces.

    Args:
        instancia: Text to clean.
        stopwords: Optional collection of words to discard. When omitted, the
            NLTK Portuguese stop-word list is used; it is loaded on the first
            call and reused afterwards instead of being re-read from the
            corpus for every text.

    Returns:
        The cleaned text as a single string.
    """
    if stopwords is None:
        stopwords = getattr(PreprocessamentoSemStopWords, "_stopwords_pt", None)
        if stopwords is None:
            stopwords = frozenset(nltk.corpus.stopwords.words('portuguese'))
            PreprocessamentoSemStopWords._stopwords_pt = stopwords

    instancia = re.sub(r"http\S+", "", instancia).lower()
    # NOTE(review): unlike Preprocessamento, ':' is not removed here; kept
    # as in the original - confirm whether the difference is intended.
    for simbolo in (',', '.', ';', '-'):
        instancia = instancia.replace(simbolo, '')

    return " ".join(palavra for palavra in instancia.split() if palavra not in stopwords)

def Stemming(instancia):
    """Stem every whitespace-separated word of a text with NLTK's RSLPStemmer.

    Args:
        instancia: Text whose words will be stemmed.

    Returns:
        The stemmed words joined by single spaces.
    """
    radicalizador = nltk.stem.RSLPStemmer()
    return " ".join(map(radicalizador.stem, instancia.split()))

def Preprocessamento(instancia):
    """Strip links and basic punctuation from a text and lower-case it.

    Links (``http...`` up to the next whitespace) are removed first; then the
    text is lower-cased and every comma, period, semicolon, hyphen and colon
    is deleted. Whitespace is left untouched.

    Args:
        instancia: Text to clean.

    Returns:
        The cleaned text.
    """
    semLinks = re.sub(r"http\S+", "", instancia)
    removerPontuacao = str.maketrans("", "", ",.;-:")
    return semLinks.lower().translate(removerPontuacao)

# Training data: tweet texts and their labels.
tweets = dataset['Text'].values
classes = dataset['Classificacao'].values
# Number of labelled rows; used to split the feature matrix below instead of
# a hard-coded row count that only matches one specific version of the file.
numTweets = len(classes)
tweets = [PreprocessamentoSemStopWords(texto) for texto in tweets]

# Articles of the selected month/category that have a body text
# (mesAnoInteresse and categoriaInteresse come from an earlier cell).
dfAnal = df.loc[pandas.DatetimeIndex(df["date"]).strftime("%m-%Y") == mesAnoInteresse].copy()
dfInteresse = dfAnal.loc[dfAnal["category"] == categoriaInteresse].dropna(subset=["text"])
listInteresse = [PreprocessamentoSemStopWords(texto) for texto in dfInteresse["text"].values.tolist()]

# Tweets and articles share one vocabulary: the first numTweets rows of the
# matrix are the tweets, the remaining rows are the articles. Each text is
# preprocessed exactly once (the former second pass over the list is gone).
tweets.extend(listInteresse)
vectorizer = CountVectorizer(analyzer="word")
freq_tweets = vectorizer.fit_transform(tweets)

from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the reported score and the pie chart are reproducible.
modelo = RandomForestClassifier(n_estimators=50, random_state=42)
modelo.fit(freq_tweets[:numTweets], classes)

# 10-fold cross-validated predictions on the tweets estimate the accuracy.
resultados = cross_val_predict(modelo, freq_tweets[:numTweets], classes, cv=10)
precisao = np.around(metrics.accuracy_score(classes, resultados) * 100, 4)

# Classify the articles and count how many fall into each class.
sentimentos = modelo.predict(freq_tweets[numTweets:])
resultSentimentos = Counter(sentimentos)

plt.figure(figsize=(6, 3), dpi=150, facecolor="white")
plt.pie(resultSentimentos.values(), labels=resultSentimentos.keys(), autopct="%1.1f%%",
        shadow=True, startangle=90)
plt.title("Polaridade das notícias \n({} de {})".format(mesAnoInteresse, categoriaInteresse), fontsize=14)
# The value shown is accuracy_score, so the caption says "acurácia"
# (accuracy) rather than "precisão" (precision).
plt.xlabel("Modelo com acurácia de {}%".format(precisao), fontsize=12)
plt.show()

In [None]:

# Articles of the selected month/category that have a title
# (mesAnoInteresse and categoriaInteresse come from an earlier cell).
dfAnal = df.loc[pandas.DatetimeIndex(df["date"]).strftime("%m-%Y") == mesAnoInteresse].copy()
dfInteresse = dfAnal.loc[dfAnal["category"] == categoriaInteresse].dropna(subset=["title"])

# Articles per day, keyed by "YYYY-MM-DD" in chronological order. A single
# value_counts replaces the former loop that re-filtered the frame once per
# article (every day appeared in the list as many times as it had articles).
diasFormatados = pandas.DatetimeIndex(dfInteresse["date"]).strftime("%Y-%m-%d")
numberArticlesDia = diasFormatados.value_counts().sort_index().to_dict()

plt.figure(figsize=(16, 8), dpi=350)
plt.grid(True)
plt.plot(list(numberArticlesDia.keys()), list(numberArticlesDia.values()))
plt.xticks(rotation=90)

plt.ylabel("Quantidade de notícias por dia", fontsize=14)
plt.xlabel("Dias", fontsize=14)
plt.title("Frequencia de notícias de {} em {}".format(categoriaInteresse, mesAnoInteresse), fontsize=18)
plt.show()



In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords as stopwordsNLTK
from string import punctuation

# Single day under analysis, in "DD-MM-YYYY" format.
# NOTE(review): the original comment labelled this date as the Olympic
# football gold medal, but 17-04-2016 falls in the impeachment month selected
# earlier ("04-2016"), and the category filter below reuses
# categoriaInteresse from a previous cell - confirm the intended event.
dataInteresse="17-04-2016"

# Articles of that day and category that have a title.
dfdia = df.loc[pandas.DatetimeIndex(df["date"]).strftime("%d-%m-%Y") == dataInteresse].copy()
dfInteresse = dfdia.loc[dfdia["category"] == categoriaInteresse].dropna(subset=["title"])

# All titles joined into one lower-cased text, the input of the word cloud.
titulos = " ".join(dfInteresse["title"]).lower()

# Stop words: the wordcloud defaults, a hand-picked Portuguese list, the NLTK
# Portuguese list and punctuation characters.
stopwords = set(STOPWORDS)

stopwords.update(["da", "meu", "em", "você", "de", "ao", "os", "é", "para", "se", "terá", "na", "são", "fazem",
                 "à", "ma", "seu", "dos", "um", "até", "deve", "após", "faz", "foi", "dizem", "quer", "que",
                 "por", "conta", "vão", "tem", "ser", "pedir", "chama", "antes", "e", "diz", "sobre"])

stopwords.update(set(stopwordsNLTK.words('portuguese') + list(punctuation)))

wordcloud = WordCloud(stopwords=stopwords,
                      background_color='black', width=1600,
                      height=800).generate(titulos)

# Draw the cloud once. The former extra plt.imshow(wordcloud) painted the same
# image again on these axes without the bilinear interpolation.
fig, ax = plt.subplots(figsize=(16,8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_axis_off()
plt.show()