### Imports

In [123]:
# import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#gensim
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec


from sklearn.cluster import KMeans

#### Jupyter Configuration

In [87]:
pd.options.display.max_colwidth = 100

### Analysing data

#### Loading CSV

In [156]:
#df = pd.read_csv('../raw_data/Tweets_Mg.csv', encoding='utf-8')
df = pd.read_csv('../raw_data/mito_total_tweets.csv', encoding='utf-8')
df.sample(10).T

Unnamed: 0,438595,372526,276790,186142,261418,378494,523680,548478,270538,501095
url,https://twitter.com/ParaisoLeandro/status/1050799278380208128,https://twitter.com/lucaskorzoff/status/1056691363289460736,https://twitter.com/DiogoFrenandes/status/986732903018180609,https://twitter.com/EdsonSilva/status/976126931245633536,https://twitter.com/erickcassiaano/status/1039360324099559424,https://twitter.com/garciajoaoedua1/status/1056580855299260416,https://twitter.com/razevedorsa/status/1036655373355241472,https://twitter.com/DalmaMusic_/status/974307849407889408,https://twitter.com/crvgeira/status/972148447695564800,https://twitter.com/xcelestiales/status/1046420019343708166
date,2018-10-12 17:24:05,2018-10-28 23:37:08,2018-04-18 22:27:11,2018-03-20 16:02:50,2018-09-11 03:49:46,2018-10-28 16:18:00,2018-09-03 16:41:15,2018-03-15 15:34:27,2018-03-09 16:33:46,2018-09-30 15:22:28
content,"@org_conservador @leandroruschel E isso está acontecendo, mas não são os apoiadores de Bolsonaro...",Agora a bosta da Globo vem babar ovo do mito,Gostei de um vídeo @YouTube https://t.co/qYqHtaVEpm canal que fez minha Cartoon (mito),"@churrycristaldo Churry mito volta, eu te amo, #ajudanoix",Nico López é um mito kkkkk,@SchuldinerSieg Aeeee é Bolsonaro fazendo milagres e causas impossíveis é mito!!!,Cadê o alto índice de rejeição do Mito??? Eles estão desesperados!!! KKKKKKKK https://t.co/niAQ9...,@MetaErmal Ahahahaha mito 😂😂😂,MIOTO MITO,@Guerra00001 @Darthmau1 @asgardicn @Aisu_boi @beyoncedefense fico com a frase mais clichê do mun...
id,1050799278380208128,1056691363289460736,986732903018180609,976126931245633536,1039360324099559424,1056580855299260416,1036655373355241472,974307849407889408,972148447695564800,1046420019343708166
reply_count,0,0,0,0,0,0,0,0,1,0
retweet_count,0,0,0,0,0,0,0,0,0,0
like_count,0,2,0,0,0,0,0,0,0,0
quote_count,0,0,0,0,0,0,0,0,0,0
lang,pt,pt,pt,pt,pt,pt,pt,tl,es,pt
username,ParaisoLeandro,lucaskorzoff,DiogoFrenandes,EdsonSilva,erickcassiaano,garciajoaoedua1,razevedorsa,DalmaMusic_,crvgeira,xcelestiales


#### Cheking type of data and missing values

In [157]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 559591 entries, 0 to 559590
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   url              559591 non-null  object
 1   date             559591 non-null  object
 2   content          559591 non-null  object
 3   id               559591 non-null  int64 
 4   reply_count      559591 non-null  int64 
 5   retweet_count    559591 non-null  int64 
 6   like_count       559591 non-null  int64 
 7   quote_count      559591 non-null  int64 
 8   lang             559591 non-null  object
 9   username         559591 non-null  object
 10  displayname      559386 non-null  object
 11  description      450439 non-null  object
 12  verified         559591 non-null  bool  
 13  created          559591 non-null  object
 14  followers_count  559591 non-null  int64 
 15  friends_count    559591 non-null  int64 
 16  location         413345 non-null  object
 17  protected 

#### Dataset considerations

- Most of the content has NaN values
- 26 columns in total. 7 columns are object type, 2 columns are int type, 17 columns are float type
- 15 columns are complety empty, latitude and longitude only has 1% of values filled, the user location is present in 67% of the total rows

#### Check for unique values

In [30]:
df.nunique()

Unnamed: 0                   8199
Created At                   7945
Text                         5765
Geo Coordinates.latitude       57
Geo Coordinates.longitude      57
User Location                1591
Username                     3907
User Screen Name             3966
Retweet Count                 113
Classificacao                   3
Observação                      1
Unnamed: 10                     0
Unnamed: 11                     0
Unnamed: 12                     0
Unnamed: 13                     0
Unnamed: 14                     0
Unnamed: 15                     0
Unnamed: 16                     0
Unnamed: 17                     0
Unnamed: 18                     0
Unnamed: 19                     0
Unnamed: 20                     0
Unnamed: 21                     0
Unnamed: 22                     0
Unnamed: 23                     0
Unnamed: 24                     0
dtype: int64

In [35]:
df['User Location'].value_counts()

Brasil                           591
Minas Gerais                     354
Belo Horizonte - Minas Gerais    234
Belo Horizonte, Brasil           227
Belo Horizonte                   131
                                ... 
SABARÁ-MG                          1
SABARÁ MG                          1
João Monlevade, Brasil             1
zimbabwe                           1
Brasil - Angola - Portugal         1
Name: User Location, Length: 1591, dtype: int64

#### Observations

- There's 3907 unique users 47% of the total number of tweets observed
- There's only 3 classification types (Positivo, Negativo, Neutro)
- The dataset are 'balanced' not 100% but it's okay
- The most common regions of the tweets are Brasil, Minas Gerais, Belo Horizonte respectively

### Data Cleaning

dropping all columns that:

- all null records
- less than 40% data
- all unique records such as id's

In [55]:
df = df[['Created At', 'Text', 'User Location', 'Username', 'Retweet Count', 'Classificacao']]

#### Removing duplicates based on Text

In [56]:
df = df.drop_duplicates(subset=['Text'], keep='first')

#### Defining a clean encoder

In [134]:
import re
import string

from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import  make_column_transformer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords 


class CleaningEncoder(BaseEstimator, TransformerMixin):
    '''
    Receives raw text data from the tweets and returns clean, ready to process data:
    turns all into lower case;
    removes punctuation;
    removes stopwords;
    removes numbers;
    removes users' handles
    
    '''
    def __init__(self):
        pass
                        
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        #assert isinstance(X, pd.DataFrame)
        X_ = X.copy()      
        
        #removin NaN values
        X_ = X_.fillna('nenhuma descrição')
        
        #removing links
        X_ = X_.apply(lambda x: re.sub(r"http\S+", "", x))
        X_ = X_.apply(lambda x: re.sub(r"www.\S+", "", x))
                
        #removing punctuation from each tweet
        new_punc = list(string.punctuation)
        del new_punc[2]
        del new_punc[-11]
        for punctuation in new_punc:
            X_ = X_.str.replace(punctuation, '')
            
        #removing numbers
        X_ = X_.str.replace('\d+', '')
        
        #tokenizing - removes handles, applies lowercase, keeps #, shortens letter repetitions to three
        #ex: kkkkk, kkkkkk, kkkkkkkkk = kkk
        tkn = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
        X_ = X_.apply(lambda x: tkn.tokenize(x))
        
        #removing stopwords
        stop_words = stopwords.words('portuguese')
        stop_words.remove('não')
        addicional = [
            'd', 'ta', 'q', 'tah', 'tao', 'eh', 'vc', 'voce',
            'pq', 'quedê', 'mto', 'mt', 'bj', 'bjs','vcs','bb',
            'b', 'sao', 'axo', 'mano', 'ae', 'neh', 'aí', '...', 'rt',
            'kkk', 'porque', 'né', 'no', 'iai', 'tbm', 'msm', 'jah']
        stop_words.extend(addicional)
        
        #X_ = X_.apply(lambda x: [word for word in x if len(word) > 1])
        X_ = X_.apply(lambda x: ' '.join(word for word in x if word not in (stop_words)))
        
        return X_

#### Execute cleaning process on text column

In [135]:
clean = CleaningEncoder()

df['text_clean'] = clean.fit_transform(df['Text'])

  X_ = X_.str.replace(punctuation, '')
  X_ = X_.str.replace('\d+', '')


In [136]:
df['text_clean']

0                                            catedral santo antônio governador valadaresmg
1                                                        governador valadares minas gerais
2                                                        governador valadares minas gerais
3                                                                                         
4         psol vai questionar aumento vereadores prefeito bh justiça politica estado minas
                                               ...                                        
8176                             trio preso suspeito roubo tráfico abuso sexual uberlândia
8186            trio preso suspeito roubo tráfico abuso sexual uberlândia #operaçãobetalab
8191                    trio preso suspeito roubo tráfico abuso sexual uberlândia #timbeta
8197    trio preso suspeito roubo tráfico abuso sexual uberlândia autores molestado vítima
8198                   trio suspeito roubo cargas preso santa luzia mg #rmg #recordtvminas

#### Removing duplicates again on cleaned data

In [137]:
df = df.drop_duplicates(subset=['text_clean'], keep='first')

#### Converting Date column to DateTime Object and creating two new columns:month and year

In [138]:
df['Created At'] = pd.to_datetime(df['Created At'])
df['month'] = df['Created At'].dt.month
df['year'] = df['Created At'].dt.year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Created At'] = pd.to_datetime(df['Created At'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['month'] = df['Created At'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = df['Created At'].dt.year


#### Storing column 'Classificacao' as our y_pred

In [94]:
y_pred = df['Classificacao']
df.drop(columns=['Classificacao'], inplace=True)

KeyError: 'Classificacao'

### Feature Engineering

#### Converting the "text_clean" column in the format supported by embeddings.

In [141]:
sent = [row for row in df["text_clean"]]
#use Gensim Phrases package to automatically detect common phrases (bigrams) from a list of sentences.
# phrases = Phrases(sent, min_count=1, progress_per=50000)
# bigram = Phraser(phrases)
# sentences = bigram[sent]

sentences = []
for i in range(0, len(sent)):
    sentences.append(sent[i].split(' '))

#sentences

In [142]:
#Initializing the word2vec model

import multiprocessing

w2v_model = Word2Vec(min_count=4,
                     window=5,
                     vector_size =300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     seed= 42,
                     workers=multiprocessing.cpu_count()-1)


#building vocab of the word2vec model from the custom data
w2v_model.build_vocab(sentences, progress_per=50000)

# https://towardsdatascience.com/unsupervised-sentiment-analysis-a38bf1906483

In [143]:
#training the word2vec model
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

(68412, 890970)

In [152]:
# checking similar words to battery in the corpus
w2v_model.wv.most_similar(positive=["lula"])

[('minas', 0.9997169375419617),
 ('estado', 0.9996999502182007),
 ('mg', 0.9996914863586426),
 ('drogas', 0.9996911287307739),
 ('#timbeta', 0.999688446521759),
 ('governo', 0.9996777176856995),
 ('presos', 0.9996697902679443),
 ('en', 0.99966961145401),
 ('pimentel', 0.9996667504310608),
 ('gerais', 0.9996564388275146)]

In [145]:
#saving the word2vec model
w2v_model.save("../raw_data/word2vec.model")

In [146]:
#Loading the word2vec model
word_vectors = Word2Vec.load("../raw_data/word2vec.model").wv

In [147]:
#Feeding the embeddings to a KMeans model to cluster words into positive, negative, and neutral clusters
model = KMeans(n_clusters=3, max_iter=1000, random_state=42, n_init=50).fit(X=word_vectors.vectors.astype('double'))

In [149]:
# check what we have in each cluster to label the clusters
word_vectors.similar_by_vector(model.cluster_centers_[0], topn=200, restrict_vocab=None)

[('minas', 0.9999662637710571),
 ('estado', 0.9999570846557617),
 ('drogas', 0.9999457597732544),
 ('mg', 0.9999454021453857),
 ('gerais', 0.9999250173568726),
 ('presídio', 0.9999250173568726),
 ('governo', 0.9999234080314636),
 ('governador', 0.9999186396598816),
 ('pimentel', 0.999916136264801),
 ('el', 0.9999133348464966),
 ('tráfico', 0.9999099373817444),
 ('amarela', 0.9999091029167175),
 ('en', 0.999907374382019),
 ('#timbeta', 0.9999069571495056),
 ('roubo', 0.9999060034751892),
 ('sobre', 0.9999038577079773),
 ('un', 0.9999029636383057),
 ('le', 0.9999009966850281),
 ('las', 0.9998959898948669),
 ('bh', 0.999895453453064),
 ('após', 0.9998951554298401),
 ('secretaria', 0.9998944997787476),
 ('presos', 0.9998908638954163),
 ('dois', 0.9998884201049805),
 ('mi', 0.9998874664306641),
 ('lo', 0.9998872876167297),
 ('não', 0.9998862743377686),
 ('ter', 0.9998840689659119),
 ('contra', 0.9998803734779358),
 ('politica', 0.999878466129303),
 ('#mg', 0.9998780488967896),
 ('la', 0.999