# Encontrando Padrões em texto com Clustering 

In [23]:
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering

In [4]:
tweets = pd.read_csv("nCoV_tweets.csv", index_col=0, parse_dates=['dt'])
docs = ["curso de data, DATA science", "tutorial de data analysis", "não sei mais o que escrever"]

In [12]:
bag_of_words_transformer = CountVectorizer(binary = False ,analyzer='word', lowercase=False, stop_words=['de','que'], ngram_range=(1,1), min_df=1)
mx = bag_of_words_transformer.fit_transform(docs).todense()
terms = bag_of_words_transformer.get_feature_names()
pd.DataFrame(mx, columns=terms, index=docs)


#lowercase
#analyzer='char'
#strip_accents = 'unicode'
#binary = True

#stop_words=['de']
#ngram_range=(1,1)   unigrama,bigrama, trigrama
#min_df

Unnamed: 0,DATA,analysis,curso,data,escrever,mais,não,science,sei,tutorial
"curso de data, DATA science",1,0,1,1,0,0,0,1,0,0
tutorial de data analysis,0,1,0,1,0,0,0,0,0,1
não sei mais o que escrever,0,0,0,0,1,1,1,0,1,0


In [16]:
#Matriz de frequencia de palavras
# tf-idf ~= frequencia da palavra no documento * inverso da frequencia da palavra em todos os documentos
bag_of_words_transformer = TfidfVectorizer(use_idf=False,norm='l1')
mx = bag_of_words_transformer.fit_transform(docs).todense()
terms= bag_of_words_transformer.get_feature_names()
pd.DataFrame(mx, columns=terms, index=docs)


#norm = Each output row will have unit norm, either: * 'l2' Sum of squares of vector elements is 1
#The consine similarity between two vectors is their dot product when l2 norm has been applied.
#* 'l1': Sum of absolute values of vector elements is 1. See proporcessing.normalize

#use_idf = False
#use_idf = False e norm = l1, frequencia simples

Unnamed: 0,analysis,curso,data,de,escrever,mais,não,que,science,sei,tutorial
"curso de data, DATA science",0.0,0.2,0.4,0.2,0.0,0.0,0.0,0.0,0.2,0.0,0.0
tutorial de data analysis,0.25,0.0,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.25
não sei mais o que escrever,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.2,0.0,0.2,0.0


In [17]:
#NLTK
from nltk.stem import SnowballStemmer

In [18]:
stemmer = SnowballStemmer(language='portuguese')
stemmer.stem("analisado"), stemmer.stem("analise")

('analis', 'analis')

In [39]:
bag_of_words_transformer = CountVectorizer(min_df=4, stop_words='english', ngram_range=(3,3))
mx1 = bag_of_words_transformer.fit_transform(tweets['txt'])#to.dense()
mx1.shape

(6706, 618)

In [40]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

In [41]:
for k in range(2,10):
    cluster = make_pipeline(MaxAbsScaler(),  KMeans(n_clusters=k, random_state=0))
    cluster.fit(mx1)
    p= cluster.predict(mx1)
    
    sil = silhouette_score(mx1, p)
    print(f"k = {k} - Silhoutte: {sil}")

k = 2 - Silhoutte: 0.7745554448279538
k = 3 - Silhoutte: 0.7512903428233243
k = 4 - Silhoutte: 0.7909093389391644
k = 5 - Silhoutte: 0.7855735812615643
k = 6 - Silhoutte: 0.7865938949670536
k = 7 - Silhoutte: 0.7944573862250106
k = 8 - Silhoutte: 0.794740266937335
k = 9 - Silhoutte: 0.7105856322546324


In [42]:
terms = bag_of_words_transformer.get_feature_names()

In [44]:
k=8
cluster = make_pipeline(MaxAbsScaler(), KMeans(n_clusters=k, random_state=0))
cluster.fit(mx1)
p = cluster.predict(mx1)

for c in np.unique(p):
    print(f'\nCluster {c} - Size {(p == c).sum()}')
    rank = pd.Series(np.array(mx1[p==c].mean(axis=0)).squeeze(), index=terms).sort_values().tail(20)
    print(rank)


Cluster 0 - Size 6466
confirmed cases coronavirus              0.002011
coronavirus death toll                   0.002165
30 hours birth                           0.002320
accidentally leaked real                 0.002474
tencent accidentally leaked              0.002629
just 30 hours                            0.002784
zhuang bing du                           0.002938
novel coronavirus 2019                   0.002938
guan zhuang bing                         0.002938
cruise ship japan                        0.003093
2019 novel coronavirus                   0.003248
news china coronavirus                   0.003402
world health organization                0.003712
coronaoutbreak coronanews ncov2019       0.004176
coronavirus coronaoutbreak coronanews    0.004176
coronavirus asiannetwalking https        0.004330
health coronavirus asiannetwalking       0.004330
coronavirus 2019 ncov                    0.005104
amid coronavirus outbreak                0.005104
coronavirus outbreak https 

In [47]:
cluster.transform(mx1)

array([[0.02523771, 2.64575131, 3.22102468, ..., 2.47019268, 2.50312305,
        3.16227766],
       [0.02523771, 2.64575131, 3.22102468, ..., 2.47019268, 2.50312305,
        3.16227766],
       [0.02523771, 2.64575131, 3.22102468, ..., 2.47019268, 2.50312305,
        3.16227766],
       ...,
       [1.73035876, 3.16227766, 3.65718471, ..., 3.01692755, 3.04394892,
        3.60555128],
       [0.02523771, 2.64575131, 3.22102468, ..., 2.47019268, 2.50312305,
        3.16227766],
       [0.02523771, 2.64575131, 3.22102468, ..., 2.47019268, 2.50312305,
        3.16227766]])

In [48]:
tweets['cluster'] = p
for c in np.unique(p):
    print('CLuster {} = {}'.format(c, tweets[tweets['cluster']==c]['txt'].iloc[0]))
    print()

CLuster 0 = what the actual -

CLuster 1 =  Coronavirus-Update !!#Wuhan #CoronavirusOutbreak #China #Coronavirus #PrayForWuhan #coronavirus... https://t.co/z183aeekZN

CLuster 2 = Wuhan Test Lab Opens; CDC Ships Diagnostic Kits: Virus Update SEE DETAILS AT ==&gt; https://t.co/n8oWlcKo0x #virus... https://t.co/7slO1kBzPw

CLuster 3 = Coronavirus Latest Updates: Everything You Need to Know SEE DETAILS AT ==&gt; https://t.co/9orX4j6BuU #virus... https://t.co/EZkVAQuI3G

CLuster 4 = @ABSCBNNews @raphbosano Fully Automated Live #CoronaVirus Updates. 
* Live Statistics: Mortality rate, recovery rat... https://t.co/RXbHWuWM58

CLuster 5 = Vals Is Here Surprise That Special Someone Now

You can reach Us On 0205414305or WhatsApp 0555171905 
For The Bes... https://t.co/E9CSZvxFCT

CLuster 6 = Global Shipping has Been Hit by the Coronavirus

Now Goods are Getting Stranded

by @hannaziady
via @CNNBusiness... https://t.co/NO1WbANzDh

CLuster 7 = A 37-year-old woman has become the 15th person in Aus