### SPACY PORTUGUESE MODEL


- STOP WORDS LIST:  https://github.com/explosion/spaCy/blob/master/spacy/lang/pt/stop_words.py

- PUNCTUATION: https://github.com/explosion/spaCy/blob/master/spacy/lang/pt/punctuation.py

- LANGUAGE DATA: https://spacy.io/usage/linguistic-features#language-data

- pt_core_news_lg: written text (news, media)  https://spacy.io/models/pt#pt_core_news_lg

In [6]:
# !pip install -U spacy
# python -m spacy download pt_core_news_lg

import spacy
import pandas as pd
from spacy.matcher import Matcher
#spacy.cli.download("pt_core_news_lg")
from spacy import displacy
from sklearn.model_selection import train_test_split
from spacy.tokens import DocBin


In [7]:
nlp = spacy.load("pt_core_news_lg")

In [8]:
print(nlp.pipe_names)

['tok2vec', 'morphologizer', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [9]:
nlp.pipe_names, nlp.pipeline

(['tok2vec',
  'morphologizer',
  'parser',
  'attribute_ruler',
  'lemmatizer',
  'ner'],
 [('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1a9e39fdca0>),
  ('morphologizer',
   <spacy.pipeline.morphologizer.Morphologizer at 0x1a9e3a7a9a0>),
  ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1a9e3a357b0>),
  ('attribute_ruler',
   <spacy.pipeline.attributeruler.AttributeRuler at 0x1a9e3babac0>),
  ('lemmatizer', <spacy.pipeline.lemmatizer.Lemmatizer at 0x1a9e3bb5b00>),
  ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1a9e3a355f0>)])

In [10]:
ents = nlp.get_pipe('ner').labels
ents

#miscel- laneous (MISC)

('LOC', 'MISC', 'ORG', 'PER')

In [11]:
text = "O Real Mosteiro de Santa Maria de Belém, designado comumente por Mosteiro dos Jerónimos, por ter sido destinado à Ordem de São Jerónimo, é uma obra-prima da arquitetura portuguesa. Classificado Monumento Nacional, em 1907, e inscrito na Lista do Património Mundial da UNESCO, em 1983, está vinculado ao Protocolo do Estado. A igreja, sede da Paróquia de Santa Maria de Belém, com serviço religioso e horário para vistas patrimoniais, e o claustro, secularizado no século XIX, têm acesso distinto e formam o conjunto patrimonial mais visitado do País."

In [12]:
#%%timeit
doc = nlp(text)

for ent in doc.ents:
    print(f"     {ent.text} {ent.label_}")
print("Entity not available")




     Real Mosteiro de Santa Maria de Belém LOC
     Mosteiro dos Jerónimos LOC
     Ordem de São Jerónimo ORG
     Monumento Nacional LOC
     Lista do Património Mundial da UNESCO MISC
     Protocolo do Estado MISC
     Paróquia de Santa Maria de Belém LOC
     País LOC
Entity not available


In [13]:
# for token in doc:
#     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#             token.shape_, token.is_alpha, token.is_stop)


tokenized_text = pd.DataFrame()

for i, token in enumerate(doc):
    tokenized_text.loc[i, 'text'] = token.text
    tokenized_text.loc[i, 'lemma'] = token.lemma_,
    tokenized_text.loc[i, 'pos'] = token.pos_
    tokenized_text.loc[i, 'tag'] = token.tag_
    tokenized_text.loc[i, 'dep'] = token.dep_
    tokenized_text.loc[i, 'shape'] = token.shape_
    tokenized_text.loc[i, 'is_alpha'] = token.is_alpha
    tokenized_text.loc[i, 'is_stop'] = token.is_stop
    tokenized_text.loc[i, 'is_punctuation'] = token.is_punct

tokenized_text

Unnamed: 0,text,lemma,pos,tag,dep,shape,is_alpha,is_stop,is_punctuation
0,O,"(O,)",DET,DET,det,X,True,True,False
1,Real,"(Real,)",PROPN,PROPN,nsubj,Xxxx,True,False,False
2,Mosteiro,"(Mosteiro,)",PROPN,PROPN,flat:name,Xxxxx,True,False,False
3,de,"(de,)",ADP,ADP,case,xx,True,True,False
4,Santa,"(Santa,)",PROPN,PROPN,nmod,Xxxxx,True,False,False
...,...,...,...,...,...,...,...,...,...
98,mais,"(mais,)",ADV,ADV,advmod,xxxx,True,True,False
99,visitado,"(visitar,)",VERB,VERB,acl,xxxx,True,False,False
100,do,"(do,)",ADP,ADP,case,xx,True,True,False
101,País,"(País,)",PROPN,PROPN,obl,Xxxx,True,False,False


In [14]:
ents = nlp.get_pipe('attribute_ruler').labels
ents

()

In [15]:
ents = nlp.get_pipe('parser').labels
ents

('ROOT',
 'acl',
 'acl:relcl',
 'advcl',
 'advmod',
 'amod',
 'appos',
 'aux',
 'aux:pass',
 'case',
 'cc',
 'ccomp',
 'compound',
 'conj',
 'cop',
 'csubj',
 'dep',
 'det',
 'discourse',
 'expl',
 'fixed',
 'flat',
 'flat:foreign',
 'flat:name',
 'iobj',
 'mark',
 'nmod',
 'nsubj',
 'nsubj:pass',
 'nummod',
 'obj',
 'obl',
 'obl:agent',
 'parataxis',
 'punct',
 'xcomp')

In [16]:
ents = nlp.get_pipe('tok2vec').labels
ents

()

# PORTUGUESE TWEETS ANALYSIS

In [17]:
with open("C:\\Test folder\\NoThemeTweets.csv") as csv_file:
    data = pd.read_csv(csv_file)

In [18]:
data.head()

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used
0,1031761728445530112,@Tixaa23 14 para eu ir :),Tue Aug 21 04:35:39 +0000 2018,Positivo,:)
1,1031761040462278656,@drexalvarez O meu like eu já dei na época :),Tue Aug 21 04:32:55 +0000 2018,Positivo,:)
2,1031760962372689920,Eu só queria conseguir comer alguma coisa pra ...,Tue Aug 21 04:32:37 +0000 2018,Positivo,:)
3,1031760948250456066,:D que lindo dia !,Tue Aug 21 04:32:33 +0000 2018,Positivo,:)
4,1031760895985246208,"@Primo_Resmungao Pq da pr jeito!!é uma ""oferta...",Tue Aug 21 04:32:21 +0000 2018,Positivo,:)


In [19]:
data = data[['tweet_text','sentiment']].dropna()

## DATA UNDERSTANDING

In [20]:
data.groupby('sentiment').size()

sentiment
Negativo    522707
Positivo    263107
dtype: int64

In [21]:
# class count
class_count_neg, class_count_pos = data['sentiment'].value_counts()

print("positive tweets:", class_count_pos)
print("negative tweets:", class_count_neg)

# separate classes
class_pos = data[data['sentiment'] == "Positivo"]
class_neg = data[data['sentiment'] == "Negativo"]
print("df positivo:", class_pos.shape)
print("df negativo:", class_neg.shape)



positive tweets: 263107
negative tweets: 522707
df positivo: (263107, 2)
df negativo: (522707, 2)


In [22]:
neg_under = class_neg.sample(class_count_pos)
test_under = pd.concat([neg_under, class_pos], axis = 0)
print("total neg and positive:", test_under['sentiment'].value_counts())

total neg and positive: Negativo    263107
Positivo    263107
Name: sentiment, dtype: int64


In [23]:
test_under.shape

(526214, 2)

In [24]:
x = test_under[test_under['sentiment'] == 'Negativo']
x.head()
x[['tweet_text', 'sentiment']]

Unnamed: 0,tweet_text,sentiment
174339,• ¡#Participa! 👀 O Borro :( https://t.co/y4S9k...,Negativo
470482,baekhyun ta vivo sim eu que sequestei eu juro ...,Negativo
400895,eu te amo :( https://t.co/yStnKtjo6M,Negativo
614600,@lelpardo foi sem querer :(,Negativo
285085,@gayhobbitt eu to quase doando os meus tbm mas...,Negativo
...,...,...
226909,pitiquinho :( https://t.co/Fcm1NTiMoU,Negativo
302594,"eu quero pedir ela em namoro, tanto que ia faz...",Negativo
95022,Queria uma parceria pra fazer exercícios comig...,Negativo
544282,@milgraubia Toda vez que saio do ru eu compro ...,Negativo


In [None]:
# tweets = test_under['tweet_text'][:100]
# tweets

## DATA PREPARATION for SPACY

In [26]:
# Creating a train dataset and a validation dataset
train_df = data[:90000].sample(n = 5000)
print("train dataset:", train_df.groupby('sentiment').size())

valid_df = data[90001:200000].sample(n = 5000)
print("valid dataset:",valid_df.groupby('sentiment').size())

train dataset: sentiment
Negativo    1067
Positivo    3933
dtype: int64
valid dataset: sentiment
Negativo    4417
Positivo     583
dtype: int64


In [30]:
# Converting the dataframe into a list of tuples
valid_df['tuples'] = valid_df.apply(lambda row: (row['tweet_text'],row['sentiment']), axis=1)
valid_data = valid_df['tuples'].tolist()
#train[:10]
#print(valid_data[0])
print(type(valid_data))
#print(valid_data)

train_df['tuples'] = train_df.apply(lambda row: (row['tweet_text'],row['sentiment']), axis=1)
train_data = train_df['tuples'].tolist()
#train[:10]
#print(train_data[0])
print(type(train_data))

<class 'list'>
<class 'list'>


In [77]:
def make_docs(data):
    docs = []
    for doc, label in nlp.pipe(data, as_tuples=True):
        if label == 'Negativo':
            doc.cats["positive"] = 0
            doc.cats["negative"] = 1
        else:
            doc.cats["positive"] = 1
            doc.cats["negative"] = 0
        docs.append(doc)
    return (docs)

In [79]:
## create training and validation data
#num_texts = 4500 # amount of validation and training generated

train_docs = make_docs(train_data)
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("./C:/Test folder/KnowingSpacy/self learning/train.spacy")

# valid_docs = make_docs(valid_data[:num_texts])
# second_doc_bin= DocBin(docs=valid_docs)
# second_doc_bin.to_disk("./C:/Test folder/KnowingSpacy/self learning/valid.spacy")

ValueError: Cannot get dimension 'nO' for model 'sparse_linear': value unset

## DATA PREPARATION for SCKITLEARN

In [19]:
X = test_under['tweet_text']
y = test_under['sentiment']

In [20]:
#### Create a train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [21]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((420971,), (105243,), (420971,), (105243,))

In [None]:
# pos = y_train[y_train == "Positivo"].count()
# pos.shape

In [None]:
# neg = y_train[y_train == "Negativo"].count()
# neg.shape

In [None]:
# tweets = []
# for i in X_train:
#     #print('working',  i)
#     tweets.append(i)
#     #tweets = ' '.join( [str(item) from item in tweets] )
# print(tweets)

## MODELLING

#### Training

In [54]:
# Adding the built-in textcat component to the pipeline.
from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
config = {
   "threshold": 0.5,
   "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
}
if "textcat" not in nlp.pipe_names:
    textcat = nlp.create_pipe("textcat") # Creating the pipeline component
    nlp.add_pipe(textcat)
else:
    textcat = nlp.get_pipe("textcat") 
nlp.pipe_names

['tok2vec',
 'morphologizer',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'textcat']

In [56]:
textcat.add_label("POSITIVO")
textcat.add_label("NEGATIVO")

0

In [57]:
labels_Textcat = nlp.get_pipe('textcat').labels
labels_Textcat

('POSITIVO', 'NEGATIVO')

In [33]:
# Converting the dataframe into a list of tuples
data['tuples'] = data.apply(lambda row: (row['tweet_text'],row['sentiment']), axis=1)
train = data['tuples'].tolist()
#train[:10]

In [46]:
#train[:40]

In [41]:
train[0]

('@whyparkjeon Quero um filme com a Ness e o Jacob juntos :((((((', 'Negativo')

In [39]:
import random

def load_data(limit=0, split=0.8):
    train_data=train
    # Shuffle the data
    random.shuffle(train_data)
    texts, labels = zip(*train_data)
    # get the categories for each review
    cats = [{"POSITIVO": y == "Positivo", "NEGATIVO": y == "Negativo"} for y in labels]

    # Splitting the training and evaluation data
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

n_texts=23486

# Calling the load_data() function 
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)

# Processing the final format of training data
train_data = list(zip(train_texts,[{'cats': cats} for cats in train_cats]))
train_data[:40]

[('@whyparkjeon Quero um filme com a Ness e o Jacob juntos :((((((',
  {'cats': {'POSITIVO': False, 'NEGATIVO': True}}),
 ('eu não dormi e tive paralisia :)))',
  {'cats': {'POSITIVO': True, 'NEGATIVO': False}}),
 ('Hj fiz a sobrancelha depois de quase 6 meses :) q q wilson nova cara novo tudo',
  {'cats': {'POSITIVO': True, 'NEGATIVO': False}}),
 ('quero morrer to torcendo pra eu perder todas as minhas forças sentindo essa colica e simplesmente morrer :)',
  {'cats': {'POSITIVO': True, 'NEGATIVO': False}}),
 ('@eliizviana poxa meu bem :(',
  {'cats': {'POSITIVO': False, 'NEGATIVO': True}}),
 ('sa ae mas :( https://t.co/WG173vc04I',
  {'cats': {'POSITIVO': False, 'NEGATIVO': True}}),
 ('@Veltshook Se não for assim, não dá :P',
  {'cats': {'POSITIVO': True, 'NEGATIVO': False}}),
 ('Puta que pariu isso lá é hora da minha rinite atacar :(',
  {'cats': {'POSITIVO': False, 'NEGATIVO': True}}),
 ('odeio gene gordo tenho que fazer dieta, exercicio etc enquanto tem uns fdp que consegue ser mag

In [43]:
#train_data[:10]

### Evaluating

In [58]:
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if label == "NEGATIVO":
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}


#("Number of training iterations", "n", int))
n_iter=10

In [45]:
from spacy.util import minibatch, compounding

# Disabling other components
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.initialize()

    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))

    # Performing training
    for i in range(n_iter):
        losses = {}
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)

      # Calling the evaluate() function and printing the scores
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))

ValueError: [E955] Can't find table(s) lexeme_norm for language 'pt' in spacy-lookups-data. Make sure you have the package installed or provide your own lookup tables if no default lookups are available for your language.

### Testing the model with new examples

In [36]:
# Testing the model
test_text="I hate the new series of Netflix"
doc=nlp(test_text)
doc.cats 

ValueError: Cannot get dimension 'nO' for model 'sparse_linear': value unset

### Entity Recognition

In [None]:
# Merge noun phrases and entities for easier analysis
# nlp.add_pipe("merge_entities")
# nlp.add_pipe("merge_noun_chunks")

for doc in nlp.pipe(tweets):
    for token in doc:
        if token.ent_type_ == "PER" or token.ent_type_ == "ORG":
            print(token.ent_type_, "-->", token)

In [None]:
doc = list(nlp.pipe(tweets))

In [None]:
for doc in nlp.pipe(tweets):
    for token in doc:
        if token.ent_type_ == "PER" or token.ent_type_ == "ORG":
            print(token.ent_type_, "-->", token)

In [None]:
displacy.render(doc, style = 'dep')

### Sentiment analysis of tweets

In [None]:
## Hashtags and Emoji Detection

In [None]:
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]  # Positive emoji
neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"]  # Negative emoji

In [None]:
pos = [[{'ORTH': emoji}] for emoji in pos_emoji]
neg = [[{'ORTH': emoji}] for emoji in neg_emoji]

In [None]:
matcher = Matcher(nlp.vocab)

In [None]:
def label_sentiment(matcher, doc, i , matches):
    match_id, start, end = matches[i]
    if doc.vocab.strings[match_id]== 'happy':
        doc.sentiment += 0.1 #cada vez que 1 emoji positivo aparecer soma 0.1
    elif doc.vocab.strings[match_id] == 'sad':
        doc.sentiment -= 0.1

In [None]:
#matcher.add("happy", label_sentiment, *pos)
matcher.add("happy", [*pos], on_match=label_sentiment)

# matcher.add("sad", label_sentiment, *neg)
matcher.add("sad", [*neg], on_match=label_sentiment)

In [None]:
%%timeit #PRINTS WITH SEVEN TUNES
docs = nlp.pipe(tweets) # we only want NER

for doc in docs:
    for ent in doc.ents:
        print(ent.text, ent.label_)
    print()

In [None]:
docs

In [None]:
for i in tweets:
    doc = nlp(i)
    matches = matcher(doc)
    for x in matches:
        for match_id, starts, end in x:
            string_id = doc.vocab.strings[match_id]
            span = doc[start:end]
            print(string_id, span.text)


### CREATE COUNTRIES LABEL in PT

In [None]:
countries = []
capitals = []
for i in DATASET:
  countries.append(i)
  capitals.append(DATASET[i])
import pandas as pd
df=pd.DataFrame()
df['Country'] = countries
df['Capital'] = capitals