In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import *
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn import metrics

In [3]:
dataset = pd.read_csv('DADOS-CodLab/V-oc/TRAIN/2018-Valence-oc-En-train.txt', sep='\t')

In [204]:
# Amostra dos dados
dataset.head()

Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Class,Processed Tweet
0,2017-En-30153,@liamch88 yeah! :) playing well,valence,0: neutral or mixed emotional state can be inf...,yeah play well
1,2017-En-40929,At least I don't have a guy trying to discoura...,valence,0: neutral or mixed emotional state can be inf...,least guy tri discourag anymor want never beco...
2,2017-En-22012,UPLIFT: If you're still discouraged it means y...,valence,0: neutral or mixed emotional state can be inf...,uplift still discourag mean listen wrong voic ...
3,2017-En-30837,"...at your age, the heyday in the blood is tam...",valence,0: neutral or mixed emotional state can be inf...,age heyday blood tame'
4,2017-En-30838,i was so embarrassed when she saw us i was lik...,valence,-2: moderately negative emotional state can be...,embarrass saw us like knvfkkjg think we'r stal...


In [5]:
#categorias
categories = dataset['Intensity Class'].value_counts().index.tolist()

In [129]:
nltk.download('stopwords') realizando o download das stopwords

### Pré-processamento

----------------
Fases do pré-processamento
* Remoção de referências a perfis do tweeter;
* Remoção de Números, pontuações e caracteres especiais;
* Substituição de letras maiúsculas por minúsculas;
* Remoção de stopwords;
* Remoção de stemming;


In [4]:
dataset['Processed Tweet'] = np.nan # coluna que guardará os tweets processados

In [6]:
stemmer = PorterStemmer()
stemmer = 
for ind, tweet in enumerate(dataset['Tweet']):
    t = tweet
    t = re.sub(r'(\b)?@\w+','',t) #removendo referências a outros perfis
    t = re.sub(r'(\b)?#\w+','',t) #removendo hashtags
    t = re.sub(r'(\b)?http://\w+','',t) #removendo links
    t = re.sub(r'(\w+)?\d(\w+)?','',t) # removendo números
    t = re.sub(r'[^A-Za-z0-9\' ]+','',t) # removendo caracteres especiais
    t = re.sub(r' +',' ',t) # removendo espaços duplos
    t = t.lower() # removendo letras maiúsculas
    t = t.strip() # remover espaços vazios no início e no fim
    
    # removendo stopwords
    nsw = ''
    for word in t.split(' '):
        if word not in stopwords.words('english'):
            nsw += word+' '
    nsw = nsw.strip()
    
    # isolando o radical das palavras
    nsw = nsw.split(' ')
    ste = [stemmer.stem(word) for word in nsw]
    processedTweet = ' '.join(ste)
    
    dataset['Processed Tweet'][ind] = processedTweet

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


### Treinamento do Modelo - Naive Bayse

In [288]:
len(dataset['Intensity Class'][0:826])+len(dataset['Intensity Class'][826:1181])

1181

In [7]:
#dataset de treino
train = pd.DataFrame()
train['Processed Tweet'] = dataset['Processed Tweet'][0:826]
train['Intensity Class'] = dataset['Intensity Class'][0:826]

#dataset de teste
test = pd.DataFrame()
test['Processed Tweet'] = dataset['Processed Tweet'][826:1181]
test['Intensity Class'] = dataset['Intensity Class'][826:1181]

In [8]:
# Indexando as palavras do corpus
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train['Processed Tweet'].values.tolist())
X_train_counts.shape

(826, 2573)

In [9]:
# calculando as frequencias das palavras
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(826, 2573)

In [10]:
# treinando o modelo
#clf = MultinomialNB().fit(X_train_tfidf, train['Intensity Class'].values)
clf = SGDClassifier().fit(X_train_tfidf, train['Intensity Class'].values)



In [11]:
docs_new = test['Processed Tweet'].values.tolist()
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, category))

'whenev pout want adrian appear tell stop pout els' => 0: neutral or mixed emotional state can be inferred
'love true taught fear illus' => 1: slightly positive emotional state can be inferred
'team must draw hat daili person' => 1: slightly positive emotional state can be inferred
'pastor feet away shoot victim protest say skeptic offici stori' => -3: very negative emotional state can be inferred
'shot black polic woman typic looney toon think' => -2: moderately negative emotional state can be inferred
"we'r discipleship train detox someth" => -2: moderately negative emotional state can be inferred
'come let whatev' => 0: neutral or mixed emotional state can be inferred
'peopl hit tri shake talk hide commun' => -3: very negative emotional state can be inferred
'punchlin king back citi light' => -1: slightly negative emotional state can be inferred
'watch driven food go devon ave eat nihari make gleeful af' => 1: slightly positive emotional state can be inferred
'meanwhil get train hop

In [15]:
print(metrics.classification_report(test['Intensity Class'].values, predicted, target_names=categories))

                                                         precision    recall  f1-score   support

    0: neutral or mixed emotional state can be inferred       0.00      0.00      0.00        31
-2: moderately negative emotional state can be inferred       0.22      0.30      0.26        67
   1: slightly positive emotional state can be inferred       0.19      0.11      0.14        46
      -3: very negative emotional state can be inferred       0.34      0.45      0.39        98
       3: very positive emotional state can be inferred       0.21      0.17      0.19        46
 2: moderately positive emotional state can be inferred       0.04      0.03      0.04        30
  -1: slightly negative emotional state can be inferred       0.29      0.24      0.26        37

                                            avg / total       0.22      0.25      0.23       355



In [13]:
test['Predicted'] = predicted

In [14]:
# Acurácia
np.mean(test['Predicted'] == test['Intensity Class'])

0.24507042253521127

In [None]:
#Matriz de confusão
metrics.confusion_matrix(test['Intensity Class'], test['Predicted'])