# Reading in our data

In [11]:
import pandas as pd

In [12]:
pd.set_option("display.max_colwidth", 200)
pd.set_option("display.max_columns", 200)

In [13]:
# Downloaded from 
# https://www.kaggle.com/augustop/portuguese-tweets-for-sentiment-analysis?select=NoThemeTweets.csv
corpus = pd.read_csv("NoThemeTweets.csv")
corpus.shape

(785814, 5)

In [14]:
corpus.head(2)

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used
0,1031761728445530112,@Tixaa23 14 para eu ir :),Tue Aug 21 04:35:39 +0000 2018,Positivo,:)
1,1031761040462278656,@drexalvarez O meu like eu já dei na época :),Tue Aug 21 04:32:55 +0000 2018,Positivo,:)


In [15]:
corpus['sentiment'].value_counts()

Negativo    522707
Positivo    263107
Name: sentiment, dtype: int64

In [16]:
# random state keeps it reproducible
df = pd.concat([corpus[corpus['sentiment'] == 'Negativo'].sample(25000,random_state=1899),
                corpus[corpus['sentiment'] == 'Positivo'].sample(25000,random_state=1899)])

df['sentiment'].value_counts()

Negativo    25000
Positivo    25000
Name: sentiment, dtype: int64

In [17]:
df = df[['tweet_text','sentiment']]

## Stopwords

In [18]:
# pip install nltk
# nltk.download('stopwords')
import nltk

from nltk.corpus import stopwords
stopwords_pt = nltk.corpus.stopwords.words('portuguese')

In [19]:
# nltk.corpus.stopwords.words('portuguese')[:10]

len(stopwords_pt)

204

# Simple vectorizing with CountVectorizer

In [20]:
# from sklearn.feature_extraction.text import TfidfVectorizer 
# from sklearn.feature_extraction.text import CountVectorizer 
# vectorizer = CountVectorizer(max_features=1000) 
# vectors = vectorizer.fit_transform(df.tweet_text) 
# words_df = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names()) 
# words_df.head()

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(max_features=10000)

count_vectors = count_vectorizer.fit_transform(df.tweet_text)

In [22]:
words_c = pd.DataFrame(count_vectors.toarray(), columns=count_vectorizer.get_feature_names())

words_c.head(5)

Unnamed: 0,00,000,00am,00h,00hrs,00pm,00xblck,01,02,03,05,0500,06,07,08,0800,08fbk7gqx4,09,10,100,1000,100diasdecodigo,10h,10k,10km,10x,11,11h,12,12h,13,13h,14,14h,15,150,15h,16,16h,17,170,17h,18,180,18h,19,196qian,1991,1999,19h,1a,1d,1h,1k,1kg,1kilo,1m,1o,1st,1x,1ª,1º,20,200,2000,2002,2003,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2022,20h,20min,21,21h,22,22h,23,230hj,23h,24,24h,24hrs,25,26,27,28,29,2h,2k,...,yeontazn,yeri,yes,yey,yg,yixing,yo,yoga,yoon,yoongi,yoongibaep,yooniverse_mp3,york,you,young,your,yourself,youtube,youtuber,youtubers,yt,yuri,zap,zayn,zeavy,zehdeabreu,zelune,zero,zica,zoa,zoado,zoando,zoar,zodíaco,zoeira,zona,zoom,zpedro99,zuando,zueira,zuera,zumbi,zuzz0,zé,àquela,às,água,álbum,álbuns,álcool,ápice,árbitro,área,áreas,árvore,ás,áudio,áudios,âncora,ângulo,ânimo,ée,égua,épico,época,éramos,és,ética,íamos,ícone,ídem,ídola,ídolo,ídolos,índia,índios,índole,íris,óbvio,óculos,ódio,óleo,órgãos,ótima,ótimas,ótimo,ótimos,ônibus,última,últimas,último,últimos,única,únicas,único,únicos,úteis,útero,útil,갓세븐
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
df.head(2)

Unnamed: 0,tweet_text,sentiment
166793,"@hopihari todos que estavam lá. Não adianta chorar leite derramado, já gastei uma grana alta e pelo menos tenho que lembrar da sensação de descer a Montezum, mesmo que foi depois de quase 2h:30min...",Negativo
213700,"Dólar subiu :( - R$4,12 às 17:00",Negativo


In [24]:
words_c['chorar'].head(2)

0    1
1    0
Name: chorar, dtype: int64

# Exploring parameters in TF-IDF

Parameters = features = names OR characteristics

Hyperparameters = values that we use to adjust our model

Also look at: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
## WITH STOPWORDS
vectorizer = TfidfVectorizer(stop_words=stopwords_pt)

vector_matrix = vectorizer.fit_transform(df.tweet_text)

vector_matrix

<50000x77183 sparse matrix of type '<class 'numpy.float64'>'
	with 401655 stored elements in Compressed Sparse Row format>

In [27]:
## WITHOUT STOPWORDS
vectorizer = TfidfVectorizer()

vector_matrix = vectorizer.fit_transform(df.tweet_text)

vector_matrix

<50000x77358 sparse matrix of type '<class 'numpy.float64'>'
	with 568956 stored elements in Compressed Sparse Row format>

In [28]:
## Considers terms in at least in 10 documents and not more than in 70% of the corpus

## min_df : float or int, default=1
##    When building the vocabulary ignore terms that have a document
##    frequency strictly lower than the given threshold. This value is also
##    called cut-off in the literature.
##    If float in range of [0.0, 1.0], the parameter represents a proportion
##    of documents, integer absolute counts.
## This parameter is ignored if vocabulary is not None.

## max_df : float or int, default=1.0
##    When building the vocabulary ignore terms that have a document
##    frequency strictly higher than the given threshold (corpus-specific
##    stop words).
##    If float in range [0.0, 1.0], the parameter represents a proportion of
##    documents, integer absolute counts.
##    This parameter is ignored if vocabulary is not None.

vectorizer = TfidfVectorizer(min_df=50, max_df=0.7)

vector_matrix = vectorizer.fit_transform(df.tweet_text)

vector_matrix

<50000x1132 sparse matrix of type '<class 'numpy.float64'>'
	with 394423 stored elements in Compressed Sparse Row format>

In [29]:
## Considers the most 10k popular terms

## max_features : int, default=None
##    If not None, build a vocabulary that only consider the top
##    max_features ordered by term frequency across the corpus.
##    This parameter is ignored if vocabulary is not None. 
    
vectorizer = TfidfVectorizer(max_features=10000)

vector_matrix = vectorizer.fit_transform(df.tweet_text)

vector_matrix

<50000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 489829 stored elements in Compressed Sparse Row format>

In [30]:
vectorizer = TfidfVectorizer()

vector_matrix = vectorizer.fit_transform(df.tweet_text)

vector_matrix

<50000x77358 sparse matrix of type '<class 'numpy.float64'>'
	with 568956 stored elements in Compressed Sparse Row format>

## Vectorizing with TF-IDF

In [31]:
vectorizer = TfidfVectorizer(stop_words=stopwords_pt,min_df=50,max_df=0.9)

vector_matrix = vectorizer.fit_transform(df.tweet_text)

vector_matrix

<50000x1013 sparse matrix of type '<class 'numpy.float64'>'
	with 227939 stored elements in Compressed Sparse Row format>

In [32]:
words = pd.DataFrame(vector_matrix.toarray(), columns=vectorizer.get_feature_names())

words.head(2)

## shows the tweets in df above
# words.index = df.tweet_text

Unnamed: 0,00,10,100,11,12,13,15,16,17,18,20,2018,30,50,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaa,abraçar,abraço,abrir,acaba,acabar,acabei,acabou,aceito,acha,achando,achar,achei,acho,acontece,acontecendo,acontecer,aconteceu,acordar,acordei,acordo,acredito,adoro,ae,af,aff,agora,agr,agradeço,aguento,ah,ai,ainda,ajuda,ajudar,algo,alguem,algum,alguma,algumas,alguns,alguém,ali,almoço,além,ama,amanha,amanhã,amar,amei,amiga,amigas,amigo,amigos,amizade,amo,amor,amorzinho,amp,ana,anda,andar,aniversário,anjinho,anjo,ano,anos,ansiedade,antes,apaixonada,aparece,apenas,apesar,apoio,app,aprender,aq,aqui,ariana,assim,assistindo,...,to,toda,todas,todo,todos,tomar,tomara,top,tou,trabalhar,trabalho,triste,tristeza,trocar,três,tt,tudo,turma,turno,tv,tweet,tweets,twitter,tá,tão,têm,tô,ultimamente,umas,uns,usa,usando,usar,uso,vai,vais,vale,valeu,vamo,vamos,vc,vcs,vdd,veio,veja,vejo,velho,vem,vendo,venha,ver,verdade,vergonha,vez,vezes,vi,via,viagem,vida,video,vim,vindo,vir,vista,visto,viu,viver,vivo,vo,voce,volta,voltar,voltei,volto,voltou,vontade,votar,voto,vou,voz,vá,várias,vários,vão,vê,vídeo,vídeos,you,youtube,água,época,és,ódio,ótima,ótimo,ônibus,última,último,única,único
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.274524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.510891,0.0,0.0,0.0,0.486069,0.0,0.0,0.0,0.487742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# TF-IDF not only counts the frequency of each term in each document/tweet
# It also adjusts it to its relevance in the corpus

words['chorar'].head(2)

0    0.326766
1    0.000000
Name: chorar, dtype: float64

# Creating and training our model

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [35]:
# Set up X and y
# X = features
# Y = labels

# Word counts
X = words

# Labels: positivo/negativo
y = df.sentiment

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [37]:
#%%time
# clf = LogisticRegression(C=1e9, solver='lbfgs', max_iter=4000)
# clf = RandomForestClassifier(n_estimators=50)
# clf = LinearSVC()
# clf = MultinomialNB()
#clf.fit(X, y)

## LogReg

In [57]:
%%time
clf_log = LogisticRegression(C=1e9, solver='lbfgs', max_iter=4000)
clf_log.fit(X_train, y_train)

CPU times: user 1min 27s, sys: 13 s, total: 1min 40s
Wall time: 38.5 s


LogisticRegression(C=1000000000.0, max_iter=4000)

## Random Forest

In [58]:
%%time

clf_random = RandomForestClassifier(n_estimators=50)

clf_random.fit(X_train, y_train)

CPU times: user 2min 20s, sys: 787 ms, total: 2min 21s
Wall time: 2min 21s


RandomForestClassifier(n_estimators=50)

## LinearSVC

In [59]:
%%time

clf_linear = LinearSVC()
clf_linear.fit(X_train, y_train)

CPU times: user 1.08 s, sys: 8.18 ms, total: 1.09 s
Wall time: 1.08 s


LinearSVC()

## Multinomial

In [60]:
%%time

clf_multi = MultinomialNB()
clf_multi.fit(X_train, y_train)

CPU times: user 563 ms, sys: 135 ms, total: 699 ms
Wall time: 426 ms


MultinomialNB()

# Evaluating our model

In [61]:
clf_log.score(X_test, y_test)

0.7308

In [62]:
clf_random.score(X_test, y_test)

0.71456

In [63]:
clf_linear.score(X_test, y_test)

0.73192

In [64]:
clf_multi.score(X_test, y_test)

0.73096

## Confusion matrix

Generally, models are better predicting positive text.

In [65]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf_random.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['negative', 'positive'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names).div(matrix.sum(axis=1), axis=0)
# the .div above turns the results into percentage

Unnamed: 0,Predicted negative,Predicted positive
Is negative,0.68788,0.31212
Is positive,0.258726,0.741274


In [66]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf_multi.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['negative', 'positive'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names).div(matrix.sum(axis=1), axis=0)
# the .div above turns the results into percentage

Unnamed: 0,Predicted negative,Predicted positive
Is negative,0.68772,0.31228
Is positive,0.225744,0.774256


# Explaining our model

In [77]:
# pip install eli5
import eli5


Contribution?,Feature
0.589,Highlighted in text (sum)
-0.099,<BIAS>


In [69]:
eli5.show_weights(clf_linear, top=(10, 10), feature_names=vectorizer.get_feature_names())

Weight?,Feature
+1.642,breve
+1.522,valeu
+1.519,feliz
+1.448,hehe
+1.329,parabéns
+1.273,olá
+1.243,confira
+1.217,co
+1.200,frases
+1.196,obrigado


In [78]:
df[df['tweet_text'].str.contains('catra')].tail(5)

Unnamed: 0,tweet_text,sentiment
161668,"@xlsrgc eu qnd ainda estudava la deixei de esperar na parada do 1 e outras menos movimentadas :( so pego na reitoria e olhe la se n tiver gnt :( eh horrivel andar mais, esperar mais, mas se depend...",Negativo
374390,Não vamos mais ter catra presidente :(((,Negativo
488199,@alcatraz_ Tá foda. Muito medo. :(,Negativo
208843,"A fila anda, a catraca gira, sentiu saudade? Pode furar a fila pq tu é top :)",Positivo
603014,"@Lailoca @zZzcarol quando é evento cultural sempre vejo no catraca livre, mas tem um app (faz um tempinho q n uso) bem legal, se chama meet up :)",Positivo


In [71]:
# eli5.explain_prediction(clf_linear, "O Catra está muito feliz", vec=vectorizer)
eli5.show_prediction(clf_linear, "O Catra está muito feliz", vec=vectorizer)

Contribution?,Feature
0.589,Highlighted in text (sum)
-0.099,<BIAS>


In [72]:
eli5.show_prediction(clf_linear, "estou muito feliz, parabéns! que saudade", vec=vectorizer)

Contribution?,Feature
0.755,Highlighted in text (sum)
0.099,<BIAS>


In [73]:
eli5.show_prediction(clf_linear, "estou muito feliz, parabéns!", vec=vectorizer)

Contribution?,Feature
1.995,Highlighted in text (sum)
0.099,<BIAS>


In [74]:
eli5.show_weights(clf_log, top=(5,5), feature_names=vectorizer.get_feature_names())

Weight?,Feature
+22.509,confira
+18.850,co
+13.384,foo
+6.579,breve
+5.311,hehe
… 557 more positive …,… 557 more positive …
… 447 more negative …,… 447 more negative …
-6.319,queria
-6.927,dói
-7.149,triste


In [46]:
# Doesn't work
# eli5.show_weights(clf_multi, top=(5,5), feature_names=vectorizer.get_feature_names())
# eli5.show_weights(clf_random, top=(5,5), feature_names=vectorizer.get_feature_names())

## Using n-grams

In [92]:
vectorizer = TfidfVectorizer(stop_words=stopwords_pt,min_df=50,max_df=0.9,ngram_range=(2,3))

vector_matrix = vectorizer.fit_transform(df.tweet_text)

words = pd.DataFrame(vector_matrix.toarray(), columns=vectorizer.get_feature_names())

words

Unnamed: 0,acho vou,agora sim,ai gente,ainda bem,alguma coisa,alguém agora,alguém pra,amanhã vou,amo demais,amo https,amo https co,amo tanto,amor https,amor https co,amor vida,ano passado,aqui https,aqui https co,aqui pra,bem https,bem https co,bem triste,boa noite,boa sorte,boa tarde,bom dia,cada vez,catra morreu,chama dm,coisa linda,demais https,demais https co,dessa vez,deve ser,dia https,dia https co,dia pra,dia todo,dor cabeça,dá pra,falta alguém,falta alguém agora,fico feliz,fico triste,fiquei triste,gostei vídeo,gostei vídeo youtube,hoje dia,https co,ir pra,mim https,mim https co,mt triste,nao sei,obrigada https,obrigada https co,pode ser,pq vc,pra casa,pra dar,pra fazer,pra ficar,pra gente,pra ir,pra mim,pra ser,pra ter,pra vc,pra ver,primeira vez,qualquer coisa,queria alguém,queria estar,queria ir,queria poder,queria saber,queria ser,queria tanto,queria ter,queria ver,quero ir,quero ver,sente falta,sente falta alguém,sim https,sim https co,so queria,tanto https,tanto https co,ter ido,to bem,to mt,to triste,to tão,todo dia,todo mundo,todos dias,triste https,triste https co,triste pq,tudo bem,tudo bom,tudo certo,tudo pra,tá tudo,tão bom,tão lindo,tão triste,tô triste,tô tão,vai dar,vai fazer,vai ficar,vai ser,vai ter,vc eh,vc vai,vou conseguir,vou dormir,vou fazer,vou ficar,vou poder,vou tentar,vou ter,vou ver,vídeo youtube,vídeo youtube https,youtube https,youtube https co
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
X_train, X_test, y_train, y_test = train_test_split(words, df.sentiment)

clf_linear = LinearSVC()
clf_linear.fit(X_train, y_train)

clf_linear.score(X_test, y_test)

0.56136

In [95]:
eli5.show_weights(clf_linear, top=(10, 10), feature_names=vectorizer.get_feature_names())

Weight?,Feature
+1.187,aqui https co
+0.978,boa tarde
+0.866,bom dia
+0.744,tudo bom
+0.690,boa sorte
+0.679,fico feliz
+0.656,boa noite
+0.555,chama dm
+0.528,youtube https
+0.528,youtube https co
