In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import cross_val_predict, train_test_split, GridSearchCV
from sklearn import metrics
#nltk.download('stopwords')

base_treino = pd.read_csv('PlattsOil.csv')
base_treino['tamanho'] = base_treino['tweet'].apply(lambda x: len(x))
group = base_treino.groupby('Classe').mean()

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(17,4))
axes[0].set_title("Histograma do tamanho tweet")
base_treino["tamanho"].hist(ax=axes[0],bins=8)
axes[1].set_title("Tamanho de tweet por classe")
group["tamanho"].plot(kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x208495845c8>

In [2]:
def Clean_and_tokenizing(instance):
    words = instance.lower().split()
    stops = set(stopwords.words('english'))
    mean = [w for w in words if not w in stops]
    return(" ".join(mean))

def clean_caract(instance):
    txt = re.sub(r'^rt @[a-zA-Z:]*',"", instance)
    return txt

def clean_regex(instance):
    txt = re.sub(r'^@[a-zA-Z:]+',"",instance)
    txt_ = re.sub(r'@[a-zA-Z:]+', "", txt)
    return txt_

def Metrics(modelo, tweets, classes):
    resultados = cross_val_predict(modelo, tweets, classes, cv=10)
    return 'Acurácia do modelo: {}'.format(metrics.accuracy_score(classes,resultados))

def Limpeza_dados(instancia):
    # remove links, pontos, virgulas,ponto e virgulas dos tweets
    instancia = re.sub(r"http\S+", "", instancia).lower().replace('.','').replace(';','').replace('-','').replace(':','').replace(')','').replace('|','')
    return (instancia)

def clean_hashtag(instancia):
    instancia = re.sub(r'#[a-zA-z]+',"",instancia)
    return instancia

def letras_apenas(instance):
    letras = re.sub("[^a-zA-Z]"," ", instance) 
    return letras

def clean_ponto_virgula(instance):
    instancia = re.sub(r'[a-zA-Z]+:', "",instance)
    return instancia
    

In [22]:
classe = base_treino['Classe']
tweets = base_treino['tweet']

tweets = [Clean_and_tokenizing(i) for i in tweets]
tweets = [clean_ponto_virgula(i) for i in tweets]
tweets = [Limpeza_dados(i) for i in tweets]
tweets = [clean_caract(i) for i in tweets]
tweets = [clean_regex(i) for i in tweets]
tweets = [clean_hashtag(i) for i in tweets]
tweets = [letras_apenas(i) for i in tweets]
tweets

['   americas   oct     demand growth questions take bite crude falls    ecuador leave opec    economic headwinds could slow oil  gas  s ampp ratings ',
 'economic headwinds could slow growth north american oil  gas  s ampp ratings       trade war  middle east tensions leading uncertainty   upstream industry seen risk full ',
 ' brazil s  pushes ahead onshore field sales       company puts eight blocks purchased ',
 ' west coast becomes top us market  crude overall imports shrink      gulf coast reliance saudi imports falls amid shale boom   saudi oil remains good fit west coast local output falls  ',
 'demand growth questions take bite crude falls   ecuador leave opec january     manufacturing index falls      september full ',
 ' leave   energy ministry      ecuador pledges maintain ties opec members   move caught delegates surprise   ecuador output         b d aug         b d quota     ',
 'refinery margin  asian refiners bear brunt saudi crude outage     ',
 '   emea   oct      s i

# Bag of words

In [4]:
tweet_tokenizer = TweetTokenizer()

Unigrama

In [5]:
vectorizer = CountVectorizer(analyzer = "word", tokenizer=tweet_tokenizer.tokenize)
freq_tweets = vectorizer.fit_transform(tweets)

Bigrama

In [6]:
count_vec_ngram = CountVectorizer(ngram_range=(2, 2), tokenizer=tweet_tokenizer.tokenize)
X_counts_bigram = count_vec_ngram.fit_transform(tweets)

# TF


In [7]:
tf = TfidfVectorizer(use_idf= False, norm ="l1")
tweets_tf = tf.fit_transform(tweets)

# TF_IDF

In [8]:
tfid = TfidfVectorizer(use_idf= True, norm ="l1")
tweets_tfidf = tfid.fit_transform(tweets)

# Algoritmos de classificação


In [9]:
X_train_bofa, X_test_bofa, y_train_bofa, y_test_bofa = train_test_split(freq_tweets,classe, test_size=0.2)
X_train_bofabi, X_test_bofabi, y_train_bofabi, y_test_bofabi = train_test_split(X_counts_bigram,classe, test_size=0.2)
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(tweets_tf,classe, test_size=0.2)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(tweets_tfidf,classe, test_size=0.2)

Naive Bayes

In [10]:
# Parte de treino
modelo_bofa = MultinomialNB().fit(X_train_bofa, y_train_bofa)
modelo_bofabi = MultinomialNB().fit(X_train_bofabi,y_train_bofabi)
modelo_tf = MultinomialNB().fit(X_train_tf,y_train_tf)
modelo_tfidf = MultinomialNB().fit(X_train_tfidf,y_train_tfidf)

In [11]:
# Parte de teste
bofa_pred= modelo_bofa.predict(X_test_bofa)
bofabi_pred = modelo_bofabi.predict(X_test_bofabi)
tf_predict = modelo_tf.predict(X_test_tf)
tfidf_predict = modelo_tfidf.predict(X_test_tfidf)

Suport Vector Machines

In [12]:
param_grid = [
    {'C': [1,10,100,1000]}
]
svc = LinearSVC()
svm_bofa = GridSearchCV(svc,param_grid).fit(X_train_bofa, y_train_bofa)
svm_bofabi = GridSearchCV(svc,param_grid).fit(X_train_bofabi, y_train_bofabi)
svm_tf = GridSearchCV(svc,param_grid).fit(X_train_tf, y_train_tf)
svm_tfidf = GridSearchCV(svc, param_grid).fit(X_train_tfidf, y_train_tfidf)

svm_pred_bofa = svm_bofa.predict(X_test_bofa)
svm_pred_bofabi = svm_bofabi.predict(X_test_bofabi)
svm_pred_tf = svm_tf.predict(X_test_tf)
svm_pred_tfidf = svm_tfidf.predict(X_test_tfidf)



# Avaliação dos algoritmos

In [13]:
acc_bofa = metrics.accuracy_score(y_test_bofa, bofa_pred)
acc_bofabi = metrics.accuracy_score(y_test_bofabi,bofabi_pred)
acc_tf = metrics.accuracy_score(y_test_tf, tf_predict)
acc_tfidf = metrics.accuracy_score(y_test_tfidf,tfidf_predict)

print('Naive Bayes')
print("Acuracia de Bag of Words = {}".format(acc_bofa))
print("Acuracia de Bag of Words Bigram = {}".format(acc_bofabi))
print("Acuracia de freq = {}".format(acc_tf))
print("Acuracia de tfidf = {}".format(acc_tfidf))
print("-----------------------------------------------------")

acc_svm_bofa = metrics.accuracy_score(y_test_bofa, svm_pred_bofa)
acc_svm_bofabi = metrics.accuracy_score(y_test_bofabi, svm_pred_bofabi)
acc_svm_tf = metrics.accuracy_score(y_test_tf, svm_pred_tf)
acc_svm_tfidf = metrics.accuracy_score(y_test_tfidf, svm_pred_tfidf)

print('SVM')
print("Acuracia de Bag of Words = {}".format(acc_svm_bofa))
print("Acuracia de Bag of Words Bigram = {}".format(acc_svm_bofabi))
print("Acuracia de freq = {}".format(acc_svm_tf))
print("Acuracia de tfidf = {}".format(acc_svm_tfidf))

Naive Bayes
Acuracia de Bag of Words = 0.6246418338108882
Acuracia de Bag of Words Bigram = 0.42406876790830944
Acuracia de freq = 0.6590257879656161
Acuracia de tfidf = 0.664756446991404
-----------------------------------------------------
SVM
Acuracia de Bag of Words = 0.6160458452722063
Acuracia de Bag of Words Bigram = 0.7163323782234957
Acuracia de freq = 0.6590257879656161
Acuracia de tfidf = 0.670487106017192


- Aplicando Stemmer

In [14]:
def stem(text):
    words = text.split()                             
    snowball = SnowballStemmer("english")
    stem_words = []
    for w in words:
        stem_words.append(snowball.stem(w))
    
    return( " ".join(stem_words))

tweets_stem = [stem(i) for i in tweets]

# Bag of Words

In [15]:
stem_vectorizer = CountVectorizer(analyzer = "word", tokenizer=tweet_tokenizer.tokenize)
freq_tweets_stems = stem_vectorizer.fit_transform(tweets_stem)

In [16]:
stem_vectorizerbi = CountVectorizer(ngram_range=(2, 2), tokenizer=tweet_tokenizer.tokenize)
freq_tweets_bi = stem_vectorizerbi.fit_transform(tweets_stem)

# TF

In [17]:
tf_stem = TfidfVectorizer(use_idf= False, norm ="l1")
tfstem = tf_stem.fit_transform(tweets_stem)

# TF_IDF

In [18]:
tfidfs = TfidfVectorizer(use_idf= True, norm ="l1")
tweet_tfidfs = tfidfs.fit_transform(tweets_stem)

In [19]:
X_train_bofas, X_test_bofas, y_train_bofas, y_test_bofas = train_test_split(freq_tweets_stems,classe, test_size=0.2)
X_train_bofabis, X_test_bofabis, y_train_bofabis, y_test_bofabis = train_test_split(freq_tweets_bi,classe, test_size=0.2)
X_train_tfs, X_test_tfs, y_train_tfs, y_test_tfs = train_test_split(tfstem,classe, test_size=0.2)
X_train_tfidfs, X_test_tfidfs, y_train_tfidfs, y_test_tfidfs = train_test_split(tweet_tfidfs,classe, test_size=0.2)

In [20]:
modelo_bofas = MultinomialNB().fit(X_train_bofas, y_train_bofas)
modelo_bofabis = MultinomialNB().fit(X_train_bofabis,y_train_bofabis)
modelo_tfs = MultinomialNB().fit(X_train_tfs, y_train_tfs)
modelo_tfidfs = MultinomialNB().fit(X_train_tfidfs, y_train_tfidfs)

predict_bofas = modelo_bofas.predict(X_test_bofas)
predict_bofabis = modelo_bofabis.predict(X_test_bofabis)
predict_tfs = modelo_tfs.predict(X_test_tfs)
predict_tfidfs = modelo_tfidfs.predict(X_test_tfidfs)

#SVM

svm_bofas = GridSearchCV(svc,param_grid).fit(X_train_bofas, y_train_bofas)
svm_bofabis = GridSearchCV(svc,param_grid).fit(X_train_bofabis, y_train_bofabis)
svm_tfs = GridSearchCV(svc,param_grid).fit(X_train_tfs, y_train_tfs)
svm_tfidfs = GridSearchCV(svc, param_grid).fit(X_train_tfidfs, y_train_tfidfs)

svm_pred_bofas = svm_bofas.predict(X_test_bofas)
svm_pred_bofabis = svm_bofabis.predict(X_test_bofabis)
svm_pred_tfs = svm_tfs.predict(X_test_tfs)
svm_pred_tfidfs = svm_tfidfs.predict(X_test_tfidfs)




In [21]:
acc_counts = metrics.accuracy_score(y_test_bofas,predict_bofas)
acc_bofabis = metrics.accuracy_score(y_test_bofabis, predict_bofabis)
acc_tfis = metrics.accuracy_score(y_test_tfs,predict_tfs)
acc_tfidfs = metrics.accuracy_score(y_test_tfidfs,predict_tfidfs)

acc_svm_bofas = metrics.accuracy_score(y_test_bofas,svm_pred_bofas)
acc_svm_bofabis = metrics.accuracy_score(y_test_bofabis,svm_pred_bofabis)
acc_svm_tfis = metrics.accuracy_score(y_test_tfs,svm_pred_tfs)
acc_svm_tfidfs = metrics.accuracy_score(y_test_tfidfs,svm_pred_tfidfs)


print('Naive Bayes')
print("Acuracia de Bag of Words = {}".format(acc_counts))
print("Acuracia de Bag of Words Bigram = {}".format(acc_bofabis))
print("Acuracia de tf = {}".format(acc_tfis))
print("Acuracia de tfidf = {}".format(acc_tfidfs))
print("-----------------------------------------------------")

print('SVM')
print("Acuracia de Bag of Words = {}".format(acc_svm_bofas))
print("Acuracia de Bag of Words Bigram = {}".format(acc_svm_bofabis))
print("Acuracia de tf = {}".format(acc_svm_tfis))
print("Acuracia de tfidf = {}".format(acc_svm_tfidfs))
print("-----------------------------------------------------")



Naive Bayes
Acuracia de Bag of Words = 0.6790830945558739
Acuracia de Bag of Words Bigram = 0.39255014326647564
Acuracia de tf = 0.6561604584527221
Acuracia de tfidf = 0.673352435530086
-----------------------------------------------------
SVM
Acuracia de Bag of Words = 0.6074498567335244
Acuracia de Bag of Words Bigram = 0.6103151862464183
Acuracia de tf = 0.6590257879656161
Acuracia de tfidf = 0.673352435530086
-----------------------------------------------------
