### Import data

In [1]:
import pandas as pd

tweets = pd.read_csv('data/tweets.csv', encoding="UTF-8")

  interactivity=interactivity, compiler=compiler, result=result)


### Remove undesirable values


In [2]:
import math

tweets = tweets[tweets.userOrientation.isin(["target", "left", "right"])]

In [3]:
target_tweets = tweets[tweets.userOrientation == "target"]
tweets = tweets[tweets.userOrientation != "target"]

### Text Preprocessing

In [4]:
text = tweets.text.str.lower()

In [5]:
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()

lists_tokens = text.apply(tokenizer.tokenize)

In [6]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

lemmatized_lists = list()
for list_tokens in lists_tokens:
    words_lemma = [lemmatizer.lemmatize(token) for token in list_tokens]
    lemmatized_lists.append(words_lemma)
    
lemmatized_series = pd.Series(lemmatized_lists)

In [7]:
from nltk import ngrams

def get_ngrams(ngram_size, to_extend):
    
    ngrams_list = to_extend.apply(ngrams, args=(ngram_size,))
    new_n_list = []
    for grams in ngrams_list:
        tuple_gram = [gram for gram in grams]
        string_tuple = ['_'.join(n_tuple) for n_tuple in tuple_gram]
        new_n_list.append(string_tuple)
    return new_n_list

bigrams = get_ngrams(2, lemmatized_series)
trigrams = get_ngrams(3, lemmatized_series)

In [8]:
def list_extend(lst, item):
    lst.extend(item)
    return lst

lemmatized_series = [list_extend(lemma, bigram) for lemma, bigram in zip(lemmatized_series, bigrams)]
lemmatized_series = [list_extend(lemma, trigram) for lemma, trigram in zip(lemmatized_series, trigrams)]

In [9]:
def get_phrases_from_list(list_of_lists):
    phrase_list = [' '.join(list_of_words) for list_of_words in list_of_lists]
    return phrase_list

rebuid_tweets = get_phrases_from_list(lemmatized_series)

In [10]:
def get_vocabulary(list_of_lists):
    flat_list = [word for list_of_words in list_of_lists for word in list_of_words]
    flat_list = set(flat_list)
    return flat_list

vocab = get_vocabulary(lemmatized_series)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

countvec = CountVectorizer(vocabulary=vocab)
bow_tweets = countvec.fit_transform(rebuid_tweets)

### Train Model

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(bow_tweets, tweets.userOrientation, test_size=0.25, random_state=42)

In [22]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.tree import DecisionTreeClassifier

scoring = ['precision_macro', 'recall_macro', 'f1_macro', 'accuracy']
clf = DecisionTreeClassifier(random_state=0)
scores = cross_validate(clf, X_train, y_train, scoring=scoring, cv=5, return_train_score=False)

In [23]:
scores

{'fit_time': array([ 733.52290297,  697.06924701,  692.05028582,  707.82661796,
         797.0581131 ]),
 'score_time': array([ 0.30513191,  0.29063916,  0.28825808,  0.29442906,  0.31918406]),
 'test_accuracy': array([ 0.73768473,  0.73406259,  0.73217908,  0.72652854,  0.73387915]),
 'test_f1_macro': array([ 0.73698303,  0.73310464,  0.73151407,  0.72549015,  0.73314689]),
 'test_precision_macro': array([ 0.73765291,  0.73433597,  0.73206616,  0.72681893,  0.73385197]),
 'test_recall_macro': array([ 0.73678045,  0.73287383,  0.73133239,  0.72527702,  0.73294563])}

In [26]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

#### Model persistance

In [27]:
from joblib import dump, load

dump(clf, 'text_orientation_classifier.joblib') 

['text_orientation_classifier.joblib']

### Test model

In [28]:
prediction = clf.predict(X_test)

In [32]:
from sklearn.metrics import classification_report

names = ['left', 'right']

print(classification_report(y_test, prediction, names))


             precision    recall  f1-score   support

       left       0.74      0.77      0.75     11936
      right       0.74      0.71      0.72     11070

avg / total       0.74      0.74      0.74     23006



### Get most important features

In [64]:
feature_importance = pd.DataFrame({'features' : list(vocab), 'importances' : clf.feature_importances_})

In [65]:
feature_importance[feature_importance.importances != 0.0].sort_values(by=['importances'], ascending=False)


Unnamed: 0,features,importances
584981,massa_eram_muito,2.288186e-02
1180255,desigualdade_no_campo,2.117141e-02
1692626,https://t.co/fwpvkfwr8x,1.720845e-02
1060880,de_deputada_q,1.619341e-02
432634,falhas_graf,1.079291e-02
1423923,que_a_explica√ß√£o,9.872618e-03
1183943,como_terroristas_e,9.786409e-03
512564,j√°_tinha_avisado,9.686603e-03
713265,corinthian_e_flamengo,9.096106e-03
1950459,s√£o_op√ß√µes,8.533979e-03
