### Import data

In [1]:
import pandas as pd

tweets = pd.read_csv('data/tweets.csv', encoding="UTF-8")

  interactivity=interactivity, compiler=compiler, result=result)


### Remove undesirable values


In [2]:
import math

tweets = tweets[tweets.userOrientation.isin(["target", "left", "right"])]

In [3]:
target_tweets = tweets[tweets.userOrientation == "target"]
tweets = tweets[tweets.userOrientation != "target"]

### Text Preprocessing

In [4]:
text = tweets.text.str.lower()

In [5]:
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()

lists_tokens = text.apply(tokenizer.tokenize)

In [6]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

lemmatized_lists = list()
for list_tokens in lists_tokens:
    words_lemma = [lemmatizer.lemmatize(token) for token in list_tokens]
    lemmatized_lists.append(words_lemma)
    
lemmatized_series = pd.Series(lemmatized_lists)

In [7]:
from nltk import ngrams

def get_ngrams(ngram_size, to_extend):
    
    ngrams_list = to_extend.apply(ngrams, args=(ngram_size,))
    new_n_list = []
    for grams in ngrams_list:
        tuple_gram = [gram for gram in grams]
        string_tuple = ['_'.join(n_tuple) for n_tuple in tuple_gram]
        new_n_list.append(string_tuple)
    return new_n_list

bigrams = get_ngrams(2, lemmatized_series)
trigrams = get_ngrams(3, lemmatized_series)

In [13]:
def list_extend(lst, item):
    lst.extend(item)
    return lst

lemmatized_series = [list_extend(lemma, bigram) for lemma, bigram in zip(lemmatized_series, bigrams)]
lemmatized_series = [list_extend(lemma, trigram) for lemma, trigram in zip(lemmatized_series, trigrams)]

In [14]:
def get_phrases_from_list(list_of_lists):
    phrase_list = [' '.join(list_of_words) for list_of_words in list_of_lists]
    return phrase_list

rebuid_tweets = get_phrases_from_list(lemmatized_series)

In [15]:
def get_vocabulary(list_of_lists):
    flat_list = [word for list_of_words in list_of_lists for word in list_of_words]
    flat_list = set(flat_list)
    return flat_list

vocab = get_vocabulary(lemmatized_series)

### BoW 

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

countvec = CountVectorizer(vocabulary=vocab)
bow_tweets = countvec.fit_transform(rebuid_tweets)

### TF-IDF

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

tdidfvec = TfidfVectorizer(vocabulary=vocab)
tfidf_tweets = tdidfvec.fit_transform(rebuid_tweets)

## BOW Model

### Train Model

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(bow_tweets, tweets.userOrientation, test_size=0.25, random_state=42)

In [18]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.tree import DecisionTreeClassifier

scoring = ['precision_macro', 'recall_macro', 'f1_macro', 'accuracy']
#clf = DecisionTreeClassifier(random_state=0)
#scores = cross_validate(clf, X_train, y_train, scoring=scoring, cv=5, return_train_score=False)

In [23]:
#scores

{'fit_time': array([ 733.52290297,  697.06924701,  692.05028582,  707.82661796,
         797.0581131 ]),
 'score_time': array([ 0.30513191,  0.29063916,  0.28825808,  0.29442906,  0.31918406]),
 'test_accuracy': array([ 0.73768473,  0.73406259,  0.73217908,  0.72652854,  0.73387915]),
 'test_f1_macro': array([ 0.73698303,  0.73310464,  0.73151407,  0.72549015,  0.73314689]),
 'test_precision_macro': array([ 0.73765291,  0.73433597,  0.73206616,  0.72681893,  0.73385197]),
 'test_recall_macro': array([ 0.73678045,  0.73287383,  0.73133239,  0.72527702,  0.73294563])}

In [26]:
#clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

#### Model persistance

In [19]:
from joblib import dump, load

#dump(clf, 'text_orientation_classifier.joblib') 
clf = load('text_orientation_classifier.joblib')

### Test model

In [20]:
prediction = clf.predict(X_test)

In [21]:
from sklearn.metrics import classification_report

names = ['left', 'right']

print(classification_report(y_test, prediction, names))


             precision    recall  f1-score   support

       left       0.73      0.77      0.75     11936
      right       0.74      0.69      0.71     11070

avg / total       0.73      0.73      0.73     23006



### Get most important features

In [22]:
feature_importance = pd.DataFrame({'features' : list(vocab), 'importances' : clf.feature_importances_})

In [23]:
feature_importance[feature_importance.importances != 0.0].sort_values(by=['importances'], ascending=False)


Unnamed: 0,features,importances
584981,massa_eram_muito,2.288186e-02
1180255,desigualdade_no_campo,2.117141e-02
1692626,https://t.co/fwpvkfwr8x,1.720845e-02
1060880,de_deputada_q,1.619341e-02
432634,falhas_graf,1.079291e-02
1423923,que_a_explicação,9.872618e-03
1183943,como_terroristas_e,9.786409e-03
512564,já_tinha_avisado,9.686603e-03
713265,corinthian_e_flamengo,9.096106e-03
1950459,são_opções,8.533979e-03


## TF-IDF Model

### Train Model

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf_tweets, tweets.userOrientation, test_size=0.25, random_state=40)

In [26]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.tree import DecisionTreeClassifier

scoring = ['precision_macro', 'recall_macro', 'f1_macro', 'accuracy']
clf_tfidf = DecisionTreeClassifier(random_state=0)
#scores = cross_validate(clf, X_train, y_train, scoring=scoring, cv=5, return_train_score=False)

In [None]:
#scores

In [27]:
clf_tfidf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

### Test model

In [28]:
prediction_tfidf = clf_tfidf.predict(X_test)

In [29]:
from sklearn.metrics import classification_report

names = ['left', 'right']

print(classification_report(y_test, prediction_tfidf, names))


             precision    recall  f1-score   support

       left       0.71      0.75      0.73     11843
      right       0.72      0.68      0.70     11163

avg / total       0.72      0.72      0.72     23006



### Get most important features

In [30]:
feature_importance_tfidf = pd.DataFrame({'features' : list(vocab), 'importances' : clf_tfidf.feature_importances_})

In [31]:
feature_importance_tfidf[feature_importance_tfidf.importances != 0.0].sort_values(by=['importances'], ascending=False)

Unnamed: 0,features,importances
584981,massa_eram_muito,3.013738e-02
1692626,https://t.co/fwpvkfwr8x,2.567748e-02
1180255,desigualdade_no_campo,2.173929e-02
1060880,de_deputada_q,1.696789e-02
516207,autora_:_https://t.co/oeyahv9yky,1.196153e-02
432634,falhas_graf,1.067703e-02
1423923,que_a_explicação,1.040226e-02
1183943,como_terroristas_e,1.037389e-02
512564,já_tinha_avisado,9.765929e-03
687076,válidos_._a,9.637421e-03


### Paper implementation - What Drives Media Slant?

In [56]:
phrases_orientation = pd.DataFrame({'features' : lemmatized_series, 'orientation' : tweets.userOrientation})

In [81]:
def reshape_dataframe(row):
    return pd.DataFrame({'word':row['features'], 'orientation':row['orientation']})

list_of_words_orientation = []
for index, row in phrases_orientation.iterrows():
    new_rows = reshape_dataframe(row)
    list_of_words_orientation.append(new_rows)

In [117]:
words_orientation = pd.concat(list_of_words_orientation)

#### Phrase lenght

In [118]:
phrases_length = [string.count('_')+1 for string in words_orientation.word]

In [119]:
words_orientation['length'] = phrases_length

#### Pearson Statistic for every phrase

#### Legend:

p = phrase

l = length of phrase p

o = orientation

e = tweeted by a left-wing supporter

d = tweeted by a right-wing supporter

Ex:
Fple = frequency of a l-length phrase wrote by an profile biased towards the left-wing

fnple = frequency of l-length phrases except of p, wrote by an profile biased towards the left-wing


In [146]:
frequency_plo = words_orientation.groupby(words_orientation.columns.tolist(), as_index=False).size()
frequency_plo = frequency_plo.to_frame("frequency").reset_index()

In [149]:
frequency_ple = frequency_plo[frequency_plo.orientation == "left"]
frequency_pld = frequency_plo[frequency_plo.orientation == "right"]

In [152]:
ple = frequency_ple.word[0]

In [153]:
ple

u'!'