## Testar não remover hashtags, remover stopwords

## Usar o vocabulário que foi formado para calcular Pearson como entrada para calcular o BoW e o TF-IDF e fazer classificação de texto.

### Import data

In [None]:
import pandas as pd

# Load the tweet dataset; later cells use its `text` and `userOrientation` columns.
tweets = pd.read_csv('data/tweets.csv', encoding="UTF-8")

### Remove undesirable values


In [None]:
import math

# Keep only rows whose userOrientation is one of the three known labels.
tweets = tweets[tweets.userOrientation.isin(["target", "left", "right"])]

In [None]:
# Set the "target" profiles aside; every other row stays in `tweets`.
is_target = tweets.userOrientation == "target"
target_tweets = tweets[is_target]
tweets = tweets[~is_target]

### Text Preprocessing

In [None]:
# lower case
# (the result is a separate Series; the `tweets` frame itself is not modified)
text = tweets.text.str.lower()

In [None]:
# tokenize words
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()

# One list of tokens per tweet, in the same row order as `text`.
lists_tokens = text.apply(tokenizer.tokenize)

In [6]:
# remove punctuation
# str.isalpha() keeps purely alphabetic tokens only, so tokens that contain
# digits or symbols such as '#' or '@' are dropped along with the punctuation.
new_lists_tokens = [
    [token for token in tweet_tokens if token.isalpha()]
    for tweet_tokens in lists_tokens
]

In [7]:
# lemmatize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# NOTE(review): WordNet is an English lexicon, while the sample output further
# down shows Portuguese tokens -- confirm this step has the intended effect.
lemmatized_lists = [
    [lemmatizer.lemmatize(token) for token in tweet_tokens]
    for tweet_tokens in new_lists_tokens
]

# Wrapped in a Series because get_ngrams() calls Series.apply on its input.
lemmatized_series = pd.Series(lemmatized_lists)

In [8]:
# 2gram and 3gram calculation
from nltk import ngrams

def get_ngrams(ngram_size, to_extend):
    """Build the n-grams of every token list, each joined with underscores.

    Args:
        ngram_size: number of consecutive tokens per n-gram.
        to_extend: Series holding one list of tokens per document.

    Returns:
        A list with one entry per document; each entry is the list of that
        document's n-grams written as 'tok1_tok2_...' strings.
    """
    joined_per_document = []
    for document_tokens in to_extend:
        document_grams = ngrams(document_tokens, ngram_size)
        joined_per_document.append(['_'.join(gram) for gram in document_grams])
    return joined_per_document

# Per-tweet bigram and trigram strings, both built from the unigram lists.
bigrams = get_ngrams(2, lemmatized_series)
trigrams = get_ngrams(3, lemmatized_series)

In [9]:
# add the 2-grams and 3-grams to each tweet's token list
def list_extend(lst, item):
    """Append every element of `item` to `lst` in place and return `lst`."""
    lst += item
    return lst

# Each tweet's list becomes: unigrams, then its bigrams, then its trigrams.
# The inner lists are extended in place, so re-running this cell appends the
# n-grams a second time.
lemmatized_series = [
    list_extend(list_extend(lemma, bigram), trigram)
    for lemma, bigram, trigram in zip(lemmatized_series, bigrams, trigrams)
]

In [10]:
def get_phrases_from_list(list_of_lists):
    """Join each inner list of words into one space-separated string.

    Returns a list with one string per inner list, in the same order.
    """
    return list(map(' '.join, list_of_lists))

# One space-separated string per tweet, used further down as the input
# documents of the BoW and TF-IDF vectorizers.
rebuid_tweets = get_phrases_from_list(lemmatized_series)

In [11]:
def get_vocabulary(list_of_lists):
    """Return the set of distinct words found across all inner lists."""
    return {word for words in list_of_lists for word in words}

# Every distinct unigram, bigram and trigram in the corpus. This is a set, so
# it carries no defined ordering.
vocab = get_vocabulary(lemmatized_series)

### Paper implementation - What Drives Media Slant?

In [12]:
# Pair each tweet's feature list with its author's orientation.
# NOTE(review): `lemmatized_series` is a plain list at this point, so this relies
# on it having the same length and row order as `tweets.userOrientation` -- confirm.
phrases_orientation = pd.DataFrame({'features' : lemmatized_series, 'orientation' : tweets.userOrientation})

In [13]:
def reshape_dataframe(row):
    """Expand one tweet row into a frame with one line per feature.

    `row` must provide 'features' (the tweet's list of phrases) and
    'orientation' (a single label); the label is repeated for every phrase.
    """
    columns = {'word': row['features'], 'orientation': row['orientation']}
    return pd.DataFrame(columns)

# One small (word, orientation) frame per tweet.
list_of_words_orientation = [
    reshape_dataframe(tweet_row)
    for _, tweet_row in phrases_orientation.iterrows()
]

In [14]:
# Stack the per-tweet frames into one long table: one row per phrase occurrence.
words_orientation = pd.concat(list_of_words_orientation)

#### Phrase length

In [15]:
# n-grams were joined with '_', so the number of words is the underscore count plus one.
phrases_length = [1 + phrase.count('_') for phrase in words_orientation['word']]

In [16]:
# Assigned from a plain list, i.e. row by row in the table's current order.
words_orientation['length'] = phrases_length

#### Pearson Statistic for every phrase

#### Legend:

p = phrase

l = length of phrase p

o = orientation

e = tweeted by a left-wing supporter

d = tweeted by a right-wing supporter

Ex:
fple = frequency of the l-length phrase p in tweets written by a profile biased towards the left wing

fnple = total frequency of all l-length phrases other than p in tweets written by a profile biased towards the left wing


In [17]:
# Count how often each (word, orientation, length) combination occurs.
# Grouping with the default as_index=True makes size() return a Series on every
# pandas version. With as_index=False, newer releases return a DataFrame, which
# has no to_frame(), so the previous two-step conversion fails there.
frequency_plo = (
    words_orientation
    .groupby(words_orientation.columns.tolist())
    .size()
    .reset_index(name="frequency")
)

#### Discard phrases

In [42]:
def discard_phrases_by_length_freq(table, length, min_freq, max_freq):
    """Drop phrases of one length whose frequency lies outside a range.

    Args:
        table: DataFrame with at least the columns 'length' and 'frequency'.
        length: phrase length (number of words) the thresholds apply to.
        min_freq: rows of that length with frequency below this are removed.
        max_freq: rows of that length with frequency above this are removed.

    Returns:
        A new DataFrame without the discarded rows; rows of any other length
        are kept untouched and `table` itself is not modified.
    """
    is_target_length = table["length"] == length
    out_of_range = (table["frequency"] < min_freq) | (table["frequency"] > max_freq)
    # Filter with a boolean mask instead of drop(index labels): dropping by
    # label would also remove unrelated rows that happen to share an index
    # label with a discarded row.
    return table.loc[~(is_target_length & out_of_range)].copy()

In [44]:
# (phrase length, lowest kept frequency, highest kept frequency) per n-gram size
for phrase_length, lowest, highest in ((1, 100, 5000), (2, 50, 500), (3, 10, 100)):
    frequency_plo = discard_phrases_by_length_freq(frequency_plo, phrase_length, lowest, highest)

##### Pearson Statistic

In [93]:
def pearson_statistic(target_word, table=None):
    """Compute the Pearson statistic of a phrase from left/right frequencies.

    Using the notation of the legend above (d = right, e = left):
        X = (pld*not_ple - ple*not_pld)**2
            / ((pld+ple) * (pld+not_pld) * (ple+not_ple) * (not_ple+not_pld))
    where pld/ple are the frequencies of the phrase and not_pld/not_ple are
    the summed frequencies of all other phrases of the same length.

    Args:
        target_word: the phrase to score.
        table: frame with columns 'word', 'orientation', 'length' and
            'frequency'. Defaults to the module-level `frequency_plo`.

    Returns:
        The statistic as a float; "-" when the phrase is not in the table;
        NaN when the denominator is zero (statistic undefined).
    """
    if table is None:
        table = frequency_plo

    # rows of phrase p, for left and right tweets
    target_rows = table[table.word == target_word]
    if target_rows.empty:
        return "-"

    # length of phrase p
    length = target_rows.length.iloc[0]

    # every other phrase of the same length
    other_rows = table[(table.word != target_word) & (table.length == length)]

    target_freq = target_rows.groupby("orientation")["frequency"].sum()
    other_freq = other_rows.groupby("orientation")["frequency"].sum()

    # .get(..., 0) covers an orientation with no rows at all, which previously
    # required DataFrame.append (removed in pandas 2) or failed on float(empty).
    pld = float(target_freq.get("right", 0))
    ple = float(target_freq.get("left", 0))
    not_pld = float(other_freq.get("right", 0))
    not_ple = float(other_freq.get("left", 0))

    denominator = (pld + ple) * (pld + not_pld) * (ple + not_ple) * (not_ple + not_pld)
    if denominator == 0:
        # A whole row or column of the 2x2 table is empty: undefined statistic.
        return float("nan")
    return (pld * not_ple - ple * not_pld) ** 2 / denominator

In [94]:
# Pearson statistic for the phrase in each row of frequency_plo (one call per row).
X = frequency_plo.word.apply(pearson_statistic)

In [99]:
# Put each phrase and its orientation next to the statistic computed for it.
word_pearson = pd.DataFrame({'word':frequency_plo.word, 'orientation':frequency_plo.orientation, 'X':X})

In [100]:
# Display only: highest statistic first (the sorted frame is not stored).
word_pearson.sort_values(['X'], ascending=False)

Unnamed: 0,X,orientation,word
1429071,7.085834e-03,right,por
1602889,7.054320e-03,right,uma
1524580,6.916672e-03,right,se
1549098,2.028756e-03,right,sobre_a
1103705,1.979482e-03,right,do_que
1121484,1.934319e-03,right,e_não
54811,1.795365e-03,left,ao_vivo
1018971,1.675731e-03,right,contra_o
1352263,1.667524e-03,right,não_tem
1062636,1.622387e-03,right,de_uma


### BoW 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the fixed vocabulary `vocab`.
# NOTE(review): the vectorizer's default tokenizer settings are used -- verify
# that they keep every term in `vocab` (e.g. one-letter words).
countvec = CountVectorizer(vocabulary=vocab)
bow_tweets = countvec.fit_transform(rebuid_tweets)

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over the same fixed vocabulary `vocab`.
# NOTE(review): the vectorizer is fitted on all tweets before the train/test
# split below, so the test rows contribute to the fitted weights -- confirm
# this is acceptable for the evaluation.
tdidfvec = TfidfVectorizer(vocabulary=vocab)
tfidf_tweets = tdidfvec.fit_transform(rebuid_tweets)

## BOW Model

### Train Model

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the BoW rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(bow_tweets, tweets.userOrientation, test_size=0.25, random_state=42)

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.tree import DecisionTreeClassifier

# Metrics for the cross-validation below, which is currently commented out.
scoring = ['precision_macro', 'recall_macro', 'f1_macro', 'accuracy']
#clf = DecisionTreeClassifier(random_state=0)
#scores = cross_validate(clf, X_train, y_train, scoring=scoring, cv=5, return_train_score=False)

In [None]:
#scores

In [None]:
#clf.fit(X_train, y_train)

#### Model persistence

In [None]:
from joblib import dump, load

# The classifier is read back from disk instead of being retrained (the
# training and dump calls are commented out).
# SECURITY NOTE: joblib.load unpickles the file and can therefore execute
# arbitrary code -- only load model files from a trusted source.
# NOTE(review): presumably this model was trained on the same BoW columns as
# `bow_tweets` -- confirm before trusting the predictions.
#dump(clf, 'text_orientation_classifier.joblib') 
clf = load('text_orientation_classifier.joblib')

### Test model

In [None]:
# Predicted orientation for each held-out BoW row.
prediction = clf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

names = ['left', 'right']

# Only y_true and y_pred may be passed positionally in current scikit-learn
# releases. The third positional slot used to be `labels`, so it is passed by
# keyword here to produce the same report on old and new versions.
print(classification_report(y_test, prediction, labels=names))


### Get most important features

In [None]:
# Column i of the BoW matrix is the term that `countvec.vocabulary_` maps to i.
# `vocab` is a set (which the vectorizer sorts), so list(vocab) does not follow
# the column order and would pair importances with the wrong terms.
# NOTE(review): assumes `clf` was trained on these same columns -- confirm.
bow_terms = sorted(countvec.vocabulary_, key=countvec.vocabulary_.get)
feature_importance = pd.DataFrame({'features' : bow_terms, 'importances' : clf.feature_importances_})

In [None]:
# Display only: features with non-zero importance, most important first.
feature_importance[feature_importance.importances != 0.0].sort_values(by=['importances'], ascending=False)


## TF-IDF Model

### Train Model

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the TF-IDF rows for testing. This overwrites the BoW split variables.
# NOTE(review): random_state is 40 here but 42 for the BoW split, so the two
# models are evaluated on different test rows -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(tfidf_tweets, tweets.userOrientation, test_size=0.25, random_state=40)

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.tree import DecisionTreeClassifier

# Metrics for the cross-validation below, which is currently commented out.
scoring = ['precision_macro', 'recall_macro', 'f1_macro', 'accuracy']
# Decision tree for the TF-IDF features; the seed is fixed for repeatability.
clf_tfidf = DecisionTreeClassifier(random_state=0)
# NOTE(review): the disabled call below still refers to `clf`, not `clf_tfidf`.
#scores = cross_validate(clf, X_train, y_train, scoring=scoring, cv=5, return_train_score=False)

In [None]:
#scores

In [None]:
# Train the TF-IDF decision tree on the training part of the split.
clf_tfidf.fit(X_train, y_train)

### Test model

In [None]:
# Predicted orientation for each held-out TF-IDF row.
prediction_tfidf = clf_tfidf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

names = ['left', 'right']

# Only y_true and y_pred may be passed positionally in current scikit-learn
# releases. The third positional slot used to be `labels`, so it is passed by
# keyword here to produce the same report on old and new versions.
print(classification_report(y_test, prediction_tfidf, labels=names))


### Get most important features

In [None]:
# Column i of the TF-IDF matrix is the term that `tdidfvec.vocabulary_` maps to i.
# `vocab` is a set (which the vectorizer sorts), so list(vocab) does not follow
# the column order and would pair importances with the wrong terms.
tfidf_terms = sorted(tdidfvec.vocabulary_, key=tdidfvec.vocabulary_.get)
feature_importance_tfidf = pd.DataFrame({'features' : tfidf_terms, 'importances' : clf_tfidf.feature_importances_})

In [None]:
# Display only: features with non-zero importance, most important first.
feature_importance_tfidf[feature_importance_tfidf.importances != 0.0].sort_values(by=['importances'], ascending=False)