# Sentiment Analysis Model

## Training the Data

In [None]:
import pandas as pd
import numpy as np

#NLTK
import nltk
from nltk import word_tokenize, WordPunctTokenizer, regexp_tokenize
from nltk.corpus import stopwords

#Keras
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

#Sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

#Plotting 
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

## TASS Workshop

Source:

TASS-2017: Workshop on Semantic Analysis at SEPLN (http://www.sepln.org/workshops/tass/2017/)

The TASS Workshop on Semantic Analysis has been held since 2012, under the umbrella of the International Conference of the Spanish Society for Natural Language Processing (SEPLN). TASS was the first shared task on sentiment analysis in Twitter in Spanish. The initial aim of TASS was the furtherance of research on sentiment analysis in Spanish, with a special interest in the language used on Twitter.

In [None]:
# Parse the tagged TASS training corpus and keep a handle on its XML root element.
import xml.etree.ElementTree as ET
tree = ET.parse('Resources/general-train-tagged-3l.xml')
root = tree.getroot()  # iterated child by child when the training frame is built

In [None]:
# Build the training frame: one row (text, polarity label) per Spanish tweet.
# Rows are gathered in a plain list and converted to a DataFrame once; growing
# a frame with ``.loc[row] = ...`` re-allocates it on every insert and writes
# strings into columns that were created empty (float64).
rows = [
    [
        tweet.find('content').text,
        tweet.find('sentiments').find('polarity').find('value').text,
    ]
    for tweet in root
    if tweet.find('lang').text == 'es'  # keep Spanish tweets only
]
train_data = pd.DataFrame(rows, columns=['tweetText', 'polarity_value'])

train_data

In [None]:
# Index the frame by polarity label so rows can be dropped by label afterwards.
train_data = train_data.set_index("polarity_value")
train_data

In [None]:
# Discard every tweet whose polarity label is "NONE" (the frame is indexed by label here).
train_data = train_data.drop(index="NONE")
train_data

In [None]:
# Move polarity_value back out of the index into an ordinary column.
train_data = train_data.reset_index()
train_data

In [None]:
# Recode the polarity labels as numbers: neutral -> 0, positive -> 1, negative -> -1.
# The replacements are applied one after another, in this order.
for label, score in (('NEU', 0), ('P', 1), ('N', -1)):
    train_data['polarity_value'] = train_data['polarity_value'].replace([label], score)
train_data

In [None]:
# Put the text column first. An explicit copy is taken because a new column is
# assigned to this frame later on; assigning into a bare column subset makes
# pandas emit SettingWithCopyWarning.
train_data = train_data[['tweetText','polarity_value']].copy()
train_data

In [None]:
# Class balance: number of tweets per polarity value.
train_data.polarity_value.value_counts()

In [None]:
# Start processed_tweet as the raw tweet text; it is overwritten once the
# cleaning pipeline (processAll) is applied further down.
train_data['processed_tweet'] = train_data.tweetText

In [None]:
train_data

## AMLO Tweets

Tweets from the 5 days before and after the video scandal involving Pio Lopez Obrador.

In [None]:
# Load the exported AMLO tweets from CSV.
amlo_tweets = pd.read_csv('Resources/Tweets.csv')
amlo_tweets

In [None]:
# Rename the text column to tweetText, the name used in the training frame.
amlo_tweets=amlo_tweets.rename(columns={'Content':'tweetText'})
amlo_tweets

In [None]:
# Keep only the tweet text and its date. An explicit copy is taken because new
# columns are assigned to this frame afterwards; assigning into a bare column
# subset makes pandas emit SettingWithCopyWarning.
amlo_tweets = amlo_tweets[["tweetText","Date"]].copy()
amlo_tweets

In [None]:
# Start processed_tweet as the raw tweet text; it is overwritten once the
# cleaning pipeline (processAll) is applied further down.
amlo_tweets['processed_tweet'] = amlo_tweets.tweetText
amlo_tweets

## Data Processing

### Hashtags

In [None]:
import re
# A hashtag: '#' followed by word characters; group 1 is the tag without '#'.
hash_regex = re.compile(r"#(\w+)")
hashtags = []  # every hashtag marker produced so far, in encounter order


def hash_repl(match):
    """Return the ``__HASH__<TAG>`` marker for a hashtag match and record it.

    The tag text (group 1) is upper-cased. As a side effect the marker is
    appended to the module-level ``hashtags`` list.
    """
    tag = match.group(1).upper()
    marker = '__HASH__' + tag
    hashtags.append(marker)
    return marker

### URL

In [None]:
# A URL: scheme, '://', then a run of characters that commonly occur in URLs.
# Besides letters, digits, '.' and '/', the class includes '-', '_', '~', '?',
# '#', '&', '=', '%', '+', ':' and '@', so links with hyphens, underscores or
# query strings are replaced in full instead of leaving a trailing fragment.
# Group 1 still captures the scheme.
url_regex = re.compile(r"(http|https|ftp)://[A-Za-z0-9\-._~/?#&=%+:@]+")


def url_repl(match):
    """Return the fixed ``__URL__`` marker, whatever URL was matched."""
    return '__URL__'

### Repetitions

In [None]:
# Any character followed by one or more repeats of itself, ignoring case.
rpt_regex = re.compile(r"(.)\1{1,}", re.IGNORECASE)


def rpt_repl(match):
    """Collapse a run of a repeated character to two copies of its first character."""
    first = match.group(1)
    return first * 2

In [None]:
# Quick manual check of the repetition rule on a sample sentence.
rpt_regex.sub(rpt_repl, "Reppppppeated characters in wordsssss")

### Usernames

In [None]:
# A user mention: '@' followed by word characters; group 1 is the name without '@'.
user_regex = re.compile(r"@(\w+)")
usr_names = []  # user-name markers seen so far, so they can be excluded from parts of the analysis


def user_repl(match):
    """Return the ``__USER__<NAME>`` marker for a mention match and record it.

    The user name (group 1) is upper-cased. As a side effect the marker is
    appended to the module-level ``usr_names`` list.
    """
    name = match.group(1).upper()
    marker = '__USER__' + name
    usr_names.append(marker)
    return marker

### Punctuation

In [None]:
# Splitting by word boundaries: one or more consecutive non-word characters.
word_bound_regex = re.compile(r"\W+")

# Punctuation that is kept as a marker token: (marker, spellings of the mark).
punctuations = [
    ('__PEXCL__', ['!', '¡']),
    ('__PQUES__', ['?', '¿']),
    ('__PPROG__', ['...', '…']),
]


# For punctuation replacement
def punctuations_repl(match):
    """Replace a run of non-word characters with punctuation markers.

    The matched run is scanned for each entry of ``punctuations``. Every
    marker whose mark occurs in the run is emitted exactly once, in table
    order, even when several spellings of the same mark are present
    (e.g. both '!' and '¡').

    Returns the markers joined by single spaces and wrapped in one space on
    each side, or a single space when the run contains none of the marks.
    """
    text = match.group(0)
    markers = [
        key
        for key, variants in punctuations
        if any(variant in text for variant in variants)
    ]
    if markers:
        return ' ' + ' '.join(markers) + ' '
    return ' '

### Stopwords

In [None]:
#nltk.download('stopwords')
#stop_words=stopwords.words('spanish')

#def stopwords(text):
#    text = [w for w in text if not w in stop_words]
    
#    return(text)

In [None]:
#stop_words

### Further Cleaning & Stemming

In [None]:
from nltk.stem import SnowballStemmer
# Spanish Snowball stemmer used by the stemming step (sb_stem).
# NOTE(review): ignore_stopwords=True presumably needs the NLTK 'stopwords' corpus to be
# installed locally; the nltk.download('stopwords') call in this file is commented out — confirm.
stemmer = SnowballStemmer('spanish', ignore_stopwords = True)

In [None]:
def clean(text):
    """Return *text* with every word that starts with ``__`` removed.

    A "word" is a maximal run of word characters (``\\w+``), so the marker
    tokens produced by the earlier replacement steps (``__HASH__...``,
    ``__USER__...``, ``__URL__``, ``__PEXCL__`` ...) are deleted. The
    whitespace around a deleted word is left untouched.
    """
    def _drop_marker(match):
        word = match.group()
        return '' if word.startswith('__') else word

    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\w+', _drop_marker, text)

In [None]:
def processAll(text):
    """Run the whole tweet-normalisation pipeline on one tweet and return the result.

    Steps, in order: hashtags, user mentions and URLs are swapped for marker
    tokens; apostrophes are deleted; runs of non-word characters are replaced
    through ``punctuations_repl``; repeated characters are collapsed; finally
    ``clean`` is applied to the text.
    """
    marker_steps = (
        (hash_regex, hash_repl),
        (user_regex, user_repl),
        (url_regex, url_repl),
    )
    for pattern, replacement in marker_steps:
        text = pattern.sub(replacement, text)

    text = text.replace('\'', '')

    text = word_bound_regex.sub(punctuations_repl, text)
    text = rpt_regex.sub(rpt_repl, text)

    return clean(text)

In [None]:
# Run the cleaning pipeline over the raw text of both data sets.
train_data['processed_tweet'] = train_data.tweetText.apply(processAll)
amlo_tweets['processed_tweet'] = amlo_tweets.tweetText.apply(processAll)

In [None]:
train_data

In [None]:
# Two-letter words that must survive the minimum-length filter (like 'no').
_SHORT_WORDS_KEPT = frozenset(['no', 'si', 'sí', 'ni'])


def sb_stem(text):
    """Split *text* on whitespace, drop very short words and stem the rest.

    * Tokens starting with ``__`` are kept verbatim (neither lower-cased nor
      stemmed) as long as they are at least three characters long.
    * Every other word is lower-cased and kept when it has three or more
      characters or is one of ``no``, ``si``, ``sí``, ``ni``. The whitelist is
      checked against the lower-cased word, so "No" or "NO" is kept as well.
      Kept words are stemmed with the module-level ``stemmer``.

    Returns the resulting tokens as a list of strings.
    """
    tokens = []
    for word in text.split():
        if word.startswith('__'):
            if len(word) >= 3:
                tokens.append(word)
            continue
        lowered = word.lower()
        if len(word) >= 3 or lowered in _SHORT_WORDS_KEPT:
            tokens.append(stemmer.stem(lowered))
    return tokens

In [None]:
# Tokenise and stem the cleaned text of both data sets (one token list per tweet).
train_data['stemmed_tweet'] = train_data.processed_tweet.apply(sb_stem)
amlo_tweets['stemmed_tweet'] = amlo_tweets.processed_tweet.apply(sb_stem)

In [None]:
train_data

In [None]:
amlo_tweets

### N-Grams

In [None]:
from nltk import ngrams

def bigramize(tweets, n=2):
    """Return the consecutive n-grams (bigrams by default) of a token sequence.

    ``tweets`` is the token sequence of a single tweet. Each n-gram is a tuple
    of ``n`` neighbouring tokens, in order of appearance and listed once; a
    sequence shorter than ``n`` yields an empty list. No padding is added.
    """
    tokens = list(tweets)
    # Zipping the sequence with its own shifted copies gives sliding windows of width n.
    return list(zip(*(tokens[offset:] for offset in range(n))))

def trigramize(tweets, n=3):
    """Return the consecutive n-grams (trigrams by default) of a token sequence.

    ``tweets`` is the token sequence of a single tweet. Each n-gram is a tuple
    of ``n`` neighbouring tokens, in order of appearance and listed once; a
    sequence shorter than ``n`` yields an empty list. No padding is added.
    """
    tokens = list(tweets)
    # Zipping the sequence with its own shifted copies gives sliding windows of width n.
    return list(zip(*(tokens[offset:] for offset in range(n))))

In [None]:
# Derive bigram and trigram columns from each training tweet's stemmed token list.
train_data['bigrams'] = train_data.stemmed_tweet.apply(bigramize)
train_data['trigrams'] = train_data.stemmed_tweet.apply(trigramize)

In [None]:
train_data

In [None]:
# Derive bigram and trigram columns from each AMLO tweet's stemmed token list.
amlo_tweets['bigrams'] = amlo_tweets.stemmed_tweet.apply(bigramize)
amlo_tweets['trigrams'] = amlo_tweets.stemmed_tweet.apply(trigramize)

In [None]:
amlo_tweets

### Vectorization

In [None]:
# Keep the label plus the token-based columns; the raw and cleaned text are left out.
train_data2 = train_data[['polarity_value','stemmed_tweet','bigrams','trigrams']]
train_data2

In [None]:
# Binary bag-of-words encoding of the training tweets: one column per distinct
# stemmed token, created (filled with 0) the first time the token is seen and
# set to 1 on every row whose tweet contains it.
train_data3 = train_data2.copy()
for idx, tokens in train_data2.stemmed_tweet.items():
    for token in tokens:
        if token not in train_data3.columns:
            train_data3[token] = 0
        train_data3.loc[idx, token] = 1
train_data3

In [None]:
amlo_tweets

In [None]:
# Keep the date plus the token-based columns; the raw and cleaned text are left out.
amlo_tweets2 = amlo_tweets[['Date','stemmed_tweet','bigrams','trigrams']]

In [None]:
# Binary bag-of-words encoding of the AMLO tweets: one column per distinct
# stemmed token, created (filled with 0) the first time the token is seen and
# set to 1 on every row whose tweet contains it.
amlo_tweets3 = amlo_tweets2.copy()
for idx, tokens in amlo_tweets2.stemmed_tweet.items():
    for token in tokens:
        if token not in amlo_tweets3.columns:
            amlo_tweets3[token] = 0
        amlo_tweets3.loc[idx, token] = 1
amlo_tweets3

# Classification with Naive Bayes

In [None]:
# Feature matrix: the per-token indicator columns only (label and token-list columns removed).
X = train_data3.drop(['polarity_value','stemmed_tweet','bigrams','trigrams'], axis=1)
X

In [None]:
# Target: the polarity label, selected with a list so it stays a single-column DataFrame.
y = train_data3[['polarity_value']]
y

In [None]:
# Per-token indicator columns of the AMLO tweets (date and token-list columns removed).
# NOTE(review): X_test is rebound by the train_test_split call further down, so this
# frame is not the one the classifier is later evaluated on — confirm that is intended.
X_test = amlo_tweets3.drop(['Date','stemmed_tweet','bigrams','trigrams'], axis=1)
X_test

In [None]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split the labelled TASS data into training and test sets; random_state is fixed
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42) # 70% training and 30% test

In [None]:
# Create a Gaussian Naive Bayes classifier
nb = GaussianNB()

# Train the model on the training split. The target is flattened to 1-D first:
# y is selected as a single-column DataFrame, and passing a column vector to
# fit() makes scikit-learn emit a DataConversionWarning.
nb.fit(X_train, np.ravel(y_train))

In [None]:
# Predict the polarity label of every row in the test split.
y_pred = nb.predict(X_test)
y_pred

In [None]:
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model accuracy: the fraction of test tweets whose predicted label equals the true label.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the test split, built from the true labels (first argument)
# and the predicted labels (second argument).
cm = confusion_matrix(y_test, y_pred)  

print(cm)  

In [None]:
# Plot the confusion matrix of the fitted classifier on the test split.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement. The old helper is
# still used as a fallback on installations that predate from_estimator.
from sklearn.metrics import ConfusionMatrixDisplay

if hasattr(ConfusionMatrixDisplay, 'from_estimator'):
    ConfusionMatrixDisplay.from_estimator(nb, X_test, y_test)
else:
    from sklearn.metrics import plot_confusion_matrix
    plot_confusion_matrix(nb, X_test, y_test)
plt.show()

In [None]:
X_test

In [None]:
# Short aliases for the test features and the predicted labels, used by the plot below.
# NOTE(review): this rebinds `y`, which earlier held the polarity-label frame.
x, y = X_test, y_pred

In [None]:
# Line plot of the predicted labels (y) against the test feature matrix (x).
# NOTE(review): x has one column per token, so this presumably draws one line per
# column — confirm this is the intended visualisation.
plt.plot(x, y, linewidth=2.0)

## Frequently used words

In [None]:
import string
punctuation = list(string.punctuation)
# Words left out of the cloud: Spanish stop words, punctuation and a few Twitter/topic terms.
stop = stopwords.words('spanish') + punctuation + ['rt', 'via'] + ['lopezobrador'] 

# Join the text of every tweet into one string. str() of a Series returns its
# printed representation (index labels, "..." truncation for long Series and a
# Name/dtype footer), not the tweet contents.
corpus = ' '.join(amlo_tweets['processed_tweet'].dropna().astype(str))

wordcloud = WordCloud(
                          background_color='white',
                          stopwords=set(stop),
                          max_words=200,
                          max_font_size=50, 
                          random_state=42
                         ).generate(corpus)
# Set the default figure size before the figure is created; changing it after
# imshow does not resize the figure that is already open.
plt.rcParams["figure.figsize"] = (15,15)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()