In [1]:
import pandas as pd

# The files are coded in ISO-8859-1

df = pd.read_csv("data/Colera-No.csv")
df[0:5]

Unnamed: 0,Tweet,Info
0,@AS_Manolete Y con el atleti podemos soñar otr...,Colera-Asco
1,La 'rajada' de un ex objetivo del Barça sobre ...,Colera-Asco
2,@marcmarquez93 @3gerardpique @SergiRoberto10 @...,Colera-Asco
3,@LuisOmarTapia @IkerCasillas @ChampionsLeague ...,Colera-Asco
4,"La ""rajada"" de un ex objetivo del Barça sobre ...",Colera-Asco


In [2]:
df.shape

(256, 2)

In [3]:
# Define X and Y
X = df['Tweet'].values.astype(str)
y = df['Info'].values.astype(str)

In [4]:
# Transformer 1


# Sample of statistics using nltk
# Another option is defining a function and pass it as a parameter to FunctionTransformer

from sklearn.base import BaseEstimator, TransformerMixin
from nltk.tokenize import sent_tokenize, word_tokenize

class LexicalStats (BaseEstimator, TransformerMixin):
    """Extract lexical features from each document"""
    
    def number_sentences(self, doc):
        sentences = sent_tokenize(doc, language='english')
        return len(sentences)

    def fit(self, x, y=None):
        return self

    def transform(self, docs):
        return [{'length': len(doc),
                 'num_sentences': self.number_sentences(doc)}
                for doc in docs]

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
import string


def custom_tokenizer(words):
    spanish_stopwords = stopwords.words('spanish')
    
    """Preprocessing tokens as seen in the lexical notebook"""
    tokens = word_tokenize(words.lower())
    porter = PorterStemmer()
    lemmas = [porter.stem(t) for t in tokens]
    stoplist = spanish_stopwords
    lemmas_clean = [w for w in lemmas if w not in stoplist]
    punctuation = set(string.punctuation)
    lemmas_punct = [w for w in lemmas_clean if  w not in punctuation]

    
    return lemmas_punct


In [6]:
from simM import SimMatrix
from gensim.models import KeyedVectors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn import feature_selection
from sklearn.base import BaseEstimator, TransformerMixin
from gsitk.preprocess import normalize
import numpy as np
import pandas as pd

class Transformador (BaseEstimator, TransformerMixin):
    def fit (self, x, y=None):
        return self
    def transform(self, X):
        array = []
        for i in range(0,len(X)):
            array.append(normalize.preprocess(X[i]))
        return np.array(array)

In [7]:
embeddings = KeyedVectors.load_word2vec_format('./SBW-vectors-300-min5.txt')

In [8]:
lexicon = pd.read_csv('ElhPolar_esV1.lex.txt', sep='\t', header=None, names=['word', 'sentiment'])
positive = lexicon[lexicon['sentiment']=='positive']['word'].values
negative = lexicon[lexicon['sentiment']=='negative']['word'].values
lexicon_words = [positive,negative]

In [9]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer



ngrams_featurizer = Pipeline([
  ('count_vectorizer',  CountVectorizer(analyzer="word", max_df=0.5, ngram_range=[1,2])),
  ('tfidf_transformer', TfidfTransformer())
])


from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation




from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier #For Classification
from sklearn.neighbors import KNeighborsClassifier

## All the steps of the Pipeline should end with a sparse vector as the input data

pipeline = Pipeline([
       ('features', FeatureUnion([
                    ('words', TfidfVectorizer(tokenizer=custom_tokenizer)),

                   ('ngrams', ngrams_featurizer),
                   ('lexical_stats', Pipeline([
                                        ('stats', LexicalStats()),
                                        ('vectors', DictVectorizer())
                                    ])),
                   ('lda', Pipeline([ 
                            ('count', CountVectorizer(tokenizer=custom_tokenizer)),
                            ('lda',  LatentDirichletAllocation(n_topics=4, max_iter=5,
                                                   learning_method='online', 
                                                   learning_offset=50.,
                                                   random_state=0))
                        ])),
                    ('emb', Pipeline([
                            ('preprocess', Transformador()),
                            ('simM', SimMatrix(lexicon_words, embedding=embeddings,
                                               remove_stopwords=False, pooling=np.max,
                                               weighting=False, n_lexicon_words=100,
                                               lex_values=None)),
                            ('scale', MinMaxScaler(feature_range=(0,2))),
                            ('percent', feature_selection.SelectPercentile(feature_selection.f_classif, percentile=25)),
                        ]))

              ])),
       

                #('clf', MultinomialNB(alpha=.01))  # classifier
            ('clf', SVC(C=10, gamma= 1, kernel='linear', probability=True))
        #('clf', AdaBoostClassifier(n_estimators=50, base_estimator=MultinomialNB(alpha=.01), learning_rate=1))
        #('modelknn', KNeighborsClassifier(n_neighbors = 13))
    ])
#SVC, KNeighborsClassifier, AdaBoostClassifier, MultinomialNB
# Using KFold validation

cv = KFold(X.shape[0], 4, shuffle=True, random_state=33)
scores = cross_val_score(pipeline, X, y, cv=cv)
print("Scores in every iteration", scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))



Scores in every iteration [ 0.75      0.65625   0.6875    0.734375]
Accuracy: 0.71 (+/- 0.07)
