## Tweet classification

File names

In [1]:
base_location = './'

# To read
tweets_raw_file = base_location + 'footballsentiment_task.csv'
tweets_run_file = base_location + 'footballsentiment_task_run.csv'
corpus_tweets_2012_xml = base_location + 'general-train-tagged-3l.xml'
corpus_tweets_2017_xml = base_location + 'intertass-train-tagged.xml'

# To generate
corpus_tweets_2012_csv = base_location + 'general-train-tagged-3l.csv'
corpus_tweets_2017_csv = base_location + 'intertass-train-tagged.csv'
corpus_tweets_csv = base_location + 'corpus_tweets.csv'

Import libraries

In [2]:
import pandas as pd
import numpy as np

### Load datasets

In [3]:
tweets_raw = pd.read_csv(tweets_raw_file, encoding='utf-8')
tweets_run = pd.read_csv(tweets_run_file, encoding='utf-8')

print('Total tweets: %d' % len(tweets_raw))
print('Evaluated tweets so far: %d' % len(tweets_run))

Total tweets: 411
Evaluated tweets so far: 177


### Create train and test data

Aux function to convert evaluations to numeric values, accordnig to the rule:
- `Negativo`: -1.
- `Positivo`: 1.
- `Neutro`: 0.
- Anything else: 2.

In [4]:
def convert_to_numeric(evaluation):
    if evaluation == 'Positivo':
        return 1
    elif evaluation == 'Neutro':
        return 0
    elif evaluation == 'Negativo':
        return -1
    else:
        return 2    

Build new array of dictionaries with keys `id` (the task ID), `tweet` (the tweet string) and `score` (the tweet evaluation) by joining data from both CSV files.

In [10]:
# Build dictionary of tweets where key is the task__id
tweets_obj = {}
for index, row in tweets_raw.iterrows():
    tweets_obj[row.task__id] = row.taskinfo__Tweet

# Build dictinary of tweet scores where key is the task_id
scores_obj = {}
for index, row in tweets_run.iterrows():
    scores_obj[row.task_run__task_id] = row.task_run__info

# Create final wteets dictionary
own_tweets = []
for i, key in enumerate(scores_obj):
    own_tweets.append({
        'id': key,
        'tweet': tweets_obj[key], 
        'score': convert_to_numeric(scores_obj[key])
    })

print('Total different tweets evalauted so far: %d' % len(own_tweets))

Total different tweets evalauted so far: 588


### POS Tagging

Import libraries to read XML:

In [11]:
from lxml import objectify

Import/read most recent corpus (2017):

In [13]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2017_xml))
root = xml.getroot()
general_tweets_corpus_train_2017 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiment.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2017 = general_tweets_corpus_train_2017.append(row_s)
    
general_tweets_corpus_train_2017.to_csv(corpus_tweets_2017_csv, index=False, encoding='utf-8')

Import/read biggest corpus (2012), to concatenate it with the previous noe:

In [14]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2012_xml))
root = xml.getroot()
general_tweets_corpus_train_2012 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiments.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2012 = general_tweets_corpus_train_2012.append(row_s)
    
general_tweets_corpus_train_2012.to_csv(corpus_tweets_2012_csv, index=False, encoding='utf-8')

Concatenate general corpus dataset with 2017 one, to have a better result:

In [45]:
tweets_corpus = pd.concat([
        general_tweets_corpus_train_2012,
        general_tweets_corpus_train_2017
    ])
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
472,"Cruz está que de sale RT ""@agusperezblanco: Mi...",P
6175,“@VaquerBoy: @David_Busta Recuerda que no eres...,P
4932,"Tres años de instrucción, uno y medio de suspe...",N
3095,Preguntan a Valcárcel si aún quiere devolver c...,NEU
352,Cameron rechaza reformar el Tratado de Lisboa ...,NEU
5670,"Opel ha sido, es y será empresa estratégica pa...",P
613,"¿""Agresión con cámara fotográfica""? Cuesta muc...",N
6583,El Supremo juzga a Otegui. repasamos los que h...,NEU
6279,Telefónica contrata a marido d la Vicep. del G...,N
2435,Ya está aqui Bern Shcuster!!!;)) http://t.co/5...,P


In [48]:
print('Total corpus tweets: %d' % len(tweets_corpus))

Total corpus tweets: 6605


Remove tweets without polarity (polarity `NONE`):

In [47]:
tweets_corpus = tweets_corpus.query('polarity != "NONE"')

Remove links:

In [28]:
tweets_corpus = tweets_corpus[-tweets_corpus.content.str.contains('^http.*$')]

In [53]:
import re
url_regex = re.compile('https?:\/\/t\.co\/[\w]{8,8}')
tweets_corpus_no_links = tweets_corpus
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(url_regex, '', x))

In [55]:
print('Total corpus tweets after cleaning: %d' % len(tweets_corpus_no_links))

Total corpus tweets after cleaning: 6605


In [56]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
1109,"@PabloAIglesias no se si mañana o pasado,jaja",P
2783,"#Andalucia necesita claridad en sus cuentas, t...",N
2263,El Gobierno recuerda a sindicatos y empresario...,N
6233,Según el relato de EP la única presión que se ...,N
3562,Si Chacón gana es urgente que aprenda la difer...,NEU
1284,Es lo maravilloso de este nuevo medio de expre...,P
330,Resumen de la larga y agria noche en Bruselas:...,N
4658,Bruselas será ligeramente flexible con los obj...,N
6951,Desde que en julio de 1988 empecé a trabajar e...,N
1240,Hoy en Cartaya con las personas mayores. Nos m...,P


### Tokenization and stemming

In [57]:
#download spanish stopwords
import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords
spanish_stopwords = stopwords.words('spanish')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/david.santosg/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [58]:
from string import punctuation
non_words = list(punctuation)

#we add spanish punctuation
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))
non_words

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~',
 '¿',
 '¡',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9']

In [59]:
from sklearn.feature_extraction.text import CountVectorizer       
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = SnowballStemmer('spanish')
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = ''.join([c for c in text if c not in non_words])
    # tokenize
    tokens =  word_tokenize(text)

    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

### Model Evaluation

In [60]:
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline



In [61]:
tweets_corpus_no_links['polarity_bin'] = 0
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['P'])] = 1
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['N'])] = -1
tweets_corpus_no_links.polarity_bin.value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


 1    0.484784
-1    0.393641
 0    0.121575
Name: polarity_bin, dtype: float64

In [64]:
from sklearn.model_selection import GridSearchCV
vectorizer = CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', LinearSVC()),
])



parameters = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'cls__C': (0.2, 0.5, 0.7),
    'cls__loss': ('hinge', 'squared_hinge'),
    'cls__max_iter': (500, 1000)
}


grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1 , scoring='roc_auc')
grid_search.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)

JoblibLookupError: JoblibLookupError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/opt/Conda.io/lib/python3.6/runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.6/site-packages/ipykernel/__main__.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
/opt/Conda.io/lib/python3.6/runpy.py in _run_code(code=<code object <module> at 0x7fb1655f6270, file "/...3.6/site-packages/ipykernel/__main__.py", line 1>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/opt/Conda.io/lib/python3.6/site-packages/ipykernel/__pycache__/__main__.cpython-36.pyc', '__doc__': None, '__file__': '/opt/Conda.io/lib/python3.6/site-packages/ipykernel/__main__.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.6/site-packages/ipykernel/__main__.py'), 'app': <module 'ipykernel.kernelapp' from '/opt/Conda.i.../python3.6/site-packages/ipykernel/kernelapp.py'>}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.6/site-packages/ipykernel/__main__.py'), pkg_name='ipykernel', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x7fb1655f6270, file "/...3.6/site-packages/ipykernel/__main__.py", line 1>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/opt/Conda.io/lib/python3.6/site-packages/ipykernel/__pycache__/__main__.cpython-36.pyc', '__doc__': None, '__file__': '/opt/Conda.io/lib/python3.6/site-packages/ipykernel/__main__.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.6/site-packages/ipykernel/__main__.py'), 'app': <module 'ipykernel.kernelapp' from '/opt/Conda.i.../python3.6/site-packages/ipykernel/kernelapp.py'>}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/ipykernel/__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from ipykernel import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    469             return self.subapp.start()
    470         if self.poller is not None:
    471             self.poller.start()
    472         self.kernel.start()
    473         try:
--> 474             ioloop.IOLoop.instance().start()
    475         except KeyboardInterrupt:
    476             pass
    477 
    478 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    172             )
    173         return loop
    174     
    175     def start(self):
    176         try:
--> 177             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    178         except ZMQError as e:
    179             if e.errno == ETERM:
    180                 # quietly return on ETERM
    181                 pass

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    882                 self._events.update(event_pairs)
    883                 while self._events:
    884                     fd, events = self._events.popitem()
    885                     try:
    886                         fd_obj, handler_func = self._handlers[fd]
--> 887                         handler_func(fd_obj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    888                     except (OSError, IOError) as e:
    889                         if errno_from_exception(e) == errno.EPIPE:
    890                             # Happens when the client closes the connection
    891                             pass

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    271         if self.control_stream:
    272             self.control_stream.on_recv(self.dispatch_control, copy=False)
    273 
    274         def make_dispatcher(stream):
    275             def dispatcher(msg):
--> 276                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    277             return dispatcher
    278 
    279         for s in self.shell_streams:
    280             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2018-05-17T16:49:58.111162', 'msg_id': '813927BE18694FAB9D33E69E1C291C8C', 'msg_type': 'execute_request', 'session': '6FDAF978638049498EA7C7B55ACF455B', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '813927BE18694FAB9D33E69E1C291C8C', 'msg_type': 'execute_request', 'parent_header': {}})
    223             self.log.error("UNKNOWN MESSAGE TYPE: %r", msg_type)
    224         else:
    225             self.log.debug("%s: %s", msg_type, msg)
    226             self.pre_handler_hook()
    227             try:
--> 228                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'6FDAF978638049498EA7C7B55ACF455B']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2018-05-17T16:49:58.111162', 'msg_id': '813927BE18694FAB9D33E69E1C291C8C', 'msg_type': 'execute_request', 'session': '6FDAF978638049498EA7C7B55ACF455B', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '813927BE18694FAB9D33E69E1C291C8C', 'msg_type': 'execute_request', 'parent_header': {}}
    229             except Exception:
    230                 self.log.error("Exception in message handler:", exc_info=True)
    231             finally:
    232                 self.post_handler_hook()

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'6FDAF978638049498EA7C7B55ACF455B'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2018-05-17T16:49:58.111162', 'msg_id': '813927BE18694FAB9D33E69E1C291C8C', 'msg_type': 'execute_request', 'session': '6FDAF978638049498EA7C7B55ACF455B', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '813927BE18694FAB9D33E69E1C291C8C', 'msg_type': 'execute_request', 'parent_header': {}})
    385         if not silent:
    386             self.execution_count += 1
    387             self._publish_execute_input(code, parent, self.execution_count)
    388 
    389         reply_content = self.do_execute(code, silent, store_history,
--> 390                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    391 
    392         # Flush output before sending the reply.
    393         sys.stdout.flush()
    394         sys.stderr.flush()

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code='from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    191 
    192         self._forward_input(allow_stdin)
    193 
    194         reply_content = {}
    195         try:
--> 196             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = 'from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)'
        store_history = True
        silent = False
    197         finally:
    198             self._restore_input()
    199 
    200         if res.error_before_exec is not None:

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=('from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)',), **kwargs={'silent': False, 'store_history': True})
    496             )
    497         self.payload_manager.write_payload(payload)
    498 
    499     def run_cell(self, *args, **kwargs):
    500         self._last_traceback = None
--> 501         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ('from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)',)
        kwargs = {'silent': False, 'store_history': True}
    502 
    503     def _showtraceback(self, etype, evalue, stb):
    504         # try to preserve ordering of tracebacks and print statements
    505         sys.stdout.flush()

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)', store_history=True, silent=False, shell_futures=True)
   2712                 self.displayhook.exec_result = result
   2713 
   2714                 # Execute the user code
   2715                 interactivity = "none" if silent else self.ast_node_interactivity
   2716                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2717                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2718                 
   2719                 self.last_execution_succeeded = not has_raised
   2720 
   2721                 # Reset this so later displayed values do not modify the

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.ImportFrom object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>], cell_name='<ipython-input-64-bbf023576d52>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 7fb115a232e8, executi..._before_exec=None error_in_exec=None result=None>)
   2822                     return True
   2823 
   2824             for i, node in enumerate(to_run_interactive):
   2825                 mod = ast.Interactive([node])
   2826                 code = compiler(mod, cell_name, "single")
-> 2827                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7fb115a92270, file "<ipython-input-64-bbf023576d52>", line 27>
        result = <ExecutionResult object at 7fb115a232e8, executi..._before_exec=None error_in_exec=None result=None>
   2828                     return True
   2829 
   2830             # Flush softspace
   2831             if softspace(sys.stdout, 0):

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7fb115a92270, file "<ipython-input-64-bbf023576d52>", line 27>, result=<ExecutionResult object at 7fb115a232e8, executi..._before_exec=None error_in_exec=None result=None>)
   2876         outflag = 1  # happens in more places, so it's easier as default
   2877         try:
   2878             try:
   2879                 self.hooks.pre_run_code_hook()
   2880                 #rprint('Running code', repr(code_obj)) # dbg
-> 2881                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7fb115a92270, file "<ipython-input-64-bbf023576d52>", line 27>
        self.user_global_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', "base_location = '../../ml-football-tweets/'\n\n# T..._tweets_csv = base_location + 'corpus_tweets.csv'", 'import pandas as pd\nimport numpy as np', "tweets_raw = pd.read_csv(tweets_raw_file, encodi...('Evaluated tweets so far: %d' % len(tweets_run))", "base_location = './ml-football-tweets/'\n\n# To re..._tweets_csv = base_location + 'corpus_tweets.csv'", 'import pandas as pd\nimport numpy as np', "base_location = './'\n\n# To read\ntweets_raw_file ..._tweets_csv = base_location + 'corpus_tweets.csv'", 'import pandas as pd\nimport numpy as np', "tweets_raw = pd.read_csv(tweets_raw_file, encodi...('Evaluated tweets so far: %d' % len(tweets_run))", 'def convert_to_numeric(evaluation):\n    if evalu...\n        return -1\n    else:\n        return 2    ', "# Build dictionary of tweets where key is the ta...t tweets evalauted so far: %d' % len(own_tweets))", 'from lxml import objectify', "# 4 values of sentiment: N, P, NONE, NEU\nxml = o...s_tweets_2017_csv, index=False, encoding='utf-8')", "# 4 values of sentiment: N, P, NONE, NEU\nxml = o...s_tweets_2017_csv, index=False, encoding='utf-8')", "# 4 values of sentiment: N, P, NONE, NEU\nxml = o...s_tweets_2012_csv, index=False, encoding='utf-8')", 'tweets_corpus = pd.concat([\n        general_twee...corpus_train_2017\n    ])\ntweets_corpus.sample(10)', 'tweets_corpus = pd.concat([\n        general_twee...corpus_train_2017\n    ])\ntweets_corpus.sample(10)', "print('Total corpus tweets: %d' % len(tweets_corpus))", 'tweets_corpus = tweets_corpus.query(\'polarity != "NONE"\')', "tweets_corpus = tweets_corpus[-tweets_corpus.content.str.contains('^http.*$')]", ...], 'LinearSVC': <class 'sklearn.svm.classes.LinearSVC'>, 'Out': {15:                                                 ...o que estoy viendo a los Reyes... Per...      NEU, 16:                                                 ...y me tiran a la basura" http://t.co/j...        N, 21:                                                 ...n irresponsble         P

[1680 rows x 2 columns], 22:                                                 ...trle derecho  ...        N

[94 rows x 2 columns], 24:                                               co...              #progrmscmbidos cc d juste        N, 25:                                                 ...n 5 minutos de retrso... Feliz Cumple...      NEU, 26:                                                 ...o no dije que jessic se provechr de m...        N, 34:                                                 ...o no dije que jessic se provechr de m...        N, 37:                                                 ...o no dije que jessic se provechr de m...        N, 39:                                                 ...o no dije que jessic se provechr de m...        N, ...}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'SnowballStemmer': <class 'nltk.stem.snowball.SnowballStemmer'>, '_':  1    0.484784
-1    0.393641
 0    0.121575
Name: polarity_bin, dtype: float64, '_15':                                                 ...o que estoy viendo a los Reyes... Per...      NEU, '_16':                                                 ...y me tiran a la basura" http://t.co/j...        N, ...}
        self.user_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', "base_location = '../../ml-football-tweets/'\n\n# T..._tweets_csv = base_location + 'corpus_tweets.csv'", 'import pandas as pd\nimport numpy as np', "tweets_raw = pd.read_csv(tweets_raw_file, encodi...('Evaluated tweets so far: %d' % len(tweets_run))", "base_location = './ml-football-tweets/'\n\n# To re..._tweets_csv = base_location + 'corpus_tweets.csv'", 'import pandas as pd\nimport numpy as np', "base_location = './'\n\n# To read\ntweets_raw_file ..._tweets_csv = base_location + 'corpus_tweets.csv'", 'import pandas as pd\nimport numpy as np', "tweets_raw = pd.read_csv(tweets_raw_file, encodi...('Evaluated tweets so far: %d' % len(tweets_run))", 'def convert_to_numeric(evaluation):\n    if evalu...\n        return -1\n    else:\n        return 2    ', "# Build dictionary of tweets where key is the ta...t tweets evalauted so far: %d' % len(own_tweets))", 'from lxml import objectify', "# 4 values of sentiment: N, P, NONE, NEU\nxml = o...s_tweets_2017_csv, index=False, encoding='utf-8')", "# 4 values of sentiment: N, P, NONE, NEU\nxml = o...s_tweets_2017_csv, index=False, encoding='utf-8')", "# 4 values of sentiment: N, P, NONE, NEU\nxml = o...s_tweets_2012_csv, index=False, encoding='utf-8')", 'tweets_corpus = pd.concat([\n        general_twee...corpus_train_2017\n    ])\ntweets_corpus.sample(10)', 'tweets_corpus = pd.concat([\n        general_twee...corpus_train_2017\n    ])\ntweets_corpus.sample(10)', "print('Total corpus tweets: %d' % len(tweets_corpus))", 'tweets_corpus = tweets_corpus.query(\'polarity != "NONE"\')', "tweets_corpus = tweets_corpus[-tweets_corpus.content.str.contains('^http.*$')]", ...], 'LinearSVC': <class 'sklearn.svm.classes.LinearSVC'>, 'Out': {15:                                                 ...o que estoy viendo a los Reyes... Per...      NEU, 16:                                                 ...y me tiran a la basura" http://t.co/j...        N, 21:                                                 ...n irresponsble         P

[1680 rows x 2 columns], 22:                                                 ...trle derecho  ...        N

[94 rows x 2 columns], 24:                                               co...              #progrmscmbidos cc d juste        N, 25:                                                 ...n 5 minutos de retrso... Feliz Cumple...      NEU, 26:                                                 ...o no dije que jessic se provechr de m...        N, 34:                                                 ...o no dije que jessic se provechr de m...        N, 37:                                                 ...o no dije que jessic se provechr de m...        N, 39:                                                 ...o no dije que jessic se provechr de m...        N, ...}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'SnowballStemmer': <class 'nltk.stem.snowball.SnowballStemmer'>, '_':  1    0.484784
-1    0.393641
 0    0.121575
Name: polarity_bin, dtype: float64, '_15':                                                 ...o que estoy viendo a los Reyes... Per...      NEU, '_16':                                                 ...y me tiran a la basura" http://t.co/j...        N, ...}
   2882             finally:
   2883                 # Reset our crash handler in place
   2884                 sys.excepthook = old_excepthook
   2885         except SystemExit as e:

...........................................................................
/home/david.santosg/ml-football-tweets/<ipython-input-64-bbf023576d52> in <module>()
     22     'cls__max_iter': (500, 1000)
     23 }
     24 
     25 
     26 grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1 , scoring='roc_auc')
---> 27 grid_search.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)
     28 
     29 
     30 
     31 

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self=GridSearchCV(cv=None, error_score='raise',
     ..._score=True,
       scoring='roc_auc', verbose=0), X=1       @PauladeLasHeras No te libraras de ayuda...y tan irresponsable 
Name: content, dtype: object, y=1       0
2       1
3      -1
4       1
6       ...6    1
1007    1
Name: polarity_bin, dtype: int64, groups=None)
    940 
    941         groups : array-like, with shape (n_samples,), optional
    942             Group labels for the samples used while splitting the dataset into
    943             train/test set.
    944         """
--> 945         return self._fit(X, y, groups, ParameterGrid(self.param_grid))
        self._fit = <bound method BaseSearchCV._fit of GridSearchCV(...score=True,
       scoring='roc_auc', verbose=0)>
        X = 1       @PauladeLasHeras No te libraras de ayuda...y tan irresponsable 
Name: content, dtype: object
        y = 1       0
2       1
3      -1
4       1
6       ...6    1
1007    1
Name: polarity_bin, dtype: int64
        groups = None
        self.param_grid = {'cls__C': (0.2, 0.5, 0.7), 'cls__loss': ('hinge', 'squared_hinge'), 'cls__max_iter': (500, 1000), 'vect__max_df': (0.5, 1.9), 'vect__max_features': (500, 1000), 'vect__min_df': (10, 20, 50), 'vect__ngram_range': ((1, 1), (1, 2))}
    946 
    947 
    948 class RandomizedSearchCV(BaseSearchCV):
    949     """Randomized search on hyper parameters.

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _fit(self=GridSearchCV(cv=None, error_score='raise',
     ..._score=True,
       scoring='roc_auc', verbose=0), X=1       @PauladeLasHeras No te libraras de ayuda...y tan irresponsable 
Name: content, dtype: object, y=1       0
2       1
3      -1
4       1
6       ...6    1
1007    1
Name: polarity_bin, dtype: int64, groups=None, parameter_iterable=<sklearn.model_selection._search.ParameterGrid object>)
    559                                   fit_params=self.fit_params,
    560                                   return_train_score=self.return_train_score,
    561                                   return_n_test_samples=True,
    562                                   return_times=True, return_parameters=True,
    563                                   error_score=self.error_score)
--> 564           for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.model_selection._search.ParameterGrid object>
    565           for train, test in cv_iter)
    566 
    567         # if one choose to see train score, "out" will contain train score info
    568         if self.return_train_score:

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseSearchCV._fit.<locals>.<genexpr>>)
    763             if pre_dispatch == "all" or n_jobs == 1:
    764                 # The iterable was consumed all at once by the above for loop.
    765                 # No need to wait for async callbacks to trigger to
    766                 # consumption.
    767                 self._iterating = False
--> 768             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    769             # Make sure that we get a last message telling us we are done
    770             elapsed_time = time.time() - self._start_time
    771             self._print('Done %3i out of %3i | elapsed: %s finished',
    772                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
LookupError                                        Thu May 17 16:49:58 2018
PID: 6995                            Python 3.6.0: /opt/Conda.io/bin/python
...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (Pipeline(steps=[('vect', CountVectorizer(analyze...l2', random_state=None, tol=0.0001, verbose=0))]), 1       @PauladeLasHeras No te libraras de ayuda...y tan irresponsable 
Name: content, dtype: object, 1       0
2       1
3      -1
4       1
6       ...6    1
1007    1
Name: polarity_bin, dtype: int64, make_scorer(roc_auc_score, needs_threshold=True), array([2010, 2014, 2017, ..., 6602, 6603, 6604]), array([   0,    1,    2, ..., 2417, 2419, 2445]), 0, {'cls__C': 0.2, 'cls__loss': 'hinge', 'cls__max_iter': 500, 'vect__max_df': 0.5, 'vect__max_features': 500, 'vect__min_df': 10, 'vect__ngram_range': (1, 1)}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (Pipeline(steps=[('vect', CountVectorizer(analyze...l2', random_state=None, tol=0.0001, verbose=0))]), 1       @PauladeLasHeras No te libraras de ayuda...y tan irresponsable 
Name: content, dtype: object, 1       0
2       1
3      -1
4       1
6       ...6    1
1007    1
Name: polarity_bin, dtype: int64, make_scorer(roc_auc_score, needs_threshold=True), array([2010, 2014, 2017, ..., 6602, 6603, 6604]), array([   0,    1,    2, ..., 2417, 2419, 2445]), 0, {'cls__C': 0.2, 'cls__loss': 'hinge', 'cls__max_iter': 500, 'vect__max_df': 0.5, 'vect__max_features': 500, 'vect__min_df': 10, 'vect__ngram_range': (1, 1)})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=Pipeline(steps=[('vect', CountVectorizer(analyze...l2', random_state=None, tol=0.0001, verbose=0))]), X=1       @PauladeLasHeras No te libraras de ayuda...y tan irresponsable 
Name: content, dtype: object, y=1       0
2       1
3      -1
4       1
6       ...6    1
1007    1
Name: polarity_bin, dtype: int64, scorer=make_scorer(roc_auc_score, needs_threshold=True), train=array([2010, 2014, 2017, ..., 6602, 6603, 6604]), test=array([   0,    1,    2, ..., 2417, 2419, 2445]), verbose=0, parameters={'cls__C': 0.2, 'cls__loss': 'hinge', 'cls__max_iter': 500, 'vect__max_df': 0.5, 'vect__max_features': 500, 'vect__min_df': 10, 'vect__ngram_range': (1, 1)}, fit_params={}, return_train_score=True, return_parameters=True, return_n_test_samples=True, return_times=True, error_score='raise')
    233 
    234     try:
    235         if y_train is None:
    236             estimator.fit(X_train, **fit_params)
    237         else:
--> 238             estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method Pipeline.fit of Pipeline(steps=[('...2', random_state=None, tol=0.0001, verbose=0))])>
        X_train = 2584    La radio pública cumple 75 años. Enhorab...y tan irresponsable 
Name: content, dtype: object
        y_train = 2584    1
2588    1
2593    1
2594    1
2599    ...6    1
1007    1
Name: polarity_bin, dtype: int64
        fit_params = {}
    239 
    240     except Exception as e:
    241         # Note fit time as time until error
    242         fit_time = time.time() - start_time

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self=Pipeline(steps=[('vect', CountVectorizer(analyze...l2', random_state=None, tol=0.0001, verbose=0))]), X=2584    La radio pública cumple 75 años. Enhorab...y tan irresponsable 
Name: content, dtype: object, y=2584    1
2588    1
2593    1
2594    1
2599    ...6    1
1007    1
Name: polarity_bin, dtype: int64, **fit_params={})
    263         Returns
    264         -------
    265         self : Pipeline
    266             This estimator
    267         """
--> 268         Xt, fit_params = self._fit(X, y, **fit_params)
        Xt = undefined
        fit_params = {}
        self._fit = <bound method Pipeline._fit of Pipeline(steps=[(...2', random_state=None, tol=0.0001, verbose=0))])>
        X = 2584    La radio pública cumple 75 años. Enhorab...y tan irresponsable 
Name: content, dtype: object
        y = 2584    1
2588    1
2593    1
2594    1
2599    ...6    1
1007    1
Name: polarity_bin, dtype: int64
    269         if self._final_estimator is not None:
    270             self._final_estimator.fit(Xt, y, **fit_params)
    271         return self
    272 

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self=Pipeline(steps=[('vect', CountVectorizer(analyze...l2', random_state=None, tol=0.0001, verbose=0))]), X=2584    La radio pública cumple 75 años. Enhorab...y tan irresponsable 
Name: content, dtype: object, y=2584    1
2588    1
2593    1
2594    1
2599    ...6    1
1007    1
Name: polarity_bin, dtype: int64, **fit_params={})
    229         Xt = X
    230         for name, transform in self.steps[:-1]:
    231             if transform is None:
    232                 pass
    233             elif hasattr(transform, "fit_transform"):
--> 234                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
        Xt = 2584    La radio pública cumple 75 años. Enhorab...y tan irresponsable 
Name: content, dtype: object
        transform.fit_transform = <bound method CountVectorizer.fit_transform of C...on tokenize at 0x7fb115ae6e18>, vocabulary=None)>
        y = 2584    1
2588    1
2593    1
2594    1
2599    ...6    1
1007    1
Name: polarity_bin, dtype: int64
        fit_params_steps = {'cls': {}, 'vect': {}}
        name = 'vect'
    235             else:
    236                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
    237                               .transform(Xt)
    238         if self._final_estimator is None:

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in fit_transform(self=CountVectorizer(analyzer='word', binary=False, d...ion tokenize at 0x7fb115ae6e18>, vocabulary=None), raw_documents=2584    La radio pública cumple 75 años. Enhorab...y tan irresponsable 
Name: content, dtype: object, y=2584    1
2588    1
2593    1
2594    1
2599    ...6    1
1007    1
Name: polarity_bin, dtype: int64)
    834         max_df = self.max_df
    835         min_df = self.min_df
    836         max_features = self.max_features
    837 
    838         vocabulary, X = self._count_vocab(raw_documents,
--> 839                                           self.fixed_vocabulary_)
        self.fixed_vocabulary_ = False
    840 
    841         if self.binary:
    842             X.data.fill(1)
    843 

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self=CountVectorizer(analyzer='word', binary=False, d...ion tokenize at 0x7fb115ae6e18>, vocabulary=None), raw_documents=2584    La radio pública cumple 75 años. Enhorab...y tan irresponsable 
Name: content, dtype: object, fixed_vocab=False)
    757         indptr = _make_int_array()
    758         values = _make_int_array()
    759         indptr.append(0)
    760         for doc in raw_documents:
    761             feature_counter = {}
--> 762             for feature in analyze(doc):
        feature = undefined
        analyze = <function VectorizerMixin.build_analyzer.<locals>.<lambda>>
        doc = 'La radio pública cumple 75 años. Enhorabuena a t.... Es más necesaria que nunca. #75añosRNE @EDCHrne'
    763                 try:
    764                     feature_idx = vocabulary[feature]
    765                     if feature_idx not in feature_counter:
    766                         feature_counter[feature_idx] = 1

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in <lambda>(doc='La radio pública cumple 75 años. Enhorabuena a t.... Es más necesaria que nunca. #75añosRNE @EDCHrne')
    236         elif self.analyzer == 'word':
    237             stop_words = self.get_stop_words()
    238             tokenize = self.build_tokenizer()
    239 
    240             return lambda doc: self._word_ngrams(
--> 241                 tokenize(preprocess(self.decode(doc))), stop_words)
        doc = 'La radio pública cumple 75 años. Enhorabuena a t.... Es más necesaria que nunca. #75añosRNE @EDCHrne'
    242 
    243         else:
    244             raise ValueError('%s is not a valid tokenization scheme/analyzer' %
    245                              self.analyzer)

...........................................................................
/home/david.santosg/ml-football-tweets/<ipython-input-59-ba382de6c55a> in tokenize(text='la radio pública cumple  años enhorabuena a toda...osible es más necesaria que nunca añosrne edchrne')
     12 
     13 def tokenize(text):
     14     # remove non letters
     15     text = ''.join([c for c in text if c not in non_words])
     16     # tokenize
---> 17     tokens =  word_tokenize(text)
     18 
     19     # stem
     20     try:
     21         stems = stem_tokens(tokens, stemmer)

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/nltk/tokenize/__init__.py in word_tokenize(text='la radio pública cumple  años enhorabuena a toda...osible es más necesaria que nunca añosrne edchrne', language='english')
    104     for the specified language).
    105 
    106     :param text: text to split into sentences
    107     :param language: the model name in the Punkt corpus
    108     """
--> 109     return [token for sent in sent_tokenize(text, language)
        text = 'la radio pública cumple  años enhorabuena a toda...osible es más necesaria que nunca añosrne edchrne'
        language = 'english'
    110             for token in _treebank_word_tokenize(sent)]
    111 
    112 
    113 

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/nltk/tokenize/__init__.py in sent_tokenize(text='la radio pública cumple  años enhorabuena a toda...osible es más necesaria que nunca añosrne edchrne', language='english')
     88     for the specified language).
     89 
     90     :param text: text to split into sentences
     91     :param language: the model name in the Punkt corpus
     92     """
---> 93     tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
        tokenizer = undefined
        language = 'english'
     94     return tokenizer.tokenize(text)
     95 
     96 # Standard word tokenizer.
     97 _treebank_word_tokenize = TreebankWordTokenizer().tokenize

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/nltk/data.py in load(resource_url='nltk:tokenizers/punkt/PY3/english.pickle', format='pickle', cache=True, verbose=False, logic_parser=None, fstruct_reader=None, encoding=None)
    803     # Let the user know what's going on.
    804     if verbose:
    805         print('<<Loading %s>>' % (resource_url,))
    806 
    807     # Load the resource.
--> 808     opened_resource = _open(resource_url)
        opened_resource = undefined
        resource_url = 'nltk:tokenizers/punkt/PY3/english.pickle'
    809 
    810     if format == 'raw':
    811         resource_val = opened_resource.read()
    812     elif format == 'pickle':

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/nltk/data.py in _open(resource_url='nltk:tokenizers/punkt/PY3/english.pickle')
    921     """
    922     resource_url = normalize_resource_url(resource_url)
    923     protocol, path_ = split_resource_url(resource_url)
    924 
    925     if protocol is None or protocol.lower() == 'nltk':
--> 926         return find(path_, path + ['']).open()
        path_ = 'tokenizers/punkt/PY3/english.pickle'
    927     elif protocol.lower() == 'file':
    928         # urllib might not use mode='rb', so handle this one ourselves:
    929         return find(path_, ['']).open()
    930     else:

...........................................................................
/opt/Conda.io/lib/python3.6/site-packages/nltk/data.py in find(resource_name='tokenizers/punkt/PY3/english.pickle', paths=['/home/david.santosg/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', ''])
    643         (resource_name,), initial_indent='  ', subsequent_indent='  ',
    644         width=66)
    645     msg += '\n  Searched in:' + ''.join('\n    - %r' % d for d in paths)
    646     sep = '*' * 70
    647     resource_not_found = '\n%s\n%s\n%s' % (sep, msg, sep)
--> 648     raise LookupError(resource_not_found)
        resource_not_found = '\n***********************************************...*************************************************'
    649 
    650 
    651 def retrieve(resource_url, filename=None, verbose=True):
    652     """

LookupError: 
**********************************************************************
  Resource 'tokenizers/punkt/PY3/english.pickle' not found.
  Please use the NLTK Downloader to obtain the resource:  >>>
  nltk.download()
  Searched in:
    - '/home/david.santosg/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************
___________________________________________________________________________

In [65]:
grid_search.best_params_

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [66]:
from sklearn.externals import joblib
joblib.dump(grid_search, 'grid_search.pkl')

['grid_search.pkl']

In [67]:


model = LinearSVC(C=.2, loss='squared_hinge',max_iter=1000,multi_class='ovr',
              random_state=None,
              penalty='l2',
              tol=0.0001
)

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = spanish_stopwords,
    min_df = 50,
    max_df = 1.9,
    ngram_range=(1, 1),
    max_features=1000
)

corpus_data_features = vectorizer.fit_transform(tweets_corpus.content)
corpus_data_features_nd = corpus_data_features.toarray()



LookupError: 
**********************************************************************
  Resource 'tokenizers/punkt/PY3/english.pickle' not found.
  Please use the NLTK Downloader to obtain the resource:  >>>
  nltk.download()
  Searched in:
    - '/home/david.santosg/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************

In [68]:
scores = cross_val_score(
    model,
    corpus_data_features_nd[0:len(tweets_corpus)],
    y=tweets_corpus.polarity_bin,
    scoring='roc_auc',
    cv=5
    )

scores.mean()

NameError: name 'corpus_data_features_nd' is not defined

### Polarity Prediction

In [69]:
tweets = pd.read_csv('tweets_parsed.csv', encoding='utf-8')

FileNotFoundError: File b'tweets_parsed.csv' does not exist

In [70]:
tweets = tweets[tweets.tweet.str.len() < 150]
tweets.lat = pd.to_numeric(tweets.lat, errors='coerce')
tweets = tweets[tweets.lat.notnull()]


#We make sure only those tweets in the Murcia bounding box are kept
min_lon = -1.157420
max_lon = -1.081202
min_lat = 37.951741
max_lat = 38.029126

tweets = tweets[(tweets.lat.notnull()) & (tweets.lon.notnull())]

tweets = tweets[(tweets.lon > min_lon) & (tweets.lon < max_lon) & (tweets.lat > min_lat) & (tweets.lat < max_lat)]
tweets.shape

AttributeError: 'list' object has no attribute 'tweet'