## Tweet classification

File names

In [1]:
base_location = './'

# To read
tweets_raw_file = base_location + 'datasets/train.csv'
tweets_run_file = base_location + 'datasets/test_nolabel.csv'
corpus_tweets_2012_xml = base_location + 'general-train-tagged-3l.xml'
corpus_tweets_2017_xml = base_location + 'intertass-train-tagged.xml'

# To generate
corpus_tweets_2012_csv = base_location + 'general-train-tagged-3l.csv'
corpus_tweets_2017_csv = base_location + 'intertass-train-tagged.csv'
corpus_tweets_csv = base_location + 'corpus_tweets.csv'

Import libraries

In [2]:
import pandas as pd
import numpy as np

### Load datasets

In [3]:
tweets_raw = pd.read_csv(tweets_raw_file, encoding='utf-8')
tweets_run = pd.read_csv(tweets_run_file, encoding='utf-8')

print('Total tweets: %d' % len(tweets_raw))
print('Evaluated tweets so far: %d' % len(tweets_run))

Total tweets: 411
Evaluated tweets so far: 177


### Create train and test data

Aux function to convert evaluations to numeric values, accordnig to the rule:
- `Negativo`: -1.
- `Positivo`: 1.
- `Neutro`: 0.
- Anything else: 2.

In [4]:
def convert_to_numeric(evaluation):
    if evaluation == 'Positivo':
        return 1
    elif evaluation == 'Neutro':
        return 0
    elif evaluation == 'Negativo':
        return -1
    else:
        return 2    

Build new array of dictionaries with keys `id` (the task ID), `tweet` (the tweet string) and `score` (the tweet evaluation) by joining data from both CSV files.

In [8]:
# Build dictionary of tweets where key is the task__id
own_tweets = []
tweets_obj = {}
for index, row in tweets_raw.iterrows():
    own_tweets.append({
        'id': row.id,
        'tweet': row.text, 
        'polarity': row.polarity
    })

print('Total different tweets evaluated so far: %d' % len(own_tweets))

Total different tweets evaluated so far: 411


### POS Tagging

Import libraries to read XML:

In [9]:
from lxml import objectify

Import/read most recent corpus (2017):

In [10]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2017_xml))
root = xml.getroot()
general_tweets_corpus_train_2017 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiment.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2017 = general_tweets_corpus_train_2017.append(row_s)
    
general_tweets_corpus_train_2017.to_csv(corpus_tweets_2017_csv, index=False, encoding='utf-8')

Import/read biggest corpus (2012), to concatenate it with the previous noe:

In [11]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2012_xml))
root = xml.getroot()
general_tweets_corpus_train_2012 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiments.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2012 = general_tweets_corpus_train_2012.append(row_s)
    
general_tweets_corpus_train_2012.to_csv(corpus_tweets_2012_csv, index=False, encoding='utf-8')

Concatenate general corpus dataset with 2017 one, to have a better result:

In [12]:
tweets_corpus = pd.concat([
        general_tweets_corpus_train_2012,
        general_tweets_corpus_train_2017
    ])
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
6744,“@policebluetour: Buenas noches a tod@s :-) #F...,P
220,@DieRaposa @DrXaverius y otros biólogos como é...,P
6338,Cafe Berlin. Flipando con el concierto en dire...,P
3450,Como siempre buenísimo Santiago Gonzalez. Sobr...,P
2112,Cuando la corrupción se convierte en nuestro p...,N
1552,"Claro,claro..... sobretodo ahora que no tengo ...",N
2868,"Rajoy: ""ya sabemos que las cosas están mal"". N...",N
2139,Esta es la mejor noticia q he leído hoy en la ...,P
6566,Yo mañana voy a trabajar.,NONE
6093,Hoy 19 de marzo los gaditanos celebramos un gr...,P


In [13]:
print('Total corpus tweets: %d' % len(tweets_corpus))

Total corpus tweets: 8227


Remove tweets without polarity (polarity `NONE`):

In [14]:
tweets_corpus = tweets_corpus.query('polarity != "NONE"')

Remove links:

In [15]:
tweets_corpus = tweets_corpus[-tweets_corpus.content.str.contains('^http.*$')]

In [16]:
import re
url_regex = re.compile('https?:\/\/t\.co\/[\w]{8,8}')
tweets_corpus_no_links = tweets_corpus
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(url_regex, '', x))

In [17]:
print('Total corpus tweets after cleaning: %d' % len(tweets_corpus_no_links))

Total corpus tweets after cleaning: 6586


In [56]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
1109,"@PabloAIglesias no se si mañana o pasado,jaja",P
2783,"#Andalucia necesita claridad en sus cuentas, t...",N
2263,El Gobierno recuerda a sindicatos y empresario...,N
6233,Según el relato de EP la única presión que se ...,N
3562,Si Chacón gana es urgente que aprenda la difer...,NEU
1284,Es lo maravilloso de este nuevo medio de expre...,P
330,Resumen de la larga y agria noche en Bruselas:...,N
4658,Bruselas será ligeramente flexible con los obj...,N
6951,Desde que en julio de 1988 empecé a trabajar e...,N
1240,Hoy en Cartaya con las personas mayores. Nos m...,P


### Tokenization and stemming

In [18]:
#download spanish stopwords
import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords
spanish_stopwords = stopwords.words('spanish')

[nltk_data] Downloading package stopwords to /Users/dass/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
from string import punctuation
non_words = list(punctuation)

#we add spanish punctuation
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))
non_words

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~',
 '¿',
 '¡',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9']

In [20]:
from sklearn.feature_extraction.text import CountVectorizer       
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = SnowballStemmer('spanish')
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = ''.join([c for c in text if c not in non_words])
    # tokenize
    tokens =  word_tokenize(text)

    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

### Model Evaluation

In [21]:
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline



In [22]:
tweets_corpus_no_links['polarity_bin'] = 0
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['P'])] = 1
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['N'])] = -1
tweets_corpus_no_links.polarity_bin.value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


 1    0.483602
-1    0.394777
 0    0.121622
Name: polarity_bin, dtype: float64

In [23]:
from sklearn.model_selection import GridSearchCV
vectorizer = CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', LinearSVC()),
])



parameters = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'cls__C': (0.2, 0.5, 0.7),
    'cls__loss': ('hinge', 'squared_hinge'),
    'cls__max_iter': (500, 1000)
}


grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1 , scoring='roc_auc')
grid_search.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)

JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/Users/dass/anaconda3/lib/python3.6/runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
/Users/dass/anaconda3/lib/python3.6/runpy.py in _run_code(code=<code object <module> at 0x10f927930, file "/Use...3.6/site-packages/ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/Users/dass/anaconda3/lib/python3.6/site-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/Users/dass/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/Users/dass/.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x10f927930, file "/Use...3.6/site-packages/ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/Users/dass/anaconda3/lib/python3.6/site-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/Users/dass/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/Users/dass/.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    472             return self.subapp.start()
    473         if self.poller is not None:
    474             self.poller.start()
    475         self.kernel.start()
    476         try:
--> 477             ioloop.IOLoop.instance().start()
    478         except KeyboardInterrupt:
    479             pass
    480 
    481 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    172             )
    173         return loop
    174     
    175     def start(self):
    176         try:
--> 177             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    178         except ZMQError as e:
    179             if e.errno == ETERM:
    180                 # quietly return on ETERM
    181                 pass

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    883                 self._events.update(event_pairs)
    884                 while self._events:
    885                     fd, events = self._events.popitem()
    886                     try:
    887                         fd_obj, handler_func = self._handlers[fd]
--> 888                         handler_func(fd_obj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    889                     except (OSError, IOError) as e:
    890                         if errno_from_exception(e) == errno.EPIPE:
    891                             # Happens when the client closes the connection
    892                             pass

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 5, 18, 11, 39, 45, 808970, tzinfo=tzutc()), 'msg_id': '531B8C85E1FF472783671D0B1FADF251', 'msg_type': 'execute_request', 'session': '45BE33CB150248179C8179A4E1D6C8CB', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '531B8C85E1FF472783671D0B1FADF251', 'msg_type': 'execute_request', 'parent_header': {}})
    230             self.log.warn("Unknown message type: %r", msg_type)
    231         else:
    232             self.log.debug("%s: %s", msg_type, msg)
    233             self.pre_handler_hook()
    234             try:
--> 235                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'45BE33CB150248179C8179A4E1D6C8CB']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 5, 18, 11, 39, 45, 808970, tzinfo=tzutc()), 'msg_id': '531B8C85E1FF472783671D0B1FADF251', 'msg_type': 'execute_request', 'session': '45BE33CB150248179C8179A4E1D6C8CB', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '531B8C85E1FF472783671D0B1FADF251', 'msg_type': 'execute_request', 'parent_header': {}}
    236             except Exception:
    237                 self.log.error("Exception in message handler:", exc_info=True)
    238             finally:
    239                 self.post_handler_hook()

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'45BE33CB150248179C8179A4E1D6C8CB'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 5, 18, 11, 39, 45, 808970, tzinfo=tzutc()), 'msg_id': '531B8C85E1FF472783671D0B1FADF251', 'msg_type': 'execute_request', 'session': '45BE33CB150248179C8179A4E1D6C8CB', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '531B8C85E1FF472783671D0B1FADF251', 'msg_type': 'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code='from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    191 
    192         self._forward_input(allow_stdin)
    193 
    194         reply_content = {}
    195         try:
--> 196             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = 'from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)'
        store_history = True
        silent = False
    197         finally:
    198             self._restore_input()
    199 
    200         if res.error_before_exec is not None:

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=('from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)',), **kwargs={'silent': False, 'store_history': True})
    528             )
    529         self.payload_manager.write_payload(payload)
    530 
    531     def run_cell(self, *args, **kwargs):
    532         self._last_traceback = None
--> 533         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ('from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)',)
        kwargs = {'silent': False, 'store_history': True}
    534 
    535     def _showtraceback(self, etype, evalue, stb):
    536         # try to preserve ordering of tracebacks and print statements
    537         sys.stdout.flush()

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='from sklearn.model_selection import GridSearchCV...nks.content, tweets_corpus_no_links.polarity_bin)', store_history=True, silent=False, shell_futures=True)
   2693                 self.displayhook.exec_result = result
   2694 
   2695                 # Execute the user code
   2696                 interactivity = "none" if silent else self.ast_node_interactivity
   2697                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2698                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2699                 
   2700                 self.last_execution_succeeded = not has_raised
   2701 
   2702                 # Reset this so later displayed values do not modify the

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.ImportFrom object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>], cell_name='<ipython-input-23-bbf023576d52>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 11a581c50, execution_..._before_exec=None error_in_exec=None result=None>)
   2803                     return True
   2804 
   2805             for i, node in enumerate(to_run_interactive):
   2806                 mod = ast.Interactive([node])
   2807                 code = compiler(mod, cell_name, "single")
-> 2808                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x11a6106f0, file "<ipython-input-23-bbf023576d52>", line 27>
        result = <ExecutionResult object at 11a581c50, execution_..._before_exec=None error_in_exec=None result=None>
   2809                     return True
   2810 
   2811             # Flush softspace
   2812             if softspace(sys.stdout, 0):

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x11a6106f0, file "<ipython-input-23-bbf023576d52>", line 27>, result=<ExecutionResult object at 11a581c50, execution_..._before_exec=None error_in_exec=None result=None>)
   2857         outflag = True  # happens in more places, so it's easier as default
   2858         try:
   2859             try:
   2860                 self.hooks.pre_run_code_hook()
   2861                 #rprint('Running code', repr(code_obj)) # dbg
-> 2862                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x11a6106f0, file "<ipython-input-23-bbf023576d52>", line 27>
        self.user_global_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', "base_location = './'\n\n# To read\ntweets_raw_file ..._tweets_csv = base_location + 'corpus_tweets.csv'", 'import pandas as pd\nimport numpy as np', "tweets_raw = pd.read_csv(tweets_raw_file, encodi...('Evaluated tweets so far: %d' % len(tweets_run))", 'def convert_to_numeric(evaluation):\n    if evalu...\n        return -1\n    else:\n        return 2    ', "# Build dictionary of tweets where key is the ta...t tweets evalauted so far: %d' % len(own_tweets))", "# Build dictionary of tweets where key is the ta...t tweets evaluated so far: %d' % len(own_tweets))", "# Build dictionary of tweets where key is the ta... so far: %d' % len(own_tweets))\nprint(own_tweets)", "# Build dictionary of tweets where key is the ta...t tweets evaluated so far: %d' % len(own_tweets))", 'from lxml import objectify', "# 4 values of sentiment: N, P, NONE, NEU\nxml = o...s_tweets_2017_csv, index=False, encoding='utf-8')", "# 4 values of sentiment: N, P, NONE, NEU\nxml = o...s_tweets_2012_csv, index=False, encoding='utf-8')", 'tweets_corpus = pd.concat([\n        general_twee...corpus_train_2017\n    ])\ntweets_corpus.sample(10)', "print('Total corpus tweets: %d' % len(tweets_corpus))", 'tweets_corpus = tweets_corpus.query(\'polarity != "NONE"\')', "tweets_corpus = tweets_corpus[-tweets_corpus.content.str.contains('^http.*$')]", "import re\nurl_regex = re.compile('https?:\\/\\/t\\....content'].map(lambda x: re.sub(url_regex, '', x))", "print('Total corpus tweets after cleaning: %d' % len(tweets_corpus_no_links))", "#download spanish stopwords\nimport nltk\nnltk.dow...ds\nspanish_stopwords = stopwords.words('spanish')", 'from string import punctuation\nnon_words = list(...])\nnon_words.extend(map(str,range(10)))\nnon_words', ...], 'LinearSVC': <class 'sklearn.svm.classes.LinearSVC'>, 'Out': {12:                                                 ... marzo los gaditanos celebramos un gr...        P, 19: ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', ...], 22:  1    0.483602
-1    0.394777
 0    0.121622
Name: polarity_bin, dtype: float64}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'SnowballStemmer': <class 'nltk.stem.snowball.SnowballStemmer'>, '_':  1    0.483602
-1    0.394777
 0    0.121622
Name: polarity_bin, dtype: float64, '_12':                                                 ... marzo los gaditanos celebramos un gr...        P, '_19': ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', ...], ...}
        self.user_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', "base_location = './'\n\n# To read\ntweets_raw_file ..._tweets_csv = base_location + 'corpus_tweets.csv'", 'import pandas as pd\nimport numpy as np', "tweets_raw = pd.read_csv(tweets_raw_file, encodi...('Evaluated tweets so far: %d' % len(tweets_run))", 'def convert_to_numeric(evaluation):\n    if evalu...\n        return -1\n    else:\n        return 2    ', "# Build dictionary of tweets where key is the ta...t tweets evalauted so far: %d' % len(own_tweets))", "# Build dictionary of tweets where key is the ta...t tweets evaluated so far: %d' % len(own_tweets))", "# Build dictionary of tweets where key is the ta... so far: %d' % len(own_tweets))\nprint(own_tweets)", "# Build dictionary of tweets where key is the ta...t tweets evaluated so far: %d' % len(own_tweets))", 'from lxml import objectify', "# 4 values of sentiment: N, P, NONE, NEU\nxml = o...s_tweets_2017_csv, index=False, encoding='utf-8')", "# 4 values of sentiment: N, P, NONE, NEU\nxml = o...s_tweets_2012_csv, index=False, encoding='utf-8')", 'tweets_corpus = pd.concat([\n        general_twee...corpus_train_2017\n    ])\ntweets_corpus.sample(10)', "print('Total corpus tweets: %d' % len(tweets_corpus))", 'tweets_corpus = tweets_corpus.query(\'polarity != "NONE"\')', "tweets_corpus = tweets_corpus[-tweets_corpus.content.str.contains('^http.*$')]", "import re\nurl_regex = re.compile('https?:\\/\\/t\\....content'].map(lambda x: re.sub(url_regex, '', x))", "print('Total corpus tweets after cleaning: %d' % len(tweets_corpus_no_links))", "#download spanish stopwords\nimport nltk\nnltk.dow...ds\nspanish_stopwords = stopwords.words('spanish')", 'from string import punctuation\nnon_words = list(...])\nnon_words.extend(map(str,range(10)))\nnon_words', ...], 'LinearSVC': <class 'sklearn.svm.classes.LinearSVC'>, 'Out': {12:                                                 ... marzo los gaditanos celebramos un gr...        P, 19: ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', ...], 22:  1    0.483602
-1    0.394777
 0    0.121622
Name: polarity_bin, dtype: float64}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'SnowballStemmer': <class 'nltk.stem.snowball.SnowballStemmer'>, '_':  1    0.483602
-1    0.394777
 0    0.121622
Name: polarity_bin, dtype: float64, '_12':                                                 ... marzo los gaditanos celebramos un gr...        P, '_19': ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', ...], ...}
   2863             finally:
   2864                 # Reset our crash handler in place
   2865                 sys.excepthook = old_excepthook
   2866         except SystemExit as e:

...........................................................................
/Users/dass/Documents/ml-football-tweets/<ipython-input-23-bbf023576d52> in <module>()
     22     'cls__max_iter': (500, 1000)
     23 }
     24 
     25 
     26 grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1 , scoring='roc_auc')
---> 27 grid_search.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self=GridSearchCV(cv=None, error_score='raise',
     ...core='warn',
       scoring='roc_auc', verbose=0), X=1       @PauladeLasHeras No te libraras de ayuda...sable 
Name: content, Length: 6586, dtype: object, y=1       0
2       1
3      -1
4       1
6       ... 1
Name: polarity_bin, Length: 6586, dtype: int64, groups=None, **fit_params={})
    634                                   return_train_score=self.return_train_score,
    635                                   return_n_test_samples=True,
    636                                   return_times=True, return_parameters=False,
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
        cv.split = <bound method StratifiedKFold.split of Stratifie...ld(n_splits=3, random_state=None, shuffle=False)>
        X = 1       @PauladeLasHeras No te libraras de ayuda...sable 
Name: content, Length: 6586, dtype: object
        y = 1       0
2       1
3      -1
4       1
6       ... 1
Name: polarity_bin, Length: 6586, dtype: int64
        groups = None
    640 
    641         # if one choose to see train score, "out" will contain train score info
    642         if self.return_train_score:
    643             (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseSearchCV.fit.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Fri May 18 13:39:53 2018
PID: 26899                   Python 3.6.3: /Users/dass/anaconda3/bin/python
...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (Pipeline(memory=None,
     steps=[('vect', Count...l2', random_state=None, tol=0.0001, verbose=0))]), 1       @PauladeLasHeras No te libraras de ayuda...sable 
Name: content, Length: 6586, dtype: object, 1       0
2       1
3      -1
4       1
6       ... 1
Name: polarity_bin, Length: 6586, dtype: int64, {'score': make_scorer(roc_auc_score, needs_threshold=True)}, array([2006, 2009, 2010, ..., 6583, 6584, 6585]), array([   0,    1,    2, ..., 2408, 2410, 2436]), 0, {'cls__C': 0.2, 'cls__loss': 'hinge', 'cls__max_iter': 500, 'vect__max_df': 0.5, 'vect__max_features': 500, 'vect__min_df': 10, 'vect__ngram_range': (1, 1)}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (Pipeline(memory=None,
     steps=[('vect', Count...l2', random_state=None, tol=0.0001, verbose=0))]), 1       @PauladeLasHeras No te libraras de ayuda...sable 
Name: content, Length: 6586, dtype: object, 1       0
2       1
3      -1
4       1
6       ... 1
Name: polarity_bin, Length: 6586, dtype: int64, {'score': make_scorer(roc_auc_score, needs_threshold=True)}, array([2006, 2009, 2010, ..., 6583, 6584, 6585]), array([   0,    1,    2, ..., 2408, 2410, 2436]), 0, {'cls__C': 0.2, 'cls__loss': 'hinge', 'cls__max_iter': 500, 'vect__max_df': 0.5, 'vect__max_features': 500, 'vect__min_df': 10, 'vect__ngram_range': (1, 1)})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=Pipeline(memory=None,
     steps=[('vect', Count...l2', random_state=None, tol=0.0001, verbose=0))]), X=1       @PauladeLasHeras No te libraras de ayuda...sable 
Name: content, Length: 6586, dtype: object, y=1       0
2       1
3      -1
4       1
6       ... 1
Name: polarity_bin, Length: 6586, dtype: int64, scorer={'score': make_scorer(roc_auc_score, needs_threshold=True)}, train=array([2006, 2009, 2010, ..., 6583, 6584, 6585]), test=array([   0,    1,    2, ..., 2408, 2410, 2436]), verbose=0, parameters={'cls__C': 0.2, 'cls__loss': 'hinge', 'cls__max_iter': 500, 'vect__max_df': 0.5, 'vect__max_features': 500, 'vect__min_df': 10, 'vect__ngram_range': (1, 1)}, fit_params={}, return_train_score='warn', return_parameters=False, return_n_test_samples=True, return_times=True, error_score='raise')
    483                              " make sure that it has been spelled correctly.)")
    484 
    485     else:
    486         fit_time = time.time() - start_time
    487         # _score will return dict if is_multimetric is True
--> 488         test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
        test_scores = {}
        estimator = Pipeline(memory=None,
     steps=[('vect', Count...l2', random_state=None, tol=0.0001, verbose=0))])
        X_test = 1       @PauladeLasHeras No te libraras de ayuda...o e...
Name: content, Length: 2196, dtype: object
        y_test = 1       0
2       1
3      -1
4       1
6       ... 0
Name: polarity_bin, Length: 2196, dtype: int64
        scorer = {'score': make_scorer(roc_auc_score, needs_threshold=True)}
        is_multimetric = True
    489         score_time = time.time() - start_time - fit_time
    490         if return_train_score:
    491             train_scores = _score(estimator, X_train, y_train, scorer,
    492                                   is_multimetric)

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _score(estimator=Pipeline(memory=None,
     steps=[('vect', Count...l2', random_state=None, tol=0.0001, verbose=0))]), X_test=1       @PauladeLasHeras No te libraras de ayuda...o e...
Name: content, Length: 2196, dtype: object, y_test=1       0
2       1
3      -1
4       1
6       ... 0
Name: polarity_bin, Length: 2196, dtype: int64, scorer={'score': make_scorer(roc_auc_score, needs_threshold=True)}, is_multimetric=True)
    518 
    519     Will return a single float if is_multimetric is False and a dict of floats,
    520     if is_multimetric is True
    521     """
    522     if is_multimetric:
--> 523         return _multimetric_score(estimator, X_test, y_test, scorer)
        estimator = Pipeline(memory=None,
     steps=[('vect', Count...l2', random_state=None, tol=0.0001, verbose=0))])
        X_test = 1       @PauladeLasHeras No te libraras de ayuda...o e...
Name: content, Length: 2196, dtype: object
        y_test = 1       0
2       1
3      -1
4       1
6       ... 0
Name: polarity_bin, Length: 2196, dtype: int64
        scorer = {'score': make_scorer(roc_auc_score, needs_threshold=True)}
    524     else:
    525         if y_test is None:
    526             score = scorer(estimator, X_test)
    527         else:

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _multimetric_score(estimator=Pipeline(memory=None,
     steps=[('vect', Count...l2', random_state=None, tol=0.0001, verbose=0))]), X_test=1       @PauladeLasHeras No te libraras de ayuda...o e...
Name: content, Length: 2196, dtype: object, y_test=1       0
2       1
3      -1
4       1
6       ... 0
Name: polarity_bin, Length: 2196, dtype: int64, scorers={'score': make_scorer(roc_auc_score, needs_threshold=True)})
    548 
    549     for name, scorer in scorers.items():
    550         if y_test is None:
    551             score = scorer(estimator, X_test)
    552         else:
--> 553             score = scorer(estimator, X_test, y_test)
        score = undefined
        scorer = make_scorer(roc_auc_score, needs_threshold=True)
        estimator = Pipeline(memory=None,
     steps=[('vect', Count...l2', random_state=None, tol=0.0001, verbose=0))])
        X_test = 1       @PauladeLasHeras No te libraras de ayuda...o e...
Name: content, Length: 2196, dtype: object
        y_test = 1       0
2       1
3      -1
4       1
6       ... 0
Name: polarity_bin, Length: 2196, dtype: int64
    554 
    555         if hasattr(score, 'item'):
    556             try:
    557                 # e.g. unwrap memmapped scalars

...........................................................................
/Users/dass/anaconda3/lib/python3.6/site-packages/sklearn/metrics/scorer.py in __call__(self=make_scorer(roc_auc_score, needs_threshold=True), clf=Pipeline(memory=None,
     steps=[('vect', Count...l2', random_state=None, tol=0.0001, verbose=0))]), X=1       @PauladeLasHeras No te libraras de ayuda...o e...
Name: content, Length: 2196, dtype: object, y=1       0
2       1
3      -1
4       1
6       ... 0
Name: polarity_bin, Length: 2196, dtype: int64, sample_weight=None)
    176         """
    177         super(_ThresholdScorer, self).__call__(clf, X, y,
    178                                                sample_weight=sample_weight)
    179         y_type = type_of_target(y)
    180         if y_type not in ("binary", "multilabel-indicator"):
--> 181             raise ValueError("{0} format is not supported".format(y_type))
        y_type = 'multiclass'
    182 
    183         if is_regressor(clf):
    184             y_pred = clf.predict(X)
    185         else:

ValueError: multiclass format is not supported
___________________________________________________________________________

In [65]:
grid_search.best_params_

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [66]:
from sklearn.externals import joblib
joblib.dump(grid_search, 'grid_search.pkl')

['grid_search.pkl']

In [67]:


model = LinearSVC(C=.2, loss='squared_hinge',max_iter=1000,multi_class='ovr',
              random_state=None,
              penalty='l2',
              tol=0.0001
)

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = spanish_stopwords,
    min_df = 50,
    max_df = 1.9,
    ngram_range=(1, 1),
    max_features=1000
)

corpus_data_features = vectorizer.fit_transform(tweets_corpus.content)
corpus_data_features_nd = corpus_data_features.toarray()



LookupError: 
**********************************************************************
  Resource 'tokenizers/punkt/PY3/english.pickle' not found.
  Please use the NLTK Downloader to obtain the resource:  >>>
  nltk.download()
  Searched in:
    - '/home/david.santosg/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************

In [68]:
scores = cross_val_score(
    model,
    corpus_data_features_nd[0:len(tweets_corpus)],
    y=tweets_corpus.polarity_bin,
    scoring='roc_auc',
    cv=5
    )

scores.mean()

NameError: name 'corpus_data_features_nd' is not defined

### Polarity Prediction

In [69]:
tweets = pd.read_csv('tweets_parsed.csv', encoding='utf-8')

FileNotFoundError: File b'tweets_parsed.csv' does not exist

In [70]:
tweets = tweets[tweets.tweet.str.len() < 150]
tweets.lat = pd.to_numeric(tweets.lat, errors='coerce')
tweets = tweets[tweets.lat.notnull()]


#We make sure only those tweets in the Murcia bounding box are kept
min_lon = -1.157420
max_lon = -1.081202
min_lat = 37.951741
max_lat = 38.029126

tweets = tweets[(tweets.lat.notnull()) & (tweets.lon.notnull())]

tweets = tweets[(tweets.lon > min_lon) & (tweets.lon < max_lon) & (tweets.lat > min_lat) & (tweets.lat < max_lat)]
tweets.shape

AttributeError: 'list' object has no attribute 'tweet'