# Inory in Spanish Tweets (Random Forest Model)

### Building a predictive model using 

In [2]:
import nltk
# nltk.download()

In [3]:
import pandas as pd

In [4]:
import numpy as np

In [5]:
import io

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [8]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [9]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [10]:
popular_words = stopwords.words('spanish')

In [11]:
with io.open('data/corpus.txt','r',encoding='utf8') as f:
    corpus = f.readlines()

In [12]:
supreme = []
for tweet in corpus:
        supreme.append(tweet.split('|',2)[-2:])

In [13]:
corpus_array = np.array(supreme)

In [16]:
df = pd.DataFrame(data=corpus_array,columns=['cat','tweet'])

In [17]:
df.head()

Unnamed: 0,cat,tweet
0,ironic,No hay casi nadie de Cuenca en Benidorm 😆😆😆 \n
1,ironic,QuienLoHubieraDicho hacer tuneles es mas facil...
2,not_ironic,"""La vida era madre fácil cuando no había Twitt..."
3,not_ironic,"""Oh no se para qué hablo con esta guachenga.. ..."
4,not_ironic,"""¿Qué fue lo primero que se te vino a la cabez..."


In [18]:
df.count()

cat      15306
tweet    15306
dtype: int64

In [48]:
pipeline_svm = Pipeline([
    ('bow', CountVectorizer(analyzer='word',stop_words=popular_words,min_df=10,ngram_range=(1,6))),
    ('tfidf', TfidfTransformer()), 
    ('classifier', SVC(kernel='linear')),  
])

In [41]:
pipeline_rand_forest = Pipeline([
    ('bow', CountVectorizer(analyzer='char',stop_words=popular_words,min_df=10,ngram_range=(1,6))),
    ('tfidf', TfidfTransformer()), 
    ('classifier', RandomForestClassifier(n_estimators=200)),  
])

In [31]:
tweet_train, tweet_test, label_train, label_test = \
train_test_split(df['tweet'], df['cat'], test_size=0.33)

print(len(tweet_train), len(tweet_test), len(tweet_train) + len(tweet_test))

(10255, 5051, 15306)


In [49]:
pipeline_svm.fit(tweet_train,label_train)

Pipeline(steps=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(1, 6), preprocessor=None,
        stop_words=[u'de', u...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [43]:
pipeline_rand_forest.fit(tweet_train,label_train)

Pipeline(steps=[('bow', CountVectorizer(analyzer='char', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(1, 6), preprocessor=None,
        stop_words=[u'de', u...mators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [54]:
print tweet_test.head()
print tweet_train.head()

9856     Odio descansar los lunes  beach espiñeiro play...
5564     "@ Que sea este lunes un buen inicio de semana...
3170     @ pero allí pues Ud troll a favor de Rodas (co...
9544     ¡Qué originalidad de chica! ¡Qué perdonalidad ...
13963    "Como quisiera ayudarte a olvidar tu pasado. 😋"\n
Name: tweet, dtype: object
614          "Son unas genias estas minas como cantan 😆"\n
14239    Que dura es la vida...... vacaciones  monster ...
5874                               Oleeee mi orgullo 👏👏 \n
13447    "@ gracias Don Gustavo,no hay que dejar que cu...
4020     "@  jaja es hermoso él... Ni se compara... Com...
Name: tweet, dtype: object


In [50]:
predictions_svm = pipeline_svm.predict(tweet_test)

In [45]:
predictions_tree = pipeline_rand_forest.predict(tweet_test)

In [51]:
print "Support Vector Machine"
print (classification_report(label_test,predictions_svm))

Support Vector Machine
             precision    recall  f1-score   support

     ironic       0.74      0.67      0.70      2514
 not_ironic       0.70      0.77      0.73      2537

avg / total       0.72      0.72      0.72      5051



In [47]:
print "Random Forest"
print (classification_report(label_test,predictions_tree))

Random Forest
             precision    recall  f1-score   support

     ironic       1.00      0.98      0.99      2514
 not_ironic       0.98      1.00      0.99      2537

avg / total       0.99      0.99      0.99      5051

