# Irony in Spanish Tweets (SVM Model)  
### Building a predictive model using the SVM algorithm  

Information about the SVM algorithm, for a better understanding:  
* [Theory and characteristics](http://www.statsoft.com/textbook/support-vector-machines)
* [Very useful examples](http://scikit-learn.org/stable/modules/svm.html)

In [1]:
import nltk
# nltk.download()

In [2]:
import pandas as pd
import numpy as np
import io

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [5]:
from sklearn.svm import SVC

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

In [7]:
# Spanish stop-word list from NLTK; it is handed to CountVectorizer as
# `stop_words` when the pipeline is built.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand (see the commented-out nltk.download()) — confirm.
popular_words = stopwords.words('spanish')

In [11]:
def read_ds(ds_list):
    """Split raw corpus lines into their trailing pipe-delimited fields.

    Each line is split on '|' at most twice and the last two resulting
    pieces are kept; this notebook labels them 'cat' and 'tweet'. A line
    with fewer than two delimiters yields whatever pieces exist (possibly
    only one). Any trailing newline stays attached to the last piece.

    :param ds_list: iterable of text lines.
    :return: list containing one list of fields per input line.
    """
    return [line.split('|', 2)[-2:] for line in ds_list]

def cat2bin(category_col):
    """Encode a categorical column as a flat pandas Series of binary labels.

    The column is fitted and transformed with scikit-learn's
    ``LabelBinarizer`` and the result is reshaped to one dimension.

    :param category_col: column of category labels (e.g. a DataFrame column).
    :return: ``pd.Series`` with one encoded value per instance.
    """
    binarizer = preprocessing.LabelBinarizer()
    # Per the original author's note, the result is a NumPy array of shape
    # (#instances, 1).
    encoded = binarizer.fit_transform(category_col)
    # Flatten to shape (#instances,): the original author notes that almost
    # all cross-validation metrics need the target as a 1-D vector.
    n_instances = encoded.shape[0]
    flat = encoded.reshape((n_instances,))
    return pd.Series(flat)

In [25]:
# Text-classification pipeline: bag-of-words counts -> TF-IDF weighting ->
# linear-kernel SVM.
pipeline_svm = Pipeline(steps=[
    (
        'bow',
        CountVectorizer(
            analyzer='word',
            stop_words=popular_words,  # Spanish stop words are dropped
            min_df=10,
            ngram_range=(1, 6),
        ),
    ),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC(kernel='linear')),
])

### SVM with 10-90 distribution

In [37]:
# Load the 10-90 corpus as a list of UTF-8 decoded lines (line endings kept).
with io.open('data/corpus_10_90.txt', mode='r', encoding='utf8') as f:
    corpus_10_90 = list(f)

In [38]:
# Parse every raw line into its two trailing fields and load them into a
# DataFrame with columns 'cat' and 'tweet'.
# Fix: the parser defined in this notebook is `read_ds`; the previous call to
# `reading_ds` referenced a name that is never defined (NameError).
arr_10_90 = np.array(read_ds(corpus_10_90))
df_10_90 = pd.DataFrame(data=arr_10_90, columns=['cat', 'tweet'])
df_10_90.count()  # per-column non-null counts, shown as the cell output

cat      76530
tweet    76530
dtype: int64

In [39]:
# Binarize the 10-90 category labels into the 1-D target vector used by the
# cross-validation below, then display its shape as a sanity check.
cat_binary = cat2bin(df_10_90['cat'])
cat_binary.shape

(76530,)

In [41]:
# Report the mean 10-fold cross-validated score of the SVM pipeline on the
# 10-90 corpus, one line per metric.
# A single-argument print(...) call replaces the Python 2 print statements so
# the cell also runs under Python 3 (where the print statement is a
# SyntaxError) while keeping the same "name  :  value" layout.
for score in ["accuracy", "precision", "recall", 'f1']:
    mean_score = cross_val_score(pipeline_svm, df_10_90.tweet, cat_binary,
                                 scoring=score, cv=10, n_jobs=-1).mean()
    print("%s  :  %s" % (score, mean_score))

 accuracy  :  0.910910768397
precision  :  0.913490826588
recall  :  0.99526692044
f1  :  0.95262673878


### SVM with 30-70 distribution

In [42]:
# Load the 30-70 corpus as a list of UTF-8 decoded lines (line endings kept).
with io.open('data/corpus_30_70.txt', mode='r', encoding='utf8') as f:
    corpus_30_70 = list(f)

In [43]:
# Parse every raw line into its two trailing fields and load them into a
# DataFrame with columns 'cat' and 'tweet'.
# Fix: the parser defined in this notebook is `read_ds`; the previous call to
# `reading_ds` referenced a name that is never defined (NameError).
arr_30_70 = np.array(read_ds(corpus_30_70))
df_30_70 = pd.DataFrame(data=arr_30_70, columns=['cat', 'tweet'])
df_30_70.count()  # per-column non-null counts, shown as the cell output

cat      25510
tweet    25510
dtype: int64

In [44]:
# Binarize the 30-70 category labels into the 1-D target vector used by the
# cross-validation below, then display its shape as a sanity check.
cat_binary = cat2bin(df_30_70['cat'])
cat_binary.shape

(25510,)

In [45]:
# Report the mean 10-fold cross-validated score of the SVM pipeline on the
# 30-70 corpus, one line per metric.
# A single-argument print(...) call replaces the Python 2 print statements so
# the cell also runs under Python 3 (where the print statement is a
# SyntaxError) while keeping the same "name  :  value" layout.
for score in ["accuracy", "precision", "recall", 'f1']:
    mean_score = cross_val_score(pipeline_svm, df_30_70.tweet, cat_binary,
                                 scoring=score, cv=10, n_jobs=-1).mean()
    print("%s  :  %s" % (score, mean_score))

accuracy  :  0.792669767694
precision  :  0.80432263833
recall  :  0.930167188936
f1  :  0.862662358688


### SVM with 50-50 distribution

In [46]:
# Load the 50-50 corpus as a list of UTF-8 decoded lines (line endings kept).
with io.open('data/corpus_50_50.txt', mode='r', encoding='utf8') as f:
    corpus_50_50 = list(f)

In [47]:
# Parse every raw line into its two trailing fields and load them into a
# DataFrame with columns 'cat' and 'tweet'.
# Fix: the parser defined in this notebook is `read_ds`; the previous call to
# `reading_ds` referenced a name that is never defined (NameError).
arr_50_50 = np.array(read_ds(corpus_50_50))
df_50_50 = pd.DataFrame(data=arr_50_50, columns=['cat', 'tweet'])
df_50_50.count()  # per-column non-null counts, shown as the cell output

cat      15306
tweet    15306
dtype: int64

In [48]:
# Binarize the 50-50 category labels into the 1-D target vector used by the
# cross-validation below, then display its shape as a sanity check.
cat_binary = cat2bin(df_50_50['cat'])
cat_binary.shape

(15306,)

In [49]:
# Report the mean 10-fold cross-validated score of the SVM pipeline on the
# 50-50 corpus, one line per metric.
# A single-argument print(...) call replaces the Python 2 print statements so
# the cell also runs under Python 3 (where the print statement is a
# SyntaxError) while keeping the same "name  :  value" layout.
for score in ["accuracy", "precision", "recall", 'f1']:
    mean_score = cross_val_score(pipeline_svm, df_50_50.tweet, cat_binary,
                                 scoring=score, cv=10, n_jobs=-1).mean()
    print("%s  :  %s" % (score, mean_score))

accuracy  :  0.728472414205
precision  :  0.706313323718
recall  :  0.782307206608
f1  :  0.742329234118
