# Irony in Spanish Tweets (Random Forest Model)

### Building a predictive model using the RF algorithm

Information about the RF algorithm, for a better understanding:
* [Theory and characteristics](https://en.wikipedia.org/wiki/Random_forest)
* [Very useful example](https://chrisalbon.com/machine-learning/random_forest_classifier_example_scikit.html)

In [2]:
import nltk
# nltk.download()

In [3]:
import pandas as pd
import numpy as np
import io

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

In [8]:
# Spanish stop-word list taken from the NLTK `stopwords` corpus
# (presumably requires the corpus to have been fetched via nltk.download() — confirm).
popular_words = stopwords.words('spanish')

In [69]:
def reading_ds(ds_list):
    """Split raw corpus lines into their trailing '|'-separated fields.

    Each line is cut on at most its first two '|' characters and only the
    last two resulting pieces are kept (fewer if the line contains no '|'),
    so any further '|' stays inside the final piece. Lines are not stripped:
    a trailing newline remains part of the last field.

    ds_list -- iterable of strings, one corpus line each.
    Returns a list holding one list of fields per input line.
    """
    return [line.split('|', 2)[-2:] for line in ds_list]

In [15]:
# Character n-gram counts (1- to 6-grams seen in at least 10 documents)
# -> TF-IDF weighting -> random forest with 200 trees.
# No `stop_words` is passed to CountVectorizer: scikit-learn only applies stop
# words when analyzer == 'word', so with analyzer='char' a stop-word list is
# ignored and passing one only suggests filtering that never happens.
# NOTE: no random_state is set on the forest, so scores vary between runs.
pipeline_rand_forest = Pipeline([
    ('bow', CountVectorizer(analyzer='char', min_df=10, ngram_range=(1, 6))),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier(n_estimators=200)),
])

### RF with 10-90 distribution

In [33]:
# Load the 10-90 corpus as a list of raw text lines (line endings kept).
with io.open('data/corpus_10_90.txt', mode='r', encoding='utf8') as f:
    corpus_10_90 = list(f)

In [42]:
# Parse every raw line with reading_ds and pack the field lists into a NumPy array.
# Presumably each line yields exactly two fields, giving an (n_lines, 2) array — TODO confirm.
corpus_array = np.array(reading_ds(corpus_10_90))

In [43]:
# Two-column frame: 'cat' holds the label field, 'tweet' the text field of each line.
df = pd.DataFrame(data=corpus_array,columns=['cat','tweet'])

In [59]:
# Sanity check: number of non-null entries per column.
df.count()

cat      76530
tweet    76530
dtype: int64

In [16]:
# Encode the string labels in df.cat numerically; the recorded output shows
# a single indicator column, i.e. shape (n_rows, 1).
# NOTE(review): which label becomes the positive class is decided by
# LabelBinarizer — check lb.classes_ when interpreting precision/recall.
lb = preprocessing.LabelBinarizer()
cat_binary = lb.fit_transform(df.cat) 
# Display the shape of the encoded labels.
cat_binary.shape

(76530, 1)

In [18]:
# Flatten the (n_rows, 1) label column to 1-D. ravel() avoids hard-coding the
# row count, so this cell keeps working if the corpus size changes.
ps = pd.Series(cat_binary.ravel())

In [19]:
# Report the mean cross-validated score of the pipeline for each metric.
# Every metric triggers its own full cross-validation run, so the pipeline is
# re-fit once per fold per metric.
# NOTE(review): cv=5 here, while the 30-70 and 50-50 experiments use cv=10 —
# confirm this difference is intentional.
for score in ["accuracy", "precision", "recall", "f1"]:
    mean_score = cross_val_score(pipeline_rand_forest, df.tweet, ps,
                                 scoring=score, cv=5, n_jobs=-1).mean()
    # One formatted print: same "<metric>  :  <value>" text as before on
    # Python 2, and valid syntax on Python 3 as well.
    print("%s  :  %s" % (score, mean_score))

accuracy  :  0.946178023023
precision  :  0.943817878691
recall  :  0.999709624144
f1  :  0.97095940595


### RF with 30-70 distribution

In [60]:
# Load the 30-70 corpus as a list of raw text lines (line endings kept).
with io.open('data/corpus_30_70.txt', mode='r', encoding='utf8') as f1:
    corpus_30_70 = list(f1)

In [70]:
# Parse every raw line with reading_ds and pack the field lists into a NumPy array.
# Presumably each line yields exactly two fields, giving an (n_lines, 2) array — TODO confirm.
arr_30_70 = np.array(reading_ds(corpus_30_70))

In [72]:
# Rebind `df` to the 30-70 corpus: 'cat' holds the label field, 'tweet' the text field.
df = pd.DataFrame(data=arr_30_70,columns=['cat','tweet'])

In [73]:
# Sanity check: number of non-null entries per column.
df.count()

cat      25510
tweet    25510
dtype: int64

In [74]:
# Re-fit the existing LabelBinarizer `lb` on the 30-70 labels and encode them;
# the recorded output shows a single indicator column, shape (n_rows, 1).
cat_binary = lb.fit_transform(df.cat) 
# Display the shape of the encoded labels.
cat_binary.shape

(25510, 1)

In [75]:
# Flatten the (n_rows, 1) label column to 1-D. ravel() avoids hard-coding the
# row count, so this cell keeps working if the corpus size changes.
ps = pd.Series(cat_binary.ravel())

In [76]:
# Report the mean 10-fold cross-validated score of the pipeline for each metric.
# Every metric triggers its own full cross-validation run, so the pipeline is
# re-fit once per fold per metric.
for score in ["accuracy", "precision", "recall", "f1"]:
    mean_score = cross_val_score(pipeline_rand_forest, df.tweet, ps,
                                 scoring=score, cv=10, n_jobs=-1).mean()
    # One formatted print: same "<metric>  :  <value>" text as before on
    # Python 2, and valid syntax on Python 3 as well.
    print("%s  :  %s" % (score, mean_score))

accuracy  :  0.923049777375
precision  :  0.907967914101
recall  :  0.99048008005
f1  :  0.947424709268


### RF with 50-50 distribution

In [85]:
# Load the 50-50 corpus as a list of raw text lines (line endings kept).
with io.open('data/corpus_50_50.txt', mode='r', encoding='utf8') as f:
    corpus_50_50 = list(f)




In [86]:
# Parse every raw line with reading_ds and pack the field lists into a NumPy array.
# Presumably each line yields exactly two fields, giving an (n_lines, 2) array — TODO confirm.
arr_50 = np.array(reading_ds(corpus_50_50))

In [87]:
# Rebind `df` to the 50-50 corpus: 'cat' holds the label field, 'tweet' the text field.
df = pd.DataFrame(data=arr_50,columns=['cat','tweet'])
# Sanity check: number of non-null entries per column.
df.count()

cat      15306
tweet    15306
dtype: int64

In [88]:
# Re-fit the existing LabelBinarizer `lb` on the 50-50 labels and encode them;
# the recorded output shows a single indicator column, shape (n_rows, 1).
cat_binary = lb.fit_transform(df.cat) 
# Display the shape of the encoded labels.
cat_binary.shape

(15306, 1)

In [89]:
# Flatten the (n_rows, 1) label column to 1-D. ravel() avoids hard-coding the
# row count, so this cell keeps working if the corpus size changes.
ps = pd.Series(cat_binary.ravel())

In [90]:
# Report the mean 10-fold cross-validated score of the pipeline for each metric.
# Every metric triggers its own full cross-validation run, so the pipeline is
# re-fit once per fold per metric.
for score in ["accuracy", "precision", "recall", "f1"]:
    mean_score = cross_val_score(pipeline_rand_forest, df.tweet, ps,
                                 scoring=score, cv=10, n_jobs=-1).mean()
    # One formatted print: same "<metric>  :  <value>" text as before on
    # Python 2, and valid syntax on Python 3 as well.
    print("%s  :  %s" % (score, mean_score))

accuracy  :  0.913953907063
precision  :  0.874961377801
recall  :  0.966287479309
f1  :  0.918278566559
