In [1]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
import numpy as np

In [2]:
# Load the raw tweets and their labels, one item per line, for the English
# (us.*) and Spanish (es.*) datasets.
#
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default. NOTE(review): assumes the data files are UTF-8
# encoded -- confirm against the dataset.
#
# readlines() is kept on purpose: it splits on newline characters only, so
# line i of a .text file stays paired with line i of the .labels file.
with open('../data/us.text', encoding='utf-8') as f:
    eng_text = f.readlines()

with open('../data/us.labels', encoding='utf-8') as f:
    eng_labels = f.readlines()

with open('../data/es.text', encoding='utf-8') as f:
    esp_text = f.readlines()

with open('../data/es.labels', encoding='utf-8') as f:
    esp_labels = f.readlines()

In [3]:
# Sanity check: each dataset must contain exactly one label per tweet.
# Both language pairs are compared at once; a mismatch in either one
# raises AssertionError.
assert (len(eng_text), len(esp_text)) == (len(eng_labels), len(esp_labels))

In [4]:
# Remove leading/trailing whitespace (including the newline kept by
# readlines) from every tweet, and turn every label line into an int.
eng_text = list(map(str.strip, eng_text))
eng_labels = list(map(int, map(str.strip, eng_labels)))
esp_text = list(map(str.strip, esp_text))
esp_labels = list(map(int, map(str.strip, esp_labels)))

In [5]:
# Report the dataset sizes: English tweet count, then Spanish tweet count,
# one per line.
print(len(eng_text), len(esp_text), sep='\n')

490265
98683


In [6]:
# Shuffle every list in place, re-seeding the global NumPy RNG with the same
# value right before each shuffle. Because each shuffle starts from the same
# RNG state, a text list and its equally long label list are reordered the
# same way, which keeps tweets aligned with their labels.
for sequence in (eng_text, eng_labels, esp_text, esp_labels):
    np.random.seed(4132)
    np.random.shuffle(sequence)


In [7]:
# Two-step model: TF-IDF vectorization of the raw tweet text followed by a
# multinomial Naive Bayes classifier.
pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# 5-fold cross-validation on the Spanish dataset; the returned array holds
# one score per fold and is displayed as the cell output.
cross_val_score(estimator=pipe, X=esp_text, y=esp_labels, cv=5)

array([ 0.21672407,  0.21726356,  0.21785389,  0.21898439,  0.21963706])