In [1]:
%cd ..

/home/catarinapereira/python_docker


In [2]:
from version import bumpversion

In [3]:
import numpy as np
import pandas as pd
import pickle
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

%matplotlib inline

# Load the semicolon-separated training and test samples into DataFrames.
# NOTE(review): absolute, user-specific paths — consider a configurable data directory.
data_train = pd.read_csv('/home/cunha/covid19-sample/training.csv', sep=';')
data_test = pd.read_csv('/home/cunha/covid19-sample/test.csv', sep=';')

In [4]:
# Tweet texts used as classifier input, one entry per training row.
# Column access replaces the former row-by-row iterrows() loop: it produces the
# same list without building a Series object for every row.
data_train_tweets = data_train['text'].tolist()

# Binary target per row: 1 when country_code equals 'US', otherwise 0
# (rows with a missing country_code compare unequal and therefore map to 0).
data_train_targets = (data_train['country_code'] == 'US').astype(int).tolist()
    

### Extracting features from the tweet text

##### Tokenizing text with scikit-learn

In [5]:
# Learn the vocabulary from the training tweets (English stop words removed)
# and build the document-term count matrix in the same pass.
count_vect = CountVectorizer(stop_words="english")
X_train_counts = count_vect.fit_transform(data_train_tweets)

# Vocabulary lookup for the term 'algorithm'; .get() yields None when the term
# is not in the learned vocabulary.
count_vect_vocabulary = count_vect.vocabulary_.get('algorithm')

# Shape of the count matrix: (number of tweets, number of vocabulary terms).
X_train_counts_shape = X_train_counts.shape
print(X_train_counts_shape)

(9999, 38452)


##### From occurrences to frequencies

In [6]:
# Term-frequency scaling only (IDF weighting disabled): fit the transformer on
# the raw counts and transform them in one call.
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)

# The transform rescales values but keeps the matrix dimensions.
X_train_shape = X_train_tf.shape
print(X_train_shape)

(9999, 38452)


In [7]:
# Full TF-IDF weighting: learn the IDF vector from the training counts, then
# apply it to the same matrix to obtain the classifier's training features.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

### Training a classifier

In [8]:
# Multinomial naive Bayes trained on the TF-IDF features against the US / non-US targets.
clf = MultinomialNB()
clf.fit(X_train_tfidf, data_train_targets)

In [9]:
# Re-vectorise the training tweets with the already-fitted transformers and
# predict their labels.
# NOTE(review): this scores the model on the data it was trained on, so
# `accuracy` is a training accuracy; data_test is loaded above but never used.
X_new_counts = count_vect.transform(data_train_tweets)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Fraction of predictions that match the known targets.
accuracy = (predicted == np.asarray(data_train_targets)).mean()

### Building a pipeline

In [10]:
# Chain the three stages used above (token counts -> TF-IDF -> naive Bayes)
# into a single estimator that accepts raw tweet strings.
tweet_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer(stop_words="english")),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [11]:
# Fit every pipeline stage on the raw training tweets and their targets.
tweet_clf.fit(X=data_train_tweets, y=data_train_targets)

### Pickle Classifier

In [12]:
from datetime import datetime, timedelta

# Gather all metadata BEFORE opening the output file. Mode 'wb' truncates
# model.pickle as soon as it is opened, so a failure while bumping the version
# or reading the config would otherwise destroy the previously saved model and
# leave an empty file behind.
bumpversion.bump_version()

# Timestamp stored with the model: local clock shifted back three hours.
# NOTE(review): presumably converts a UTC host clock to UTC-3 — confirm.
date = (datetime.now() - timedelta(hours=3)).strftime("%d-%m-%Y %H:%M:%S")

with open('/home/catarinapereira/python_docker/version/version.bumpversion.cfg', 'r') as version_config:
    # The version is taken as the fourth whitespace-separated token of the file.
    # NOTE(review): assumes a layout like "[bumpversion] current_version = X.Y.Z" — confirm.
    version = ((version_config.read()).split())[3]

# Only now open (and truncate) the target, then serialise the fitted pipeline
# together with its metadata.
with open('/home/catarinapereira/python_docker/model.pickle', 'wb') as handle:
    pickle.dump({"model": tweet_clf, "model_date": date, "version": version}, handle, protocol=pickle.HIGHEST_PROTOCOL)
