In [3]:
import pandas as pd
import numpy as np
from pprint import pprint
from time import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from joblib import dump

In [5]:
# Read the interim train and validation splits from disk, in that order.
# allow_pickle=True lets NumPy unpickle object arrays; unpickling can execute
# arbitrary code, so only point this at files you trust.
train, val = (
    np.load(path, allow_pickle=True)
    for path in (
        '../data/interim/train_data.npy',
        '../data/interim/val_data.npy',
    )
)

#### Import or retype label conversion function

In [6]:
def convert_cat_to_number(cat):
    """Map a sentiment category to its integer class label.

    Args:
        cat: Value to encode; compared by equality against 'negative'
            and 'neutral'.

    Returns:
        0 for 'negative', 1 for 'neutral', and 2 for any other value.
    """
    # The position in the tuple doubles as the label code.
    for code, known in enumerate(('negative', 'neutral')):
        if cat == known:
            return code
    # Catch-all: anything that is not negative/neutral is encoded as 2.
    return 2

#### Pre-processing
- Concatenate train and dev (validation) sets for use with Cross Validation
- Convert all airline sentiment to 0,1,2 labels
- Grab text data from column index 10

In [15]:
# Merge the train and validation rows into a single array so that
# cross-validation can draw its own folds from the full set.
all_data = np.concatenate((train, val))
# Column index 10 supplies the raw text that is fed to the vectorizer.
all_text = all_data[:, 10]
# Column index 1 holds the sentiment category; encode each entry as 0/1/2.
all_labels = list(map(convert_cat_to_number, all_data[:, 1]))

# ***Note: transfer this notebook to Colab to actually run the model

#### Initialize ML Pipeline and define parameters for use in Randomized Search CV

In [18]:
# Two-stage text classifier: TF-IDF features feeding a random forest.
# The forest is seeded so that candidate scores are repeatable across runs;
# seeding only the search (which candidates get sampled) would still leave
# every individual fit random.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=42))
])

# Candidate values sampled by the randomized search. Keys follow the
# `<step name>__<parameter>` convention to address a pipeline step's parameter.
parameters = {
    'tfidf__max_df': (0.9, 0.95, 1.0),
    'tfidf__min_df': (0.1, 0.05, 0.0),
    'tfidf__stop_words': ('english', None),
    'tfidf__max_features': (None, 5000, 10000, 25000, 50000, 75000),
    'tfidf__ngram_range': ((1,1), (1,2)),    #unigrams or bigrams
    'tfidf__norm': ('l1', 'l2'),
    'clf__n_estimators': (100, 200, 1000, 5000),
    'clf__max_depth': (None, 5,10,15),
    'clf__min_samples_split': (2,4,6)
}

In [20]:
# Randomized hyper-parameter search over the pipeline: 100 sampled candidates,
# each scored with 5-fold cross-validation on weighted F1, using all cores.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=parameters,
    n_iter=100,
    scoring='f1_weighted',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the search, then report elapsed time, the best cross-validated score
# and the winning value for every parameter that was searched.
print('Performing random search...')
print('pipeline:', [name for name, _ in pipeline.steps])
print('Parameters: \n')
pprint(parameters)
t0 = time()
random_search.fit(all_text, all_labels)
# The unit suffix must sit outside the braces: '.3fs' is not a valid float
# format spec and raised ValueError only after the whole search had finished.
print(f'Completed in {time() - t0:.3f}s\n')
print(f'Best Score: {random_search.best_score_:.3f}')
print('Best Parameter set:')
best_parameters = random_search.best_estimator_.get_params()
# Only show the parameters that were part of the search space.
for param_name in sorted(parameters.keys()):
    print(f'\t{param_name}: {best_parameters[param_name]}')
