I wanted to keep the lecture notebook and seperate out the wine stuff here since it upset me that we were
re-setting variable names each time we changed data sets. And also to avoid running the newsgroups cells
since that took a long time. 

In [1]:
# Import Statements
from sklearn.pipeline import Pipeline 
from sklearn.datasets import fetch_20newsgroups # demo data set 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd 
import spacy
nlp = spacy.load("en_core_web_lg")


In [2]:
# load the data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

train.head(1)

Unnamed: 0,id,description,ratingCategory
0,1321,"\nSometimes, when whisky is batched, a few lef...",1


# Brute Force Run on TfidVectorizer

In [3]:
vect = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
clf = RandomForestClassifier()
pipe = Pipeline([('vect', vect), ('clf', clf)])
target = 'ratingCategory'
features = 'description'
X_train = train[features]
y_train = train[target]

# model and vectorizer parameters
pipe_params = {
    'vect__max_df': (0.75, 1.0),
    'vect__min_df': (0.02, 0.05),
    'vect__max_features': (500,1000),
    'clf__n_estimators': (5,10),
    'clf__max_depth':(5,10,15,20)
}

In [4]:
# grid search
grid_search = GridSearchCV(pipe, pipe_params, cv=3, n_jobs=8, verbose=1)
grid_search.fit(X_train, y_train)
grid_search.best_score_

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   12.0s
[Parallel(n_jobs=8)]: Done 192 out of 192 | elapsed:   51.6s finished


0.7164179104477612

In [5]:
# random search, working noticeably faster
random_search = RandomizedSearchCV(pipe, pipe_params, cv=3, n_jobs=8, verbose=1)
random_search.fit(X_train, y_train)
random_search.best_score_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  30 out of  30 | elapsed:    8.4s finished


0.7149498409591387

# LSI SVD 

In [6]:
# instantiate SVD, removed algorithm arg since it defaults to whatever is most efficient
svd = TruncatedSVD(n_components=100, n_iter=3)

svd_params = {
    'lsi__svd__n_components': [10,100,250],
    'lsi__vect__max_df':[.9, .95, 1.0],
    'clf__n_estimators':[5,10,20]
}

# create two seperate pipelines
lsi_pipe = Pipeline([('vect', vect), ('svd', svd)])
svd_pipe = Pipeline([('lsi', lsi_pipe), ('clf', clf)])

# LSI SVD GRID SEARCH

In [7]:
svd_grid_search = GridSearchCV(svd_pipe, svd_params, cv=3, n_jobs=4, verbose=1)
svd_grid_search.fit(X_train, y_train)
svd_grid_search.best_score_

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  3.3min
[Parallel(n_jobs=4)]: Done  81 out of  81 | elapsed:  7.6min finished


0.7213114754098361

# LSI SVD RANDOM SEARCH

In [8]:
svd_random_search = RandomizedSearchCV(svd_pipe, svd_params, cv=3, n_jobs=4, verbose=1)
svd_random_search.fit(X_train, y_train)
svd_random_search.best_score_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:  2.0min finished


0.7176413016882799

# Spacy Word Embeddings

In [9]:
test = nlp("And we are never ever ever, getting back together")
taylor_swift_vector = test.vector
print(len(taylor_swift_vector))

300


In [10]:
def get_word_vectors(docs: iter):
    return [nlp(doc).vector for doc in docs]

In [11]:
# fit word vectors onto model 
from scipy.stats import uniform

X_train_word_vectors = get_word_vectors(train['description'])
rfc_params = {
            'n_estimators': [200, 700],
            'max_features': ['auto', 'sqrt', 'log2']
                     }
classifier = RandomForestClassifier()

In [12]:
spacy_random = RandomizedSearchCV(classifier, rfc_params, n_jobs=4, verbose=1, cv=5)    
spacy_random.fit(X_train_word_vectors, y_train)
spacy_random.best_score_

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:  2.5min finished


0.7359921702960607

In [49]:
spacy_random.predict(y_train)

ValueError: Expected 2D array, got 1D array instead:
array=[1. 0. 1. ... 1. 1. 1.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
spacy_random.predict(test['description'])

In [13]:
spacy_grid = GridSearchCV(classifier, rfc_params, n_jobs=4, verbose=1)
spacy_grid.fit(X_train_word_vectors, y_train)
spacy_grid.best_score_

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=4)]: Done  18 out of  18 | elapsed:  1.2min finished


0.7367262050403719

In [36]:
type(svd_random_search)

sklearn.model_selection._search.RandomizedSearchCV

In [41]:
type(spacy_grid)

sklearn.model_selection._search.GridSearchCV

# Function to create Submissions

In [26]:
test = pd.read_csv('./data/test.csv')

In [38]:
test.head(1)

Unnamed: 0,id,description
0,3461,\nStyle: Speyside single malt scotch Color: Wa...


In [50]:
test["description"].dtype

dtype('O')

In [28]:
def create_submission(search, subNumber):
    pred = search.predict(test['description'])
    submission = pd.DataFrame({'id': test['id'], 'ratingCategory':pred})
    submission['ratingCategory'] = submission['ratingCategory'].astype('int64')
    assert(submission.shape == (1022, 2))
    submission.to_csv(f'./data/submission{str(subNumber)}.csv', index=False)

In [29]:
create_submission(grid_search, 1)

In [30]:
create_submission(random_search, 2)

In [31]:
create_submission(svd_grid_search, 3)

In [39]:
create_submission(svd_random_search, 99)

In [40]:
create_submission(spacy_grid, 100)

ValueError: could not convert string to float: '\nStyle: Speyside single malt scotch Color: Walnut Aroma: Richly sherried and thick, with notes of nuts and toffee. Wood resins contribute spice and variety. Fruitcake at Christmas. Palate: Thick, chewy in texture, and quite ripe. Again the fruitcake. Very deep and mature with some underlying maltiness. Dry, spicy, oak notes fight off all that sherry and add balance and complexity. Long, soothing finish. \r\n'

In [None]:
create_submission(spacy_random, 6)