In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
import spacy
nlp = spacy.load("en_core_web_lg")

In [2]:
test_path = '/Users/nickburkhalter/Desktop/Lambda School/Unit 4/DS-Unit-4-Sprint-1-NLP/module3-document-classification/data/test.csv'
train_path = '/Users/nickburkhalter/Desktop/Lambda School/Unit 4/DS-Unit-4-Sprint-1-NLP/module3-document-classification/data/train.csv'

test = pd.read_csv(test_path)
train = pd.read_csv(train_path)

In [3]:
train = train.dropna()

In [4]:
print(train.shape)
train.head()

(2586, 3)


Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [5]:
# TFIDF + RF 
# --------------------ROUND 1-----------------------
rf = RandomForestClassifier(random_state=42)
vect = TfidfVectorizer()

pipeline = Pipeline([('vect', vect), ('rf', rf)])

In [6]:
pipeline.fit(train['description'], train['category'])



Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...mators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

In [7]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': ( 100, 500, 1000),
    'rf__n_estimators': (20, 100, 400),
}

grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=10)

In [8]:
grid_search.fit(train.description, train.category)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   34.6s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   42.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   50.1s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...mators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vect__max_df': (0.5, 0.75, 1.0), 'vect__min_df': (0.02, 0.05), 'vect__max_features': (100, 500, 1000), 'rf__n_estimators': (20, 100, 400)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [9]:
grid_search.best_score_

0.8901778808971385

In [10]:
grid_search.predict(train['description'])

array([2, 1, 2, ..., 3, 1, 2])

In [11]:
y_pred = grid_search.predict(test['description'])

In [71]:
y_submission = pd.read_csv('/Users/nickburkhalter/Desktop/Lambda School/Unit 4/DS-Unit-4-Sprint-1-NLP/module3-document-classification/data/sample_submission.csv')
assert y_submission.shape[0] == y_pred.shape[0]

y_submission.category = y_pred.astype(int)
y_submission.to_csv('pred1.csv', index=None)

In [26]:
# TFIDF + SVD + SGDC
#-------------------ROUND 2---------------------
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import SGDClassifier


vect = TfidfVectorizer()
svd = TruncatedSVD(algorithm='randomized',
                   random_state=42)
sgdc = SGDClassifier(random_state=42)

pipe2 = Pipeline([('vect', vect), 
                 ('svd', svd), 
                 ('sgdc', sgdc)])

param2 = {
    #'vect__max_df': (1.0,),
    #'vect__min_df': (.02,),
    'svd__n_components': (100, 300, 1000),
    'sgdc__max_iter': (300, 1000, 3000),
}

grid2 = GridSearchCV(pipe2, param2, cv=5, n_jobs=-1, verbose=1000)

In [27]:
grid2.fit(train.description, train.category)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=object).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2068,), dtype=int64).
Pickling array (shape=(518,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=object).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2068,), dtype=int64).
Pickling array (shape=(518,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=object).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'svd__n_components': (100, 300, 1000), 'sgdc__max_iter': (300, 1000, 3000)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1000)

In [28]:
grid2.best_score_

0.934261407579273

In [29]:
grid2.predict(train['description'])

array([2, 1, 2, ..., 3, 1, 2])

In [45]:
y_pred2 = grid2.predict(test['description'])

In [72]:
y_submission = pd.read_csv('/Users/nickburkhalter/Desktop/Lambda School/Unit 4/DS-Unit-4-Sprint-1-NLP/module3-document-classification/data/sample_submission.csv')
assert y_submission.shape[0] == y_pred2.shape[0]

y_submission.category = y_pred2.astype(int)
y_submission.to_csv('pred2.csv', index=None)

In [34]:
# TFIDF + SGDC
#-------------------ROUND 3---------------------
sgdc = SGDClassifier()
pipe3 = Pipeline([('vect', vect), 
                  ('sgdc', sgdc)])

param3 = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': ( 100, 500, 1000),
    'sgdc__max_iter': (300, 1000, 3000),
}

grid3 = GridSearchCV(pipe3, param3, cv=5, n_jobs=-1, verbose=100)

In [35]:
grid3.fit(train['description'], train['category'])

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=object).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2068,), dtype=int64).
Pickling array (shape=(518,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=object).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2068,), dtype=int64).
Pickling array (shape=(518,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=object).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickli



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vect__max_df': (0.5, 0.75, 1.0), 'vect__min_df': (0.02, 0.05), 'vect__max_features': (100, 500, 1000), 'sgdc__max_iter': (300, 1000, 3000)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=100)

In [36]:
grid3.best_score_

0.9075792730085074

In [37]:
y_pred3 = grid3.predict(test['description'])

In [73]:
y_submission = pd.read_csv('/Users/nickburkhalter/Desktop/Lambda School/Unit 4/DS-Unit-4-Sprint-1-NLP/module3-document-classification/data/sample_submission.csv')
assert y_submission.shape[0] == y_pred3.shape[0]

y_submission.category = y_pred3.astype(int)
y_submission.to_csv('pred3.csv', index=None)

In [39]:
# TFIDF + SVD + SGDC
# Remove stopwords
#-------------------ROUND 4---------------------
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import SGDClassifier


vect = TfidfVectorizer(stop_words='english')
svd = TruncatedSVD(algorithm='randomized',
                   random_state=42)
sgdc = SGDClassifier(random_state=42)

pipe4 = Pipeline([('vect', vect), 
                 ('svd', svd), 
                 ('sgdc', sgdc)])

param4 = {
    #'vect__max_df': (1.0,),
    #'vect__min_df': (.02,),
    'svd__n_components': (100, 300, 1000),
    'sgdc__max_iter': (300, 1000, 3000),
}

grid4 = GridSearchCV(pipe4, param4, cv=5, n_jobs=-1, verbose=1000)

In [40]:
grid4.fit(train['description'], train['category'])

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=object).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2068,), dtype=int64).
Pickling array (shape=(518,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=object).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2068,), dtype=int64).
Pickling array (shape=(518,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=object).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'svd__n_components': (100, 300, 1000), 'sgdc__max_iter': (300, 1000, 3000)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1000)

In [41]:
grid4.best_score_  # Promising!

0.9280742459396751

In [42]:
y_pred4 = grid4.predict(test['description'])

In [74]:
y_submission = pd.read_csv('/Users/nickburkhalter/Desktop/Lambda School/Unit 4/DS-Unit-4-Sprint-1-NLP/module3-document-classification/data/sample_submission.csv')
assert y_submission.shape[0] == y_pred4.shape[0]

y_submission.category = y_pred4.astype(int)
y_submission.to_csv('pred4.csv', index=None)

In [52]:
# TFIDF + SVD + SVC
#-------------------ROUND 5---------------------

# THIS ONE SUCKS - DON'T USE!!!
from sklearn.svm import SVC

In [57]:
vect = TfidfVectorizer(stop_words='english')
svd = TruncatedSVD(algorithm='randomized',
                   random_state=42)
svc = SVC(random_state=42)

pipe5 = Pipeline([('vect', vect), 
                 ('svd', svd), 
                 ('svc', svc)])

param5 = {
    'svd__n_components': (100, 300, 1000),
    'svc__degree': (5, 7),
    'svc__max_iter': (300, 1000, 3000),
}

grid5 = GridSearchCV(pipe5, param5, cv=5, n_jobs=-1, verbose=1000)

In [58]:
grid5.fit(train['description'], train['category'])

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=object).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2068,), dtype=int64).
Pickling array (shape=(518,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=object).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2068,), dtype=int64).
Pickling array (shape=(518,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=object).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Picklin



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'svd__n_components': (100, 300, 1000), 'svc__degree': (5, 7), 'svc__max_iter': (300, 1000, 3000)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1000)

In [59]:
grid5.best_score_  # CRAP!

0.7540603248259861

In [65]:
# TFIDF + SVD + SGD
# Increase # iterations on svd
#-------------------ROUND 6---------------------
vect = TfidfVectorizer()
svd = TruncatedSVD(algorithm='randomized',
                   n_iter=15,
                   random_state=42)
sgdc = SGDClassifier(random_state=42)

pipe6 = Pipeline([('vect', vect), 
                 ('svd', svd), 
                 ('sgdc', sgdc)])

param6 = {
    #'vect__max_df': (1.0,),
    #'vect__min_df': (.02,),
    'svd__n_components': (100, 300, 1000),
    'sgdc__max_iter': (300, 1000, 3000),
}

grid6 = GridSearchCV(pipe6, param6, cv=5, n_jobs=-1, verbose=1000)

In [66]:
grid6.fit(train['description'], train['category'])

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=object).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2068,), dtype=int64).
Pickling array (shape=(518,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=object).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2068,), dtype=int64).
Pickling array (shape=(518,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=object).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling array (shape=(2586,), dtype=int64).
Pickling



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'svd__n_components': (100, 300, 1000), 'sgdc__max_iter': (300, 1000, 3000)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1000)

In [67]:
grid6.best_score_  # HELL YEAH!

0.9358081979891725

In [68]:
y_pred6 = grid6.predict(test['description'])

In [75]:
y_submission = pd.read_csv('/Users/nickburkhalter/Desktop/Lambda School/Unit 4/DS-Unit-4-Sprint-1-NLP/module3-document-classification/data/sample_submission.csv')
assert y_submission.shape[0] == y_pred6.shape[0]

y_submission.category = y_pred6.astype(int)
y_submission.to_csv('pred6.csv', index=None)

In [76]:
# MAJORITY CLASSIFIER

# Filenames of your submissions you want to ensemble
files = ['pred1.csv', 'pred2.csv', 'pred3.csv', 'pred4.csv', 'pred6.csv']

target = 'category'
submissions = (pd.read_csv(file)[[target]] for file in files)
ensemble = pd.concat(submissions, axis='columns')
majority_vote = ensemble.mode(axis='columns')[0].astype(int)

sample_submission = pd.read_csv('/Users/nickburkhalter/Desktop/Lambda School/Unit 4/DS-Unit-4-Sprint-1-NLP/module3-document-classification/data/sample_submission.csv')
submission = sample_submission.copy()
submission[target] = majority_vote
submission.to_csv('my-ultimate-ensemble-submission.csv', index=False)