In [104]:
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.model_selection import GridSearchCV
import spacy
nlp = spacy.load("en_core_web_lg")

In [2]:
vect = TfidfVectorizer(stop_words='english')
sgdc = SGDClassifier()

pipe = Pipeline([('vect', vect), ('clf', sgdc)])

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [30]:
test.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol
0,955,Fred Minnick,"Think carnival aromas—the good ones, anyway—me...",36.0,90,50.0
1,3532,Lew Bryson,"A blend of three bourbons, between 6 and 12 ye...",90.0,82,49.3
2,1390,Davin de Kergommeaux,"The nose is focused on cereal, hints of fresh ...",48.0,89,45.0
3,1024,Gavin Smith,Swiss-based Chapter 7 released this 19 year ol...,180.0,90,55.8
4,1902,Gavin Smith,Valkyrie replaces the current Dark Origins exp...,71.0,87,45.9


In [18]:
submission.head()

Unnamed: 0,id,category
0,955,1
1,3532,3
2,1390,2
3,1024,4
4,1902,2


In [4]:
train = train.dropna()

In [5]:
train_target = train['category']
train_description = train['description']

In [6]:
train_target.value_counts()

1.0    1538
2.0     445
3.0     299
4.0     194
Name: category, dtype: int64

In [7]:
pipe.fit(train_description, train_target)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_patte...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
           

In [8]:
predictions = pipe.predict(test['description']).astype(int)

In [27]:
type(predictions)

numpy.ndarray

In [34]:
new_submission = pd.DataFrame()
new_submission['id'] = submission['id']
new_submission['category'] = predictions
new_submission.to_csv('first-submission.csv', index=False)

In [31]:
new_submission = submission.copy()
new_submission['category'] = predictions
new_submission.to_csv('first-submission.csv', index=False)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [9]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'clf__max_iter':(20, 10, 100)
}
grid_search = GridSearchCV(pipe,parameters, cv=5, n_jobs=-1, verbose=1)

grid_search.fit(train_description, train_target)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    3.8s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [10]:
grid_search.best_params_

{'clf__max_iter': 20, 'vect__max_df': 0.5}

In [11]:
predictions = grid_search.predict(test['description']).astype(int)

In [40]:
new_submission = pd.DataFrame()
new_submission['id'] = submission['id']
new_submission['category'] = predictions
new_submission.to_csv('second-submission.csv', index=False)

In [12]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100, 
                   algorithm='randomized',
                   n_iter=10)

params = {
    'lsi__svd__n_components': [10, 100, 150, 200, 250, 300, 400]
}

lsi = Pipeline([('vect', vect), ('svd', svd)])

pipe = Pipeline([('lsi', lsi), ('clf', sgdc)])

pipe.fit(train_description, train_target)

grid_search = GridSearchCV(pipe,params, cv=10, n_jobs=-1, verbose=1)

grid_search.fit(train_description, train_target)

Fitting 10 folds for each of 7 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:   49.2s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('lsi',
                                        Pipeline(memory=None,
                                                 steps=[('vect',
                                                         TfidfVectorizer(analyzer='word',
                                                                         binary=False,
                                                                         decode_error='strict',
                                                                         dtype=<class 'numpy.float64'>,
                                                                         encoding='utf-8',
                                                                         input='content',
                                                                         lowercase=True,
                                                                         max_df=1

In [49]:
grid_search.best_params_

{'lsi__svd__n_components': 400}

In [None]:
grid_search.

In [50]:
predictions = grid_search.predict(test['description']).astype(int)

new_submission = pd.DataFrame()
new_submission['id'] = submission['id']
new_submission['category'] = predictions
new_submission.to_csv('third-submission.csv', index=False)

In [13]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [82]:
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

In [None]:
train_target

In [22]:
import numpy as np

In [24]:
X = get_word_vectors(train_description)

In [None]:
sgdc = SGDClassifier()

In [26]:
sgdc.fit(X, np.array(train_target))

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [28]:
X_test = get_word_vectors(test['description'])
y_pred = sgdc.predict(X_test)

In [30]:
new_submission = pd.DataFrame()
new_submission['id'] = submission['id']
new_submission['category'] = y_pred.astype(int)
new_submission.to_csv('fourth-submission.csv', index=False)

In [47]:
def get_word_vectors(doc):
    return nlp(doc).vector

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

parameters = {
    'clf__max_iter':(20, 10, 100)
}

lsi = Pipeline([('vect', CountVectorizer(tokenizer=get_word_vectors))])

pipe = Pipeline([('lsi', lsi), ('clf', sgdc)])

pipe.fit(train_description, train_target)

grid_search = GridSearchCV(pipe,parameters, cv=10, n_jobs=-1, verbose=1)

In [None]:
grid_search.fit(train_description, train_target)

In [53]:
from sklearn.base import TransformerMixin
spacy.load('en')
from spacy.lang.en import English
parser = English()

In [86]:
class CleanText(TransformerMixin):
    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def get_params(self, deep=True):
        return {}
    
def cleanText(text):
        text.strip().replace("\n", " ").replace("\r", " ")
        text = text.lower()
        return text

def tokenizeText(sample):
    doc = nlp(sample)

    return tokens
    
# def vectorizeText(docs):
    
        

In [87]:
parameters = {
    'clf__max_iter':(20, 10, 100)
}

lsi = Pipeline([('vect', CountVectorizer(tokenizer=tokenizeText))])

pipe = Pipeline([('lsi', lsi), ('clf', sgdc)])

pipe.fit(train_description, train_target)

grid_search = GridSearchCV(pipe,parameters, cv=10, n_jobs=-1, verbose=1)

In [None]:
grid_search.fit(train_description, train_target)

In [None]:
y_pred = grid_search.predict(test['description']).astype(int)

In [92]:
def tokenize(text):
    doc = nlp(text)
    return [token.text.lower().strip() for token in doc if (token.is_stop != True) and (token.is_punct != True)]

In [93]:
vectorizer = CountVectorizer(tokenizer=tokenize)

In [100]:
from joblib import parallel_backend

parameters = {
    'clf__max_iter':(20, 10, 100)
}

lsi = Pipeline([('vect', vectorizer)])

pipe = Pipeline([('lsi', lsi), ('clf', sgdc)])

pipe.fit(train_description, train_target)
with parallel_backend('multiprocessing'):
    grid_search = GridSearchCV(pipe, parameters, n_jobs=-1, cv=10, verbose=1)

In [102]:
with parallel_backend('multiprocessing'):
    grid_search.fit(train_description, train_target)

Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  6.8min finished


In [98]:
y_pred = grid_search.predict(test['description']).astype(int)

In [99]:
new_submission = pd.DataFrame()
new_submission['id'] = submission['id']
new_submission['category'] = y_pred
new_submission.to_csv('fifth-submission.csv', index=False)

In [105]:
import xgboost as xgb

In [None]:
dtrain = xgb.DMatrix()
dtest = xgb.DMatrix()

In [106]:
clf = xgb.XGBClassifier(random_state=42, seed=2)

In [107]:
pipe = Pipeline([('vect', vectorizer), ('clf', clf)])

In [108]:
pipe.named_steps['clf'].get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 42,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 2,
 'silent': None,
 'subsample': 1,
 'verbosity': 1}

In [None]:
param_grid = {
       'clf__n_estimators': [250, 300, 350, 400],
       'clf__colsample_bytree': [0.4, 0.6, 0.7],
       'clf__subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

pipe.fit(train_description, train_target)
with parallel_backend('multiprocessing'):
    grid_search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=10, verbose=1)

In [None]:
with parallel_backend('multiprocessing'):
    grid_search.fit(train_description, train_target)

In [None]:
grid_search.best_params_

In [114]:
y_pred = grid_search.predict(test['description']).astype(int)

In [115]:
new_submission = pd.DataFrame()
new_submission['id'] = submission['id']
new_submission['category'] = y_pred
new_submission.to_csv('sixth-submission.csv', index=False)