In [1]:
!pip install scikit-learn



In [79]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [30]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
train = train.dropna()

In [31]:
print(train.shape)
train.head()

(2476, 7)


Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
5,9,Fred Minnick,"A caramel-laden fruit bouquet, followed by une...",150.0,96,54.49,2.0


In [32]:
print(test.shape)
test.head()

(288, 6)


Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol
0,955,Fred Minnick,"Think carnival aromas—the good ones, anyway—me...",36.0,90,50.0
1,3532,Lew Bryson,"A blend of three bourbons, between 6 and 12 ye...",90.0,82,49.3
2,1390,Davin de Kergommeaux,"The nose is focused on cereal, hints of fresh ...",48.0,89,45.0
3,1024,Gavin Smith,Swiss-based Chapter 7 released this 19 year ol...,180.0,90,55.8
4,1902,Gavin Smith,Valkyrie replaces the current Dark Origins exp...,71.0,87,45.9


# Noun Chunks

In [68]:
def pipeline(train,test):
    
    # features and target
    features = train.columns.drop(['id','category'])
    target = 'category'
    
    X_train = train[features]
    X_test = test[features]
    y_train = train[target]
    
    # ordinal encode author
    encoder = OrdinalEncoder()
    X_train['author'] = encoder.fit_transform(X_train['author'].values.reshape(-1,1))
    X_test['author'] = encoder.transform(X_test['author'].values.reshape(-1,1))
    
    # tokenization
    nlp = spacy.load('en_core_web_md')

    def tokenize(doc):
        d = nlp(doc)
        tokens = []
        
        # you can do things other than noun_chunks
        for chunk in d.noun_chunks:
            tokens.append(chunk.lemma_)
        
        return tokens

    # fit count vectorizer
    vect = CountVectorizer(analyzer=tokenize, max_df=8, min_df=3)
    vect.fit(X_train['description'])
    
    # tokenize description
    train_desc = X_train['description']
    train_desc = vect.transform(train_desc)
    test_desc = X_test['description']
    test_desc = vect.transform(test_desc)

    # create matrix
    train_matrix = train_desc.todense()
    test_matrix = test_desc.todense()
    
    # create df from matrix
    train_matrix_df = pd.DataFrame(train_matrix, columns=vect.get_feature_names())
    test_matrix_df = pd.DataFrame(test_matrix, columns=vect.get_feature_names())
    
    #get lengths of train and test
    train_len = len(X_train)
    test_len = len(X_test)

    # add id for merge
    X_train['id'] = list(range(train_len))
    train_matrix_df['id'] = list(range(train_len))
    X_test['id'] = list(range(test_len))
    test_matrix_df['id'] = list(range(test_len))

    # merge train with desc matrix
    X_train = X_train.merge(train_matrix_df, on='id', how='inner')
    X_test = X_test.merge(test_matrix_df, on='id', how='inner')
    
    # drop unnecessary columns
    X_train = X_train.drop(columns=['description', 'id'])
    X_test = X_test.drop(columns=['description', 'id'])
    
    # clean null values
    X_test['price'] = X_test['price'].fillna(method='ffill')
    X_test['pert_alcohol'] = X_test['pert_alcohol'].fillna(method='ffill')
    
    assert len(X_train) == len(train)
    assert len(X_test) == len(test)
    
    return X_train, y_train, X_test
    
X_train, y_train, X_test = pipeline(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
print(X_train.shape)
X_train.head()

In [69]:
print(X_test.shape)
X_train.head()

(288, 1492)


Unnamed: 0,author,price,ratingValue,pert_alcohol,$ 60 cad,( 375 ml,( 500 bottle,"( 6,000 bottle",( a binny ’s beverage depot exclusive,( a u.s. exclusive,...,worm tub,worn leather,yellow plum,young expression,young whisky,zesty spice,£,£ 55 ( 500 ml,’s offering,“
0,8.0,85.0,97,51.5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,13500.0,97,42.9,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8.0,150.0,97,50.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8.0,4500.0,97,40.5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.0,150.0,96,54.49,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
# fit random forest classifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

# make predictions
y_preds = rfc.predict(X_test)
y_preds = [int(i) for i in y_preds]
assert len(y_preds) == len(test)

# create submission csv
sample_submission = pd.read_csv('./data/sample_submission.csv')
sample_submission['category'] = y_preds
sample_submission.to_csv('team_whiskey_submission.csv', index=False)



In [74]:
# fit logistic regression model
lg = LogisticRegression(C=10, solver='lbfgs')
lg.fit(X_train, y_train)

# make predictions
y_preds = lg.predict(X_test)
y_preds = [int(i) for i in y_preds]
assert len(y_preds) == len(test)

# create submission csv
sample_submission = pd.read_csv('./data/sample_submission.csv')
sample_submission['category'] = y_preds
sample_submission.to_csv('team_whiskey_submission.csv', index=False)



# Other Tokenization methods

In [75]:
# functions for tokenizing

punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS
parser = English()

def spacy_tokenizer(sentence):
    tokens = parser(sentence)
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
    tokens = [word for word in tokens if word not in stop_words and word not in punctuations]
    return tokens

def clean_text(text):
    return text.strip().lower()

bow_vector = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1,1))
tfidf_vector = CountVectorizer(tokenizer=spacy_tokenizer)

class predictors(TransformerMixin):
    def transform(self,X,**transform_params):
        return [clean_text(text) for text in X]
    def fit(self,X,y=None,**fit_params):
        return self
    def get_params(self,deep=True):
        return {}

In [83]:
def pipeline(train,test):
    
    # features and target
    features = train.columns.drop(['id','category'])
    target = 'category'
    
    X_train = train[features]
    X_test = test[features]
    y_train = train[target]
    
    # ordinal encode author
    encoder = OrdinalEncoder()
    X_train['author'] = encoder.fit_transform(X_train['author'].values.reshape(-1,1))
    X_test['author'] = encoder.transform(X_test['author'].values.reshape(-1,1))
    
    # instantiate tokenization
    cleaner = predictors()
    bow_vector = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1,1))
    tfidf_vector = CountVectorizer(tokenizer=spacy_tokenizer)
    
    # tokenize description
    train_desc = X_train['description']
    train_desc = cleaner.fit_transform(train_desc)
    train_desc = bow_vector.fit_transform(train_desc)
#     train_desc = tfidf_vector.fit_transform(train_desc)
    test_desc = X_test['description']
    test_desc = cleaner.transform(test_desc) # only transform on test, no fit
    test_desc = bow_vector.transform(test_desc) # only transform on test, no fit
#     test_desc = tfidf_vector.transform(test_desc) # only transform on test, no fit
    
    # create matrix
    train_matrix = train_desc.todense()
    test_matrix = test_desc.todense()
    
    # create df from matrix
    train_matrix_df = pd.DataFrame(train_matrix, columns=bow_vector.get_feature_names())
    test_matrix_df = pd.DataFrame(test_matrix, columns=bow_vector.get_feature_names())
    
#     # keep only important features
#     feature_names = bow_vector.get_feature_names()[593:]
#     feature_names = feature_names[::-1]
#     feature_names = feature_names[27:]
#     train_matrix_df = train_matrix_df[feature_names]
#     test_matrix_df = test_matrix_df[feature_names]
    
    #get lengths of train and test
    train_len = len(X_train)
    test_len = len(X_test)

    # add id for merge
    X_train['id'] = list(range(train_len))
    train_matrix_df['id'] = list(range(train_len))
    X_test['id'] = list(range(test_len))
    test_matrix_df['id'] = list(range(test_len))

    # merge train with desc matrix
    X_train = X_train.merge(train_matrix_df, on='id', how='inner')
    X_test = X_test.merge(test_matrix_df, on='id', how='inner')
    
    # drop unnecessary columns
    X_train = X_train.drop(columns=['description', 'id'])
    X_test = X_test.drop(columns=['description', 'id'])
    
    # clean null values
    X_test['price_x'] = X_test['price_x'].fillna(method='ffill')
    X_test['pert_alcohol'] = X_test['pert_alcohol'].fillna(method='ffill')
    
    assert len(X_train) == len(train)
    assert len(X_test) == len(test)
    
    return X_train, y_train, X_test
    
X_train, y_train, X_test = pipeline(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [77]:
print(X_train.shape)
X_train.head()

(2476, 7217)


Unnamed: 0,author,price_x,ratingValue,pert_alcohol,'s,--,-and,-at,-especially,-with,...,’98,’fiddich,’rothes,’s,“,”,•,…,€,﻿1
0,8.0,85.0,97,51.5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,13500.0,97,42.9,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,8.0,150.0,97,50.0,0,1,0,0,0,0,...,0,0,0,3,0,0,0,0,0,0
3,8.0,4500.0,97,40.5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.0,150.0,96,54.49,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
print(X_test.shape)
X_test.head()

(288, 7217)


Unnamed: 0,author,price_x,ratingValue,pert_alcohol,'s,--,-and,-at,-especially,-with,...,’98,’fiddich,’rothes,’s,“,”,•,…,€,﻿1
0,4.0,36.0,90,50.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10.0,90.0,82,49.3,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
2,2.0,48.0,89,45.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5.0,180.0,90,55.8,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5.0,71.0,87,45.9,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [85]:
# find optimal parameters
classifier = LogisticRegression()
parameters = {
            'penalty':['l2'],
            'C':[1,10,100],
            'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag'],
        }
GS = GridSearchCV(classifier, parameters,cv=5,verbose=10)
GS.fit(X_train_new,y_train_new)
print (GS.best_params_)
print (GS.best_score_)

# C=10, penalty='l2', solver='lbfgs' # most optimal

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] C=1, penalty=l2, solver=lbfgs ...................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.5s remaining:    0.0s


[CV]  C=1, penalty=l2, solver=lbfgs, score=0.9193548387096774, total=   6.4s
[CV] C=1, penalty=l2, solver=lbfgs ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   12.0s remaining:    0.0s


[CV]  C=1, penalty=l2, solver=lbfgs, score=0.9274193548387096, total=   5.4s
[CV] C=1, penalty=l2, solver=lbfgs ...................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   18.5s remaining:    0.0s


[CV]  C=1, penalty=l2, solver=lbfgs, score=0.9576612903225806, total=   6.4s
[CV] C=1, penalty=l2, solver=lbfgs ...................................




[CV]  C=1, penalty=l2, solver=lbfgs, score=0.9232323232323232, total=   9.4s
[CV] C=1, penalty=l2, solver=lbfgs ...................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   28.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   35.0s remaining:    0.0s


[CV]  C=1, penalty=l2, solver=lbfgs, score=0.8924949290060852, total=   6.9s
[CV] C=1, penalty=l2, solver=newton-cg ...............................




[CV]  C=1, penalty=l2, solver=newton-cg, score=0.9213709677419355, total=  29.8s
[CV] C=1, penalty=l2, solver=newton-cg ...............................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.1min remaining:    0.0s


[CV]  C=1, penalty=l2, solver=newton-cg, score=0.9334677419354839, total=  31.3s
[CV] C=1, penalty=l2, solver=newton-cg ...............................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.6min remaining:    0.0s


[CV]  C=1, penalty=l2, solver=newton-cg, score=0.9576612903225806, total=  34.0s
[CV] C=1, penalty=l2, solver=newton-cg ...............................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  2.2min remaining:    0.0s


[CV]  C=1, penalty=l2, solver=newton-cg, score=0.9232323232323232, total=  28.6s
[CV] C=1, penalty=l2, solver=newton-cg ...............................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.7min remaining:    0.0s


[CV]  C=1, penalty=l2, solver=newton-cg, score=0.845841784989858, total=  34.9s
[CV] C=1, penalty=l2, solver=liblinear ...............................




[CV]  C=1, penalty=l2, solver=liblinear, score=0.9294354838709677, total=   0.6s
[CV] C=1, penalty=l2, solver=liblinear ...............................




[CV]  C=1, penalty=l2, solver=liblinear, score=0.9254032258064516, total=   0.5s
[CV] C=1, penalty=l2, solver=liblinear ...............................




[CV]  C=1, penalty=l2, solver=liblinear, score=0.9516129032258065, total=   0.5s
[CV] C=1, penalty=l2, solver=liblinear ...............................




[CV]  C=1, penalty=l2, solver=liblinear, score=0.9212121212121213, total=   0.6s
[CV] C=1, penalty=l2, solver=liblinear ...............................




[CV]  C=1, penalty=l2, solver=liblinear, score=0.9066937119675457, total=   0.6s
[CV] C=1, penalty=l2, solver=sag .....................................




[CV]  C=1, penalty=l2, solver=sag, score=0.6209677419354839, total=  59.2s
[CV] C=1, penalty=l2, solver=sag .....................................




[CV]  C=1, penalty=l2, solver=sag, score=0.6209677419354839, total=  55.6s
[CV] C=1, penalty=l2, solver=sag .....................................




[CV]  C=1, penalty=l2, solver=sag, score=0.6209677419354839, total=  55.3s
[CV] C=1, penalty=l2, solver=sag .....................................




[CV]  C=1, penalty=l2, solver=sag, score=0.6202020202020202, total=  52.2s
[CV] C=1, penalty=l2, solver=sag .....................................




[CV]  C=1, penalty=l2, solver=sag, score=0.6227180527383367, total=  55.4s
[CV] C=10, penalty=l2, solver=lbfgs ..................................




[CV]  C=10, penalty=l2, solver=lbfgs, score=0.9334677419354839, total=   5.6s
[CV] C=10, penalty=l2, solver=lbfgs ..................................




[CV]  C=10, penalty=l2, solver=lbfgs, score=0.9354838709677419, total=   5.7s
[CV] C=10, penalty=l2, solver=lbfgs ..................................




[CV]  C=10, penalty=l2, solver=lbfgs, score=0.9596774193548387, total=   5.8s
[CV] C=10, penalty=l2, solver=lbfgs ..................................




[CV]  C=10, penalty=l2, solver=lbfgs, score=0.9191919191919192, total=   6.0s
[CV] C=10, penalty=l2, solver=lbfgs ..................................




[CV]  C=10, penalty=l2, solver=lbfgs, score=0.8904665314401623, total=   6.2s
[CV] C=10, penalty=l2, solver=newton-cg ..............................




[CV]  C=10, penalty=l2, solver=newton-cg, score=0.9274193548387096, total=  27.1s
[CV] C=10, penalty=l2, solver=newton-cg ..............................




[CV]  C=10, penalty=l2, solver=newton-cg, score=0.9334677419354839, total=  26.6s
[CV] C=10, penalty=l2, solver=newton-cg ..............................




[CV]  C=10, penalty=l2, solver=newton-cg, score=0.9616935483870968, total=  30.8s
[CV] C=10, penalty=l2, solver=newton-cg ..............................




[CV]  C=10, penalty=l2, solver=newton-cg, score=0.9232323232323232, total=  29.2s
[CV] C=10, penalty=l2, solver=newton-cg ..............................




[CV]  C=10, penalty=l2, solver=newton-cg, score=0.8498985801217038, total=  37.4s
[CV] C=10, penalty=l2, solver=liblinear ..............................




[CV]  C=10, penalty=l2, solver=liblinear, score=0.9274193548387096, total=   0.6s
[CV] C=10, penalty=l2, solver=liblinear ..............................




[CV]  C=10, penalty=l2, solver=liblinear, score=0.9274193548387096, total=   0.4s
[CV] C=10, penalty=l2, solver=liblinear ..............................




[CV]  C=10, penalty=l2, solver=liblinear, score=0.9556451612903226, total=   0.5s
[CV] C=10, penalty=l2, solver=liblinear ..............................




[CV]  C=10, penalty=l2, solver=liblinear, score=0.9212121212121213, total=   0.6s
[CV] C=10, penalty=l2, solver=liblinear ..............................




[CV]  C=10, penalty=l2, solver=liblinear, score=0.9046653144016227, total=   0.6s
[CV] C=10, penalty=l2, solver=sag ....................................




[CV]  C=10, penalty=l2, solver=sag, score=0.6209677419354839, total=  58.8s
[CV] C=10, penalty=l2, solver=sag ....................................




[CV]  C=10, penalty=l2, solver=sag, score=0.6209677419354839, total= 1.2min
[CV] C=10, penalty=l2, solver=sag ....................................




[CV]  C=10, penalty=l2, solver=sag, score=0.6209677419354839, total= 1.1min
[CV] C=10, penalty=l2, solver=sag ....................................




[CV]  C=10, penalty=l2, solver=sag, score=0.6202020202020202, total= 1.2min
[CV] C=10, penalty=l2, solver=sag ....................................




[CV]  C=10, penalty=l2, solver=sag, score=0.6227180527383367, total= 1.1min
[CV] C=100, penalty=l2, solver=lbfgs .................................




[CV]  C=100, penalty=l2, solver=lbfgs, score=0.9274193548387096, total=   6.0s
[CV] C=100, penalty=l2, solver=lbfgs .................................




[CV]  C=100, penalty=l2, solver=lbfgs, score=0.9294354838709677, total=   7.1s
[CV] C=100, penalty=l2, solver=lbfgs .................................




[CV]  C=100, penalty=l2, solver=lbfgs, score=0.9556451612903226, total=   6.0s
[CV] C=100, penalty=l2, solver=lbfgs .................................




[CV]  C=100, penalty=l2, solver=lbfgs, score=0.9232323232323232, total=   7.1s
[CV] C=100, penalty=l2, solver=lbfgs .................................




[CV]  C=100, penalty=l2, solver=lbfgs, score=0.8945233265720081, total=   8.8s
[CV] C=100, penalty=l2, solver=newton-cg .............................




[CV]  C=100, penalty=l2, solver=newton-cg, score=0.9274193548387096, total=  33.1s
[CV] C=100, penalty=l2, solver=newton-cg .............................




[CV]  C=100, penalty=l2, solver=newton-cg, score=0.9334677419354839, total=  26.2s
[CV] C=100, penalty=l2, solver=newton-cg .............................




[CV]  C=100, penalty=l2, solver=newton-cg, score=0.9596774193548387, total=  32.0s
[CV] C=100, penalty=l2, solver=newton-cg .............................




[CV]  C=100, penalty=l2, solver=newton-cg, score=0.9212121212121213, total=  28.8s
[CV] C=100, penalty=l2, solver=newton-cg .............................




[CV]  C=100, penalty=l2, solver=newton-cg, score=0.8519269776876268, total=  43.6s
[CV] C=100, penalty=l2, solver=liblinear .............................




[CV]  C=100, penalty=l2, solver=liblinear, score=0.9274193548387096, total=   0.5s
[CV] C=100, penalty=l2, solver=liblinear .............................




[CV]  C=100, penalty=l2, solver=liblinear, score=0.9294354838709677, total=   0.5s
[CV] C=100, penalty=l2, solver=liblinear .............................




[CV]  C=100, penalty=l2, solver=liblinear, score=0.9556451612903226, total=   0.5s
[CV] C=100, penalty=l2, solver=liblinear .............................




[CV]  C=100, penalty=l2, solver=liblinear, score=0.9131313131313131, total=   0.6s
[CV] C=100, penalty=l2, solver=liblinear .............................




[CV]  C=100, penalty=l2, solver=liblinear, score=0.9046653144016227, total=   0.6s
[CV] C=100, penalty=l2, solver=sag ...................................




[CV]  C=100, penalty=l2, solver=sag, score=0.6209677419354839, total=  53.8s
[CV] C=100, penalty=l2, solver=sag ...................................




[CV]  C=100, penalty=l2, solver=sag, score=0.6209677419354839, total=  54.6s
[CV] C=100, penalty=l2, solver=sag ...................................




[CV]  C=100, penalty=l2, solver=sag, score=0.6209677419354839, total=  50.2s
[CV] C=100, penalty=l2, solver=sag ...................................




[CV]  C=100, penalty=l2, solver=sag, score=0.6202020202020202, total=  51.5s
[CV] C=100, penalty=l2, solver=sag ...................................




[CV]  C=100, penalty=l2, solver=sag, score=0.6227180527383367, total=  50.5s


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 24.3min finished


{'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.9277059773828756




In [84]:
# select model and fit
lg = LogisticRegression(C=10, solver='lbfgs')
lg.fit(X_train, y_train)

# make predictions
y_preds = lg.predict(X_test)
y_preds = [int(i) for i in y_preds]
assert len(y_preds) == len(test)

# create submission csv
sample_submission = pd.read_csv('./data/sample_submission.csv')
sample_submission['category'] = y_preds
sample_submission.to_csv('team_whiskey_submission.csv', index=False)

