In [1]:
# Import Statements
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
import pandas as pd

In [3]:
# Read the three competition CSV files (paths are relative to the working directory).
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [None]:
# Preview the first five rows of the submission template.
sample_submission.head(n=5)

In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# Preview the first five test rows.
test.head(n=5)

In [4]:
# Feature frame: every training column except the target, as an independent copy.
X = train.drop(columns=['category']).copy()

In [5]:
# Target vector: an independent copy of the 'category' column.
y = train.loc[:, 'category'].copy()

In [None]:
# Number of missing values in each feature column.
X.isna().sum()

In [None]:
# Median, then mean, of 'price' (one value per line).
print(X['price'].median(), X['price'].mean(), sep='\n')

In [None]:
# Median, then mean, of 'pert_alcohol' (one value per line).
print(X['pert_alcohol'].median(), X['pert_alcohol'].mean(), sep='\n')

In [6]:
# Fill missing 'price' values with the column median.
X = X.assign(price=X['price'].fillna(X['price'].median()))

In [7]:
# Fill missing 'pert_alcohol' values with the column median.
X = X.assign(pert_alcohol=X['pert_alcohol'].fillna(X['pert_alcohol'].median()))

In [8]:
# Forward-fill missing labels: each missing category takes the value of the
# closest preceding row. Series.ffill() is the supported spelling;
# fillna(method='ffill') is deprecated in recent pandas releases.
# NOTE(review): a missing value in the very first row has nothing to copy
# from and would stay NaN - confirm the data never starts with a gap.
y = y.ffill()

In [9]:
# Create Pipeline

# TF-IDF text features with English stop words removed.
vect = TfidfVectorizer(stop_words='english')
# SGD training shuffles the data, so an unseeded classifier gives a different
# model on every run. A fixed random_state makes the fit (and the submission
# files written from it) reproducible.
sgdc = SGDClassifier(random_state=42)

# Text -> TF-IDF -> linear classifier.
pipe = Pipeline([('vect', vect), ('clf', sgdc)])

In [None]:
# Fit Pipeline
# Train on the raw description text against the filled labels.
pipe.fit(X['description'], y=y)

In [None]:
# Predict a category for every test description.
y_pred = pipe.predict(test.loc[:, 'description'])

In [16]:
# Start the submission from a deep copy of the template so the template stays untouched.
submission = sample_submission.copy(deep=True)

In [None]:
# Store the predictions as integers in the 'category' column.
submission['category'] = y_pred.astype(int)

In [None]:
# Preview the first five submission rows.
submission.head(n=5)

In [None]:
# Write the submission without the index column.
submission.to_csv(path_or_buf='submission-02.csv', index=False)

In [None]:
# Experiment Management
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid-search space. Keys are '<pipeline step>__<parameter>'.
parameters = dict(
    vect__max_df=(0.5, 0.75, 1.0),
    clf__max_iter=(20, 10, 100),
)

In [None]:
# 5-fold grid search over `parameters`, using all CPU cores and basic progress output.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the search on the description text.
grid_search.fit(X['description'], y=y)

In [None]:
# Preview the first five submission rows.
submission.head(n=5)

In [None]:
# Preview the first five labels.
y.head(n=5)

In [None]:
# Predict with the tuned model. The previous code called pipe.predict here,
# which ignored the grid search entirely: GridSearchCV fits clones of `pipe`,
# so `pipe` itself still holds the untuned model. grid_search keeps the default
# refit behaviour, so its predict() uses the best parameter combination
# refitted on the full training data.
y_pred = grid_search.predict(test['description'])

In [None]:
# Show a short slice of the predictions instead of the whole array.
y_pred[:10]

In [None]:
# Store the predictions as integers in the 'category' column.
submission['category'] = y_pred.astype(int)

In [None]:
# Preview the first five submission rows.
submission.head(n=5)

In [None]:
# Write the submission without the index column.
submission.to_csv(path_or_buf='submission-03.csv', index=False)

In [10]:
# Import

from sklearn.decomposition import TruncatedSVD

# Truncated SVD down to 100 components, using the randomized solver with
# 10 iterations. The randomized solver is stochastic, so random_state is
# fixed to keep the decomposition (and everything fitted on it) reproducible.
svd = TruncatedSVD(n_components=100,
                   algorithm='randomized',
                   n_iter=10,
                   random_state=42)

In [11]:
# LSI
# Sub-pipeline: TF-IDF vectorizer followed by the truncated SVD.
lsi = Pipeline(steps=[('vect', vect), ('svd', svd)])

In [12]:
# Pipe

# Parameter grid addressing the vectorizer nested inside the 'lsi' step.
params = {'lsi__vect__max_df': (0.5, 0.75, 1.0)}

# LSI features feeding the SGD classifier (rebinds the name `pipe`).
pipe = Pipeline(steps=[('lsi', lsi), ('clf', sgdc)])

In [13]:
# Train the LSI + classifier pipeline on the description text.
pipe.fit(X['description'], y=y)

Pipeline(memory=None,
         steps=[('lsi',
                 Pipeline(memory=None,
                          steps=[('vect',
                                  TfidfVectorizer(analyzer='word', binary=False,
                                                  decode_error='strict',
                                                  dtype=<class 'numpy.float64'>,
                                                  encoding='utf-8',
                                                  input='content',
                                                  lowercase=True, max_df=1.0,
                                                  max_features=None, min_df=1,
                                                  ngram_range=(1, 1), norm='l2',
                                                  preprocessor=None,
                                                  smooth_idf=True,
                                                  stop_words='english',
                                                  strip_a

In [14]:
# Predict a category for every test description.
y_pred = pipe.predict(test.loc[:, 'description'])

In [None]:
# Preview the first few test descriptions rather than the whole column.
# Bracket indexing matches how the column is accessed in the other cells.
test['description'].head()

In [17]:
# Store the predictions as integers in the 'category' column.
submission['category'] = y_pred.astype(int)

In [18]:
# Write the submission without the index column.
submission.to_csv(path_or_buf='submission-04.csv', index=False)