In [1]:
import pandas as pd

df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
4,6,Davin de Kergommeaux,"After 40 years in barrels, the trademark Canad...",199.0,96,45.0,


In [2]:
df.isna().sum()

id                0
author            0
description       0
price            63
ratingValue       0
pert_alcohol     60
category        288
dtype: int64

In [3]:
df['category'].value_counts()

1.0    1637
2.0     449
3.0     300
4.0     200
Name: category, dtype: int64

In [4]:
# Drop NaN from rows
df = df.dropna(subset=['category'])
df.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
5,9,Fred Minnick,"A caramel-laden fruit bouquet, followed by une...",150.0,96,54.49,2.0


In [5]:
df.isna().sum()

id               0
author           0
description      0
price           54
ratingValue      0
pert_alcohol    56
category         0
dtype: int64

In [6]:
df['description'][0]

'A marriage of 13 and 18 year old bourbons. A mature yet very elegant whiskey, with a silky texture and so easy to embrace with a splash of water. Balanced notes of honeyed vanilla, soft caramel, a basket of complex orchard fruit, blackberry, papaya, and a dusting of cocoa and nutmeg; smooth finish. Sophisticated, stylish, with well-defined flavors. A classic!'

In [7]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol
0,955,Fred Minnick,"Think carnival aromas—the good ones, anyway—me...",36.0,90,50.0
1,3532,Lew Bryson,"A blend of three bourbons, between 6 and 12 ye...",90.0,82,49.3
2,1390,Davin de Kergommeaux,"The nose is focused on cereal, hints of fresh ...",48.0,89,45.0
3,1024,Gavin Smith,Swiss-based Chapter 7 released this 19 year ol...,180.0,90,55.8
4,1902,Gavin Smith,Valkyrie replaces the current Dark Origins exp...,71.0,87,45.9


In [8]:
df_test.isna().sum()

id              0
author          0
description     0
price           9
ratingValue     0
pert_alcohol    4
dtype: int64

In [9]:
df_test.shape

(288, 6)

# TFIDF

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Determine the feature and target
data = 'description'
target = 'category'

# Instantiate vectorizer object
vect = TfidfVectorizer(stop_words='english', max_features=5000) # Parameter min_df=integer - Need the required amount before it's consider
sgdc = SGDClassifier()

# Pipeline
pipe = Pipeline([('vect', vect), ('clf', sgdc)])

# Fit pipeline
pipe.fit(df[data], df[target])



Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [11]:
from sklearn.model_selection import GridSearchCV

# Name the parameters for search
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'clf__max_iter': (20, 10, 100)
}

In [12]:
# Search with respect to the parameters specified above
grid_search = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, verbose=1)

In [13]:
# Fit the model of the best grid search parameter
grid_search.fit(df[data], df[target])

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    9.7s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vect__max_df': (0.5, 0.75, 1.0), 'clf__max_iter': (20, 10, 100)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [14]:
category_predict = grid_search.predict(df_test['description'])
category_predict

array([2., 2., 4., 1., 1., 1., 1., 1., 2., 1., 4., 4., 1., 1., 1., 1., 1.,
       1., 2., 1., 1., 1., 1., 1., 4., 1., 1., 1., 3., 1., 4., 2., 1., 1.,
       1., 1., 1., 3., 4., 3., 2., 1., 1., 3., 1., 1., 1., 2., 1., 1., 3.,
       1., 3., 1., 1., 1., 1., 1., 1., 1., 3., 1., 1., 1., 1., 4., 2., 3.,
       1., 1., 1., 3., 1., 1., 4., 1., 2., 2., 1., 1., 4., 2., 2., 1., 1.,
       3., 2., 4., 1., 3., 1., 1., 1., 1., 1., 4., 1., 1., 4., 3., 1., 1.,
       1., 2., 1., 1., 1., 2., 1., 2., 3., 1., 1., 1., 1., 3., 1., 1., 1.,
       1., 3., 1., 2., 1., 1., 1., 1., 2., 2., 4., 1., 1., 1., 1., 3., 2.,
       1., 1., 1., 1., 1., 3., 2., 1., 1., 3., 4., 1., 1., 1., 3., 1., 1.,
       1., 1., 1., 2., 1., 1., 1., 1., 1., 4., 1., 1., 1., 3., 1., 2., 2.,
       1., 3., 3., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 2., 1.,
       4., 1., 3., 1., 4., 1., 1., 2., 2., 1., 1., 2., 1., 1., 1., 1., 2.,
       2., 1., 1., 1., 1., 4., 1., 1., 3., 1., 2., 1., 1., 1., 1., 1., 1.,
       4., 2., 2., 2., 2.

In [15]:
# Change to integer
df_test['category'] = category_predict.astype(int)

In [17]:
# Get only id and category colmuns
df_test_final = df_test[['id', 'category']]
df_test_final.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,4
3,1024,1
4,1902,1


In [18]:
# Submission
df_test_final.to_csv('./submission.csv', index=False)

In [19]:
df_test_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 2 columns):
id          288 non-null int64
category    288 non-null int32
dtypes: int32(1), int64(1)
memory usage: 3.5 KB


# Latent Semantic Indexing

In [20]:
from sklearn.decomposition import TruncatedSVD

# Dimenstionality Reduction for text data
svd = TruncatedSVD(n_components=100, 
                   algorithm='randomized', 
                   n_iter=10)

In [21]:
# LSI

lsi = Pipeline([('vect', vect), ('svd', svd)])

In [22]:
# Pipe

pipe = Pipeline([('lsi', lsi), ('clf', sgdc)])

In [23]:
pipe.fit(df[data], df[target])



Pipeline(memory=None,
     steps=[('lsi', Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm=...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [24]:
parameters = {
    'lsi__vect__max_df': (0.5, 0.75, 1.0),
    'clf__max_iter': (10, 20, 100)
}

In [25]:
grid_search_lsi = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, verbose=1)

In [26]:
grid_search_lsi.fit(df[data], df[target])

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   10.2s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('lsi', Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm=...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'lsi__vect__max_df': (0.5, 0.75, 1.0), 'clf__max_iter': (10, 20, 100)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [27]:
# Predict
category_predict_lsi = grid_search_lsi.predict(df_test['description'])

In [31]:
# Change type of the category to integer
df_test['category'] = category_predict_lsi.astype(int)

In [32]:
df_test_final_lsi = df_test[['id', 'category']]
df_test_final_lsi.head()

Unnamed: 0,id,category
0,955,2
1,3532,3
2,1390,1
3,1024,1
4,1902,1


In [33]:
# Submission
df_test_final_lsi.to_csv('./submission_lsi.csv', index=False)

# Word Embeddings with Spacy

In [34]:
import spacy # Better at mapping similarity of words than tfidf
nlp = spacy.load('en_core_web_lg')

In [35]:
doc = nlp('Two bananas in pyjamas')

In [39]:
# Represent text in dimensional space/vectorized
bananas_vector = doc.vector
print(len(bananas_vector))

300


In [51]:
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

In [52]:
X = get_word_vectors(df[data])
sgdc.fit(X, df[target])



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)