In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
#add author to description
train['new_description'] = train.author + ' ' + train.description
test['new_description'] = test.author + ' ' + test.description

#convert author col to categorical
train.author = train.author.astype('category')
test.author = test.author.astype('category')

In [4]:
#clean/replace numeric feature NaNs w/ median vals:
median_price = 80
median_pert_alcohol = 46
train.price = train.price.fillna(median_price)
test.price = test.price.fillna(median_price)
train.pert_alcohol = train.pert_alcohol.fillna(median_pert_alcohol)
test.pert_alcohol = test.pert_alcohol.fillna(median_pert_alcohol) 

#drop rows w/ NaNs in target feature
train = train.dropna().reset_index(drop=True)


## SGDClassifier Pipeline w/ TD-IDF

In [5]:
#sklearn pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
vect = TfidfVectorizer(stop_words='english')
sgdc = SGDClassifier()
pipe = Pipeline([('vect', vect), ('classifier', sgdc)])

target = 'category'
X_train = train['description']
y_train = train[target]
X_test = test['description']

#Fit pipeline:
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [7]:
y_pred

array([2., 2., 4., 1., 1., 1., 1., 1., 2., 1., 4., 4., 1., 1., 1., 1., 1.,
       1., 2., 1., 1., 1., 1., 1., 4., 1., 1., 1., 3., 1., 4., 2., 1., 1.,
       1., 1., 1., 3., 4., 3., 2., 1., 1., 3., 1., 1., 1., 2., 1., 1., 3.,
       1., 3., 1., 1., 1., 1., 1., 1., 1., 3., 1., 1., 1., 1., 4., 2., 3.,
       1., 1., 1., 3., 1., 1., 4., 1., 3., 2., 1., 1., 4., 2., 2., 1., 1.,
       3., 2., 4., 1., 3., 1., 1., 1., 1., 1., 4., 1., 1., 4., 3., 1., 1.,
       1., 2., 1., 1., 1., 2., 1., 2., 3., 1., 1., 1., 1., 3., 1., 1., 1.,
       1., 3., 1., 2., 1., 1., 1., 1., 2., 2., 4., 1., 1., 1., 1., 3., 2.,
       1., 1., 1., 1., 1., 3., 2., 1., 1., 3., 4., 1., 1., 1., 3., 1., 1.,
       1., 1., 1., 2., 1., 1., 1., 1., 1., 4., 1., 1., 1., 3., 1., 2., 2.,
       1., 3., 3., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 2., 1.,
       4., 1., 3., 1., 4., 1., 1., 2., 2., 1., 1., 2., 1., 1., 1., 1., 2.,
       2., 1., 1., 1., 1., 4., 1., 1., 3., 1., 2., 1., 1., 1., 1., 1., 1.,
       4., 2., 2., 2., 2.

In [8]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'classifier__max_iter': (20, 10, 100)
}

grid_search = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, verbose=10)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  43 out of  45 | elapsed:   11.5s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   11.7s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [9]:
best = grid_search.best_estimator_
grid_y_pred = best.predict(X_test)

best_df = pd.DataFrame(test.id)
best_df['category'] = grid_y_pred.astype('int')
best_df.to_csv('first_grid.csv', index=False)

In [10]:
baseline = pd.DataFrame(test.id)
baseline['category'] = y_pred.astype('int')
baseline.to_csv('baseline.csv', index=False)

In [11]:
baseline.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,4
3,1024,1
4,1902,1


### GridSearchCV 

In [12]:
vect = TfidfVectorizer(stop_words='english')
sgdc = SGDClassifier()
pipe = Pipeline([('vect', vect), ('classifier', sgdc)])

target = 'category'
X_train = train['new_description']
y_train = train[target]
X_test = test['new_description']

from sklearn.model_selection import GridSearchCV

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'classifier__max_iter': (20, 10, 100)
}

grid_search = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, verbose=10)
grid_search.fit(X_train, y_train)

best = grid_search.best_estimator_
grid_y_pred = best.predict(X_test)

best_df = pd.DataFrame(test.id)
best_df['category'] = grid_y_pred.astype('int')
best_df.to_csv('new_desc_grid.csv', index=False)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  43 out of  45 | elapsed:    7.8s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    7.9s finished


In [13]:
best_df.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,4
3,1024,1
4,1902,1


## Latent Semantic Indexing:

In [14]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100, algorithm='randomized', n_iter=10, random_state=42)

In [15]:
lsi = Pipeline([('vect', vect), ('svd', svd)])
pipe = Pipeline([('lsi', lsi), ('classifier', sgdc)])

pipe.fit(X_train, y_train)
lsi_y_pred = pipe.predict(X_test)

In [16]:
lsi_df = pd.DataFrame(test.id)
lsi_df['category'] = lsi_y_pred.astype('int')
lsi_df.to_csv('baseline_lsi.csv', index=False)

## Spacy

In [17]:
import spacy
nlp = spacy.load('en_core_web_lg')

doc = nlp('Two bananas in pajamas')

bananas_vector = doc.vector
print(len(bananas_vector))

300


In [18]:
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

In [19]:
data = train.new_description
X_train = get_word_vectors(data)
y_train = train[target]
X_test = get_word_vectors(test.new_description)
sgdc.fit(X_train, y_train)
spacy_y_pred = sgdc.predict(X_test)

In [20]:
spacy_df = pd.DataFrame(test.id)
spacy_df['category'] = spacy_y_pred.astype('int')
spacy_df.to_csv('spacy1.csv', index=False)

In [26]:
spacy_df.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,4
3,1024,1
4,1902,1


## Random Forest using the numeric features:

In [21]:
#one hot encode author feature:

import category_encoders as ce
one_hot_ce = ce.OneHotEncoder(cols=['author'], verbose=10, use_cat_names=True)
train = one_hot_ce.fit_transform(train)
test['category'] = 1
test = one_hot_ce.transform(test)
test = test.drop("category", axis=1)

In [22]:
from sklearn.ensemble import RandomForestClassifier

X_train = train.drop(['description', 'new_description', target], axis=1)
y_train = train[target]
X_test = test.drop(['description', 'new_description'], axis=1)

X_train.shape, y_train.shape, X_test.shape

((2586, 16), (2586,), (288, 16))

In [23]:
rf = RandomForestClassifier(n_jobs=1)

params = {
    'max_depth': [3,5,10],
    'min_samples_split': [2,3,4],
    'min_samples_leaf': [1,2],
    'n_estimators': [5,10,25,100,250]    
}

grid_search_rf = GridSearchCV(rf, param_grid=params, cv=4, verbose=3, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

Fitting 4 folds for each of 90 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 330 tasks      | elapsed:   35.7s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   40.0s finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=1,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='w

In [24]:
results = pd.DataFrame(grid_search_rf.cv_results_).sort_values('rank_test_score')
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
10,0.025583,0.000539,0.005611,7.2e-05,3,1,4,5,"{'max_depth': 3, 'min_samples_leaf': 1, 'min_s...",0.783951,0.843653,0.843653,0.764706,0.808971,0.035317,1
21,0.060644,0.008718,0.01238,0.004217,3,2,3,10,"{'max_depth': 3, 'min_samples_leaf': 2, 'min_s...",0.763889,0.857585,0.843653,0.727554,0.798144,0.054211,2
25,0.024536,0.003744,0.005118,0.000491,3,2,4,5,"{'max_depth': 3, 'min_samples_leaf': 2, 'min_s...",0.768519,0.780186,0.806502,0.747678,0.775715,0.021239,3
32,0.171958,0.045948,0.013037,0.003268,5,1,2,25,"{'max_depth': 5, 'min_samples_leaf': 1, 'min_s...",0.459877,0.921053,0.905573,0.77709,0.765661,0.185432,4
5,0.028492,0.004067,0.005828,0.000875,3,1,3,5,"{'max_depth': 3, 'min_samples_leaf': 1, 'min_s...",0.79784,0.812693,0.876161,0.527864,0.753674,0.133583,5


In [25]:
best_rf = grid_search_rf.best_estimator_
best_rf.fit(X_train, y_train)
rf_y_pred = best_rf.predict(X_test)

best_rf_df = pd.DataFrame(test.id)
best_rf_df['category'] = rf_y_pred.astype('int')
best_rf_df.to_csv('rf_grid.csv', index=False)