## Importing required packages

In [1]:

import nltk
import pandas as pd
import re
import string

from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


## Importing Data

In [2]:
cols_required = ['text', 'label1']  # only required columns
train_data = pd.read_csv('corpus_training.csv', usecols=cols_required)  # training data
test_data = pd.read_csv('corpus_test.csv', usecols=cols_required)  # test_data

train_data.head(10)

Unnamed: 0,text,label1
0,Alpine Granada Brown Wood Dining Chairs (Set o...,dining
1,Graham Hills Cottage Cracked Wheat Six Drawer ...,bedroom
2,Safavieh Hollis Clear Console TableWhether pla...,living
3,"Equity 20080 3.5"" Travel Alarm ClockFeatures: ...",accessories
4,Jennifer Taylor Michelle Tufted Square Ottoman...,living
5,Set of Three Boxed Hand-painted Mini-Pillar Ca...,accessories
6,Furniture of America Merlam Faux Leather Futon...,living
7,"Wooden 3 Panel Mirror Vanity Set with Stool, R...",living
8,Bobkona Prissy Accent ChairPair this versatile...,living
9,Full Size Khaki Suede 8-inch Dual Latex Futon ...,living


## Initializing stopwords and lemmatizer

In [3]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords += [  # stopword obtained from Analysis.ipynb
    'inch',
    'x',
    'furniture',
    'pattern',
    'please',
    'note',
    'features',
    'material',
    'dimensions',
    'style',
    'set',
    'includes',
    'area',
    'design',
    'replacement',
    'finish',
    ]
lm = nltk.WordNetLemmatizer()


## Label Encoding for target column

In [4]:
label = LabelEncoder()
train_data['label'] = label.fit_transform(train_data['label1'])
test_data['label'] = label.fit_transform(test_data['label1'])
train_data.sample(5)


Unnamed: 0,text,label1,label
6509,Industrial Metal Queen-size Platform BedA gent...,bedroom,1
12603,Contemporary style 5-Tier Bookcase With 5 Open...,office,6
7241,The Curated Nomad Del Sur Counter-height Frenc...,dining,2
14179,Avery Home Lighting Stuart 1-Light Adjustable ...,lighting,4
4990,ECARPETGALLERY Hand-knotted Bakhtiar Red Wool...,accessories,0


In [5]:
test_data.sample(5)

Unnamed: 0,text,label1,label
2183,Pillows + Décor > Gifts > Holiday > Holiday N...,accessories,0
146,Bed & Bath > Bedding > Duvet Covers + Shams P...,bedroom,1
1762,Lighting > Ceiling Windsor Smith for Arterior...,lighting,4
886,Furniture > Living Room Furniture > Accent Cha...,living,5
905,Furniture > Living Room Furniture > Accent Cha...,living,5


## Mapping of labels

In [6]:
le_name_mapping = dict(zip(label.classes_,
                       label.transform(label.classes_)))
for (k, v) in le_name_mapping.items():
    print (k, ':', v)

 accessories : 0
 bedroom : 1
 dining : 2
 kids : 3
 lighting : 4
 living : 5
 office : 6
 outdoor : 7


## Cleaning the text by removing punctutaions, tokenizing and lemmatizing.

In [7]:
def clean_text(text):
    """ 
        description: removes punctuation, stopwords and then applies lemmatization on tokens
        input: string
        output: list (tokens of string)
    
    """

    text = text.split(':')[0]  # skipping contents past first ':' since they are of not much importance

    text = ''.join([word.lower() for word in text if word
                   not in string.punctuation])  # removing punctuation

    text = re.sub(" \d+", ' ', text)  # removing numbers
    tokens = re.split('\W+', text)  # splitting string into tokens
    text = [lm.lemmatize(word) for word in tokens if word
            not in stopwords]  # apply lemmatizer on each token

    return text

## Train test split

In [8]:
# Training dataset

X_train = train_data[['text']]
y_train = train_data['label']

# Test dataset

X_test = test_data[['text']]
y_test = test_data['label']

## Text Vectorization TF-IDF

In [9]:
# TfidfVectorizer performs count vectorization and tfidf transformation in one go.

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['text'])

tfidf_train = tfidf_vect_fit.transform(X_train['text'])  # Training dataset
tfidf_test = tfidf_vect_fit.transform(X_test['text'])  # Test dataset

## Converting sparse matrices to dataframe

In [10]:
# Training dataset vector dataframe

X_train_vect = pd.SparseDataFrame(tfidf_train,
                                  columns=tfidf_vect_fit.get_feature_names(),
                                  default_fill_value=0)

# Test dataset vector dataframe

X_test_vect = pd.SparseDataFrame(tfidf_test,
                                 columns=tfidf_vect_fit.get_feature_names(),
                                 default_fill_value=0)

## Gradient Boosting

In [11]:
gb = GradientBoostingClassifier()  # default uses learning rate of 0.1, n_estimators of 100 and max_depth of 3
gb_model = gb.fit(X_train_vect, y_train)
y_pred = gb_model.predict(X_test_vect)

## Accuracy and classification report

In [12]:
print("Accuracy:", accuracy_score(y_test, y_pred))

print(metrics.classification_report(y_test, y_pred, target_names = train_data['label1'].unique() ))

print(metrics.confusion_matrix(y_test, y_pred, labels = [0, 1, 2, 3, 4, 5, 6, 7]))

Accuracy: 0.909829619921363
              precision    recall  f1-score   support

      dining       0.94      0.98      0.96      1855
     bedroom       0.84      0.79      0.82       416
      living       0.92      1.00      0.96       257
 accessories       0.65      0.15      0.25       112
     outdoor       0.99      0.88      0.93       454
      office       0.85      0.93      0.89       647
    lighting       0.94      0.98      0.96        61
        kids       0.00      0.00      0.00        13

    accuracy                           0.91      3815
   macro avg       0.77      0.71      0.72      3815
weighted avg       0.91      0.91      0.90      3815

[[1810   19    2    8    0    8    3    5]
 [  17  330    3    1    0   63    0    2]
 [   0    0  256    0    0    0    0    1]
 [  35   38    0   17    0   21    0    1]
 [  42    4    5    0  398    3    0    2]
 [  29    0   12    0    1  600    1    4]
 [   0    0    0    0    1    0   60    0]
 [   1    0    1    

## Accuracy = 90.98%