# NLP Assignment: Predicting the lables of the paragraph.

### Importing libraries

In [17]:
%matplotlib inline

import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

### Reading files

In [18]:
train_data = pd.read_csv('C:/Users/prashant bajetha/train_data.csv')
train_label = pd.read_csv('C:/Users/prashant bajetha/train_label.csv')
test_df = pd.read_csv('C:/Users/prashant bajetha/test_data.csv')

In [19]:
train_data.head()

Unnamed: 0,text,id
0,"Keep your gloves, hats, coats and jackets toge...",122885
1,The Home Dynamix Serendipity Ivory 5 ft. 2 in....,188958
2,The Bosch 18-Volt lithium-ion line of Cordless...,146065
3,Restore your Porter-Cable sander or polisher t...,165138
4,The SPIKECUBE Surge Suppressor from Tripp Lite...,185565


In [20]:
train_label.head()

Unnamed: 0,id,label
0,100003,Shape
1,100004,Voltage (volts)
2,100004,Wattage (watts)
3,100006,Wattage (watts)
4,100007,ENERGY STAR Certified


In [21]:
test_df.head()

Unnamed: 0,text,id
0,These machine screw nuts are designed to be us...,114689
1,The M18 FUEL Drill/Driver is the Most Powerful...,183172
2,Steel City 2-Gang 30 cu. in. Square Electrical...,217304
3,Native Collection Plus has Shaw's SilentStep P...,184115
4,Fasade decorative 4 ft. x 8 ft. vinyl wall pan...,103786


# Making train_label more understandable

### This problem is a Multi label classification problem in which each instance(row) is labelled with one or more than one label. So, to better understand the data set, i performed below operation.....

In [22]:
n_label = train_label.label.unique()
n_label

array(['Shape', 'Voltage (volts)', 'Wattage (watts)',
       'ENERGY STAR Certified', 'Finish', 'Indoor/Outdoor',
       'Package Quantity', 'Features', 'Included', 'Hardware Included',
       'Color', 'Assembly Required', 'Tools Product Type',
       'Commercial / Residential', 'Flooring Product Type'], dtype=object)

In [23]:
original_label = train_label.copy()      # Making a original copy of train_label for use.
train_label.drop('label',axis = 1,inplace = True)      # dropping the label column.
train_label = train_label.drop_duplicates(subset=['id'])  # Only keeping the unique id's.
labels = ['Shape','Voltage (volts)', 'Wattage (watts)','ENERGY STAR Certified','Finish','Indoor/Outdoor','Package Quantity',
          'Features','Included','Hardware Included','Color','Assembly Required','Tools Product Type','Commercial / Residential',
          'Flooring Product Type']
for col in labels:
    train_label[col] = 0           # coverting train_label to get each label as a column and initialising the value with 0.
train_label.shape

(41569, 16)

In [28]:
for index, row in original_label.iterrows():     # replacing 0 with 1 where the label for a particular id are given.
    ID = row['id']
    Label = row['label']
    train_label.loc[train_label['id']==ID,Label] = 1
train_label.head()


Unnamed: 0,id,Shape,Voltage (volts),Wattage (watts),ENERGY STAR Certified,Finish,Indoor/Outdoor,Package Quantity,Features,Included,Hardware Included,Color,Assembly Required,Tools Product Type,Commercial / Residential,Flooring Product Type
0,100003,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,100004,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
2,100006,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,100007,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,100008,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0


### Counting the number of labels for each sample

In [30]:
counts = []
categories = list(train_label.columns.values)
for i in categories:
    counts.append((i, train_label[i].sum()))
df_stats = pd.DataFrame(counts, columns=['category', 'number_of_samples'])
df_stats


Unnamed: 0,category,number_of_samples
0,id,7055062493
1,Shape,1468
2,Voltage (volts),2485
3,Wattage (watts),1727
4,ENERGY STAR Certified,2954
5,Finish,1461
6,Indoor/Outdoor,4303
7,Package Quantity,2019
8,Features,1783
9,Included,2020


# Data preprocessing

In [39]:
#combining both dataframe to include text column 
train_data.sort_values(['id'], ascending=[True], inplace=True)
train_df = pd.merge(train_data, train_label, on='id')
train_df.head()

Unnamed: 0.1,Unnamed: 0,text,id,Shape,Voltage (volts),Wattage (watts),ENERGY STAR Certified,Finish,Indoor/Outdoor,Package Quantity,Features,Included,Hardware Included,Color,Assembly Required,Tools Product Type,Commercial / Residential,Flooring Product Type
0,0,Classic architecture meets contemporary design...,100003,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,The Grape Solar 265-Watt Polycrystalline PV So...,100004,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
2,2,Achieving delicious results is almost effortle...,100006,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,3,The Quantum Adjustable 2-Light LED Black Emerg...,100007,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,4,The Teks #10 x 1-1/2 in. Zinc-Plated Steel Was...,100008,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0


In [42]:
import re 
import nltk
from nltk.corpus import stopwords
def preprocessing(dataset):
    corpus = []
    for i in range(0,len(dataset)):
        clean_text = re.sub('[^a-zA-Z]', ' ', dataset['text'][i])   
        clean_text = clean_text.lower()     # converts all the letters in small letter
        clean_text = clean_text.split()
        lm = WordNetLemmatizer()
        clean_text = [lm.lemmatize(word) for word in clean_text if not word in set(stopwords.words('english'))]  # it is checking for any stopwords and then converting the word into its root.
        clean_text = ' '.join(clean_text)
        corpus.append(clean_text)
    return corpus 

train_clean_text = preprocessing(train_df)
test_clean_text = preprocessing(test_df)

# replacing the text column with clean
train_df['text'] = train_clean_text
test_df['text'] = test_clean_text

train_df.head()


Unnamed: 0.1,Unnamed: 0,text,id,Shape,Voltage (volts),Wattage (watts),ENERGY STAR Certified,Finish,Indoor/Outdoor,Package Quantity,Features,Included,Hardware Included,Color,Assembly Required,Tools Product Type,Commercial / Residential,Flooring Product Type
0,0,classic architecture meet contemporary design ...,100003,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,grape solar watt polycrystalline pv solar pane...,100004,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
2,2,achieving delicious result almost effortless w...,100006,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,3,quantum adjustable light led black emergency l...,100007,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,4,teks x zinc plated steel washer head hex self ...,100008,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0


In [35]:
#Splitting training data to train the model on one part and test it for another part.
train, test = train_test_split(train_df, random_state=1, test_size=0.30, shuffle=True)
X_train = train.text
X_test = test.text

###  Pipeline is to help automate machine learning workflows. So we use pipeline to train different classifier. 
# Naive Bayes


In [36]:
categories =  ['Indoor/Outdoor',
                      'Commercial / Residential',
                       'ENERGY STAR Certified',
                       'Hardware Included',
                       'Package Quantity',
                       'Flooring Product Type',
                       'Color',
                       'Tools Product Type',
                       'Included',
                       'Voltage (volts)',
                       'Assembly Required',
                       'Features',
                       'Wattage (watts)',
                       'Finish',
                       'Shape']
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])
for category in categories:
    print('... Processing {}'.format(category))
    # train the model on each label
    NB_pipeline.fit(X_train, train[category])
    # testing accuracy
    prediction = NB_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

... Processing Indoor/Outdoor
Test accuracy is 0.8389864485606607
... Processing Commercial / Residential
Test accuracy is 0.9113944350894074
... Processing ENERGY STAR Certified
Test accuracy is 0.8854141608531794
... Processing Hardware Included
Test accuracy is 0.8711410472295726
... Processing Package Quantity
Test accuracy is 0.8943148103600352
... Processing Flooring Product Type
Test accuracy is 0.9625531232459306
... Processing Color
Test accuracy is 0.8923101595702029
... Processing Tools Product Type
Test accuracy is 0.9031352738352979
... Processing Included
Test accuracy is 0.8961590890866811
... Processing Voltage (volts)
Test accuracy is 0.8935931360756956
... Processing Assembly Required
Test accuracy is 0.9004089487611258
... Processing Features
Test accuracy is 0.9220591772913158
... Processing Wattage (watts)
Test accuracy is 0.9169272712693449
... Processing Finish
Test accuracy is 0.9156442947638521
... Processing Shape
Test accuracy is 0.9133990858792398


# Logistic Regression

In [37]:
LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
            ])
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    LogReg_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = LogReg_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

... Processing Indoor/Outdoor
Test accuracy is 0.935610616630583
... Processing Commercial / Residential
Test accuracy is 0.9654398203832892
... Processing ENERGY STAR Certified
Test accuracy is 0.9688076337102077
... Processing Hardware Included
Test accuracy is 0.9469168470852377
... Processing Package Quantity
Test accuracy is 0.9611899607088445
... Processing Flooring Product Type
Test accuracy is 0.9898965600192446
... Processing Color
Test accuracy is 0.9482800096223238
... Processing Tools Product Type
Test accuracy is 0.9780290273434368
... Processing Included
Test accuracy is 0.9655200064148826
... Processing Voltage (volts)
Test accuracy is 0.9682463314890546
... Processing Assembly Required
Test accuracy is 0.9695293079945474
... Processing Features
Test accuracy is 0.9828401892390346
... Processing Wattage (watts)
Test accuracy is 0.9669633549835619
... Processing Finish
Test accuracy is 0.9684868895838344
... Processing Shape
Test accuracy is 0.9610295886456579


### Logistic Regression here predicts the output with better test accuracy. So final predictions are made with Logistic regression

# Predicting probabilites

In [38]:
#predicting probabilities
train_set = train_df.text
test_set = test_df.text
#making empty dataframe
submission = pd.DataFrame()
submission['id'] = test_df.id
for category in categories:
    #fitting the model to entire dataset.
    LogReg_pipeline.fit(train_set,train_df[category])           
    pred_prob = LogReg_pipeline.predict_proba(test_set)
    submission[category] = pred_prob[:,1]
    
submission.head()

Unnamed: 0,id,Indoor/Outdoor,Commercial / Residential,ENERGY STAR Certified,Hardware Included,Package Quantity,Flooring Product Type,Color,Tools Product Type,Included,Voltage (volts),Assembly Required,Features,Wattage (watts),Finish,Shape
0,114689,0.497,0.008,0.004,0.005,0.995,0.004,0.006,0.009,0.011,0.007,0.002,0.003,0.003,0.61,0.009
1,183172,0.008,0.009,0.011,0.009,0.006,0.004,0.015,0.997,0.008,0.952,0.006,0.005,0.007,0.004,0.002
2,217304,0.024,0.002,0.005,0.015,0.913,0.003,0.024,0.039,0.077,0.047,0.03,0.005,0.021,0.035,0.026
3,184115,0.893,0.96,0.025,0.032,0.013,0.965,0.043,0.005,0.026,0.004,0.008,0.013,0.016,0.015,0.015
4,103786,0.203,0.156,0.019,0.122,0.002,0.03,0.393,0.0,0.057,0.014,0.02,0.053,0.036,0.033,0.031
