In [None]:
#Importing all the libraries
import numpy as np
import pandas as pd
import nltk
import re
import string
import optuna
import joblib
import pickle
import lightgbm as lgb
from sklearn import metrics
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer   
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

class Model:
    """Text cleaning, TF-IDF vectorizing and classifier helpers.

    All instances share a single TF-IDF vectorizer (``Model.tfidf_transformer``).
    Every call to ``vectorize1`` re-fits it, replacing the vocabulary that later
    ``vectorize2`` calls will use.
    """

    # Shared, class-level vectorizer: vectorize1 fits it, vectorize2 reuses that fit.
    tfidf_transformer = TfidfVectorizer()

    # NLTK helpers are created once instead of on every clean1/clean2 call.
    _tokenizer = TweetTokenizer()
    _lemmatizer = WordNetLemmatizer()
    _stemmer = PorterStemmer()
    # English stop words; loaded on first use so that defining the class does not
    # require the NLTK stop-word corpus to be available.
    _stop_words = None

    @classmethod
    def _get_stop_words(cls):
        """Return the English stop words as a frozenset, loading them only once."""
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words('english'))
        return cls._stop_words

    def _clean(self, s, normalize):
        """Shared cleaning pipeline behind clean1 and clean2.

        Replaces every non-letter with a space, tokenizes, lower-cases, drops
        punctuation tokens and English stop words, applies ``normalize`` to each
        remaining token and joins the result with single spaces.
        """
        s = re.sub('[^a-zA-Z]', ' ', s)
        # Looked up once per call; the original rebuilt this set for every token.
        stop_words = Model._get_stop_words()
        lowered = (word.lower() for word in Model._tokenizer.tokenize(s))
        kept = [
            normalize(word)
            for word in lowered
            if word not in string.punctuation and word not in stop_words
        ]
        return ' '.join(kept)

    def clean1(self, s):
        """Clean the string ``s`` and lemmatize each token (WordNet lemmatizer).

        Returns the cleaned tokens joined by single spaces.
        """
        return self._clean(s, Model._lemmatizer.lemmatize)

    def clean2(self, s):
        """Clean the string ``s`` and stem each token (Porter stemmer).

        Returns the cleaned tokens joined by single spaces.
        """
        return self._clean(s, Model._stemmer.stem)

    def vectorize1(self, train_data):
        """Fit the shared TF-IDF vectorizer on ``train_data`` and return its transform.

        This overwrites any earlier fit of ``Model.tfidf_transformer``.
        """
        return Model.tfidf_transformer.fit_transform(train_data)

    def vectorize2(self, test_data):
        """Transform ``test_data`` with the shared vectorizer as last fitted by vectorize1."""
        return Model.tfidf_transformer.transform(test_data)

    def train(self, x_train, y_train, params, num_boost_round=100):
        """Train a LightGBM booster.

        Args:
            x_train: training features passed to ``lgb.Dataset``.
            y_train: training labels passed to ``lgb.Dataset``.
            params: LightGBM parameter dict passed to ``lgb.train``.
            num_boost_round: number of boosting rounds (default 100, the value
                that used to be hard-coded).

        Returns:
            The object returned by ``lgb.train``.
        """
        d_train = lgb.Dataset(x_train, label=y_train)
        return lgb.train(params, d_train, num_boost_round=num_boost_round)

    def predict(self, clf, x_test):
        """Return ``clf.predict(x_test)`` unchanged."""
        return clf.predict(x_test)

    def analyze(self, y_pred, test_data):
        """Print a classification report and a confusion matrix.

        Args:
            y_pred: predicted labels.
            test_data: the true labels (despite the name); passed as the first
                argument of both scikit-learn metric functions.
        """
        print(metrics.classification_report(test_data, y_pred))
        print("\n\nThe Confusion Matrix\n")
        print(metrics.confusion_matrix(test_data, y_pred))

    def train_sgd(self, train_data, target, random_state=None):
        """Fit and return an ``SGDClassifier``.

        Args:
            train_data: training features.
            target: training labels.
            random_state: optional seed forwarded to ``SGDClassifier`` for
                reproducible fits; ``None`` keeps the library default.
        """
        clf = SGDClassifier(random_state=random_state)
        clf.fit(train_data, target)
        return clf


In [None]:
'''
Load the raw dataset and clean it.
Save the cleaned data to disk, because cleaning takes a lot of time.
Later runs load the cleaned file instead.
'''
# dataset = pd.read_csv('PData.csv')
# dataset["lemmatized data"] = dataset["item_description"].apply(model.clean1)
# dataset["stemmed data"] = dataset["item_description"].apply(model.clean2)
# dataset.to_csv('clean_pdata.csv')

"\nLoaded the dataset and cleaned it\nSaved it to the system as cleaning takes lot of time\nNext time we use the cleaned file'\n"

In [None]:
'''
Instantiating the model and loading the cleaned dataset
'''
# Helper object that bundles the cleaning / vectorizing / training routines.
model = Model()

# Pre-cleaned data written by the (commented-out) cleaning cell.
dataset = pd.read_csv('clean_pdata.csv')

# Number of distinct values in the category column.
no_of_unq_cat = np.unique(dataset['category']).size
print(no_of_unq_cat)

99


In [None]:
'''
Splitting the dataset in train and test set
Vectorizing the train and test set
Training and predicting with Stochastic Gradient Descent
'''
# Features: lemmatized descriptions. Target: the raw category names.
X = dataset['lemmatized data']
y = dataset['category']

# A fixed seed makes the 80/20 split, and so the reported metrics, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Fit the shared TF-IDF vectorizer on the training split only, then reuse it for the test split.
train_vec = model.vectorize1(X_train)
test_vec = model.vectorize2(X_test)

# Train the SGD classifier and report its performance on the held-out split.
clf = model.train_sgd(train_vec, y_train)
y_pred = model.predict(clf, test_vec)
model.analyze(y_pred, y_test)

  _warn_prf(average, modifier, msg_start, len(result))


                                    precision    recall  f1-score   support

           academic & professional       0.78      0.38      0.52       104
                       accessories       0.83      0.92      0.87      1404
                          apparels       0.76      0.90      0.82      1015
                 audio video & tvs       0.82      0.33      0.47        27
       baby & kids room essentials       0.80      0.16      0.27        50
             baby care & maternity       0.86      0.65      0.74       224
               baby gear & nursery       1.00      0.71      0.83        51
                         badminton       0.91      0.91      0.91        33
                    bags & luggage       0.74      0.72      0.73       208
                    bags & wallets       0.95      0.87      0.91       412
                          bakeware       0.73      0.91      0.81       116
              bathroom accessories       0.86      0.82      0.84       323
           

In [None]:
'''
Label encoding our target variable
'''

# Map every category name to an integer id; `l` keeps the id -> name lookup
# in `l.classes_`. The encoded labels are stored in a fresh Series.
l = LabelEncoder()
y2 = pd.Series(l.fit_transform(y))
print(l.classes_, y2)

['academic & professional' 'accessories' 'apparels' 'audio video & tvs'
 'baby & kids room essentials' 'baby care & maternity'
 'baby gear & nursery' 'badminton' 'bags & luggage' 'bags & wallets'
 'bakeware' 'bathroom accessories' 'bike accessories' 'body care'
 'bottles flasks & jugs' 'boys' 'calculators' 'camera & accessories'
 'car care & tool kits' 'car exterior accessories'
 'car interior accessories' 'certified refurbished mobiles'
 'comfort & safety' 'computer accessories'
 'computer accessories & peripherals' 'cooking essentials' 'cookware'
 'cricket' 'diaries & notebooks' 'dining & serving' 'entrance exam'
 'ethnic wear' 'eyewear' 'face care' 'fashion accessories'
 'files & folders' 'fitness accessories' 'fitness equipment' 'foods'
 'football' 'footwear' 'gaming' 'gardening' 'gas stoves' 'girls'
 'hair care' 'hardware fittings' 'headphones & headsets'
 'health & nutrition' 'helmets' 'home appliances' 'home care' 'home decor'
 'home furnishings' 'imitation jewellery' 'infants'


In [None]:
'''
Working with the stemmed data
Splitting the dataset in train and test set
Vectorizing the train and test set
Training and predicting with Stochastic Gradient Descent
'''
# Features: stemmed descriptions. Target: the integer-encoded categories (y2).
X2 = dataset['stemmed data']

# A fixed seed makes the 80/20 split, and so the reported metrics, reproducible.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.20, random_state=42
)

# NOTE: vectorize1 re-fits the vectorizer shared by all Model instances, so from
# here on vectorize2 uses the vocabulary of the stemmed training split.
train_vec2 = model.vectorize1(X2_train)
test_vec2 = model.vectorize2(X2_test)

# Train the SGD classifier and report its performance on the held-out split.
clf2 = model.train_sgd(train_vec2, y2_train)
y_pred2 = model.predict(clf2, test_vec2)
model.analyze(y_pred2, y2_test)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.78      0.58      0.66        85
           1       0.83      0.93      0.88      1353
           2       0.76      0.89      0.82      1033
           3       0.73      0.65      0.69        17
           4       0.67      0.08      0.15        49
           5       0.93      0.72      0.81       209
           6       0.96      0.55      0.70        42
           7       0.97      0.94      0.96        34
           8       0.74      0.65      0.69       225
           9       0.93      0.82      0.88       411
          10       0.77      0.86      0.81       114
          11       0.88      0.78      0.83       338
          12       0.87      0.89      0.88       664
          13       0.00      0.00      0.00        31
          14       0.00      0.00      0.00        37
          15       0.87      0.11      0.19       184
          16       0.90      0.90      0.90        29
          17       0.89    

In [None]:
# LightGBM parameters for the multiclass model. The numeric values below appear
# to be the result of an earlier hyper-parameter search — TODO confirm their origin.
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    # Taken from the fitted label encoder so it always matches the encoded
    # targets (this is 99 for the dataset shown in the notebook outputs).
    'num_class': len(l.classes_),
    'metric': 'multi_logloss',
    'lambda_l1': 2.047032859234417e-06,
    'lambda_l2': 0.8240583764526691,
    'num_leaves': 200,
    'max_depth': 18,
    'feature_fraction': 0.7267639893023067,
    'bagging_fraction': 0.9955512019618731,
    'bagging_freq': 3,
    'min_child_samples': 5,
}

In [None]:
'''
Training with LightGBM
'''
clf3 = model.train(train_vec2, y2_train, params)

# The booster's predict output is treated as one row of per-class scores per
# sample, so it is kept under its own name instead of being stored in y_pred3.
y_proba3 = model.predict(clf3, test_vec2)

# Predicted label = index of the highest score in each row (vectorized argmax).
y_pred3 = np.argmax(y_proba3, axis=1)
model.analyze(y_pred3, y2_test)

              precision    recall  f1-score   support

           0       0.79      0.62      0.70        85
           1       0.93      0.95      0.94      1353
           2       0.80      0.88      0.84      1033
           3       0.57      0.76      0.65        17
           4       0.81      0.71      0.76        49
           5       0.89      0.88      0.89       209
           6       0.95      0.88      0.91        42
           7       1.00      0.97      0.99        34
           8       0.83      0.79      0.81       225
           9       0.96      0.93      0.95       411
          10       0.92      0.87      0.89       114
          11       0.91      0.87      0.89       338
          12       0.94      0.91      0.92       664
          13       0.93      0.45      0.61        31
          14       0.53      0.24      0.33        37
          15       0.54      0.33      0.41       184
          16       0.96      0.93      0.95        29
          17       0.91    

In [None]:
# Print the full (untruncated) confusion matrix for the LightGBM predictions.
# The context manager lifts NumPy's summarization threshold for this print only,
# instead of changing the global print options for the rest of the notebook.
# Argument order is (true labels, predicted labels), matching Model.analyze;
# the previous call had them swapped, which printed the transposed matrix.
with np.printoptions(threshold=np.inf):
    print(metrics.confusion_matrix(y2_test, y_pred3))

[[   53     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     1     0     9     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     1     1     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     1     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     1     0     0     0     0     0     0     0
      0     0     0]
 [    0  1279     0     0     0     0     0     1     2     5     0     0
      6     0     0     0     0     0     0     0     1     0     0     0
      0     0     0     0     0     0     0     1     1     0    33    18
      3     0     0     0     5     1     0     0     0     0     0     0
      4     0     0     0     0     2     0     1     0     1     0     0
      0     0    

      0     0    39]]


In [None]:
'''
Saving a trained classifier for later use
'''
# The classifiers can only be reused together with the TF-IDF vocabulary they were
# trained on and the label encoder that maps class ids back to category names,
# so persist those two objects as well.
joblib.dump(Model.tfidf_transformer, 'tfidf_vectorizer.pkl')
joblib.dump(l, 'label_encoder.pkl')

# Trained classifiers: SGD (clf2) and LightGBM (clf3).
joblib.dump(clf2, 'sgd_clf.pkl')
joblib.dump(clf3, 'lgb_clf.pkl')

['lgb_clf.pkl']

In [None]:
'''
Load a pretrained classifier
'''
import joblib

# Restore both saved classifiers in one pass.
# joblib.load unpickles the file contents, so only load files you created yourself.
sgd_clf, lgb_clf = (joblib.load(path) for path in ('sgd_clf.pkl', 'lgb_clf.pkl'))

In [None]:
def fun(s):
    """Classify one product description with both loaded models and print the results.

    Relies on the notebook globals ``model`` (cleaning and vectorizing),
    ``sgd_clf`` / ``lgb_clf`` (the loaded classifiers) and ``l`` (the label
    encoder used to turn class ids back into category names).

    Args:
        s: the raw description text.

    Returns:
        None; the two predicted categories are printed.
    """
    # Stem-clean the text and vectorize it as a one-document batch.
    features = model.vectorize2([model.clean2(s)])

    sgd_label_ids = sgd_clf.predict(features)

    # np.argmax without an axis works on the flattened scores, which is fine
    # here because exactly one document is scored.
    lgb_scores = lgb_clf.predict(features)
    lgb_label_ids = [np.argmax(lgb_scores)]

    print("Predicted category by Stocastic Gradient Descent Classifier:   ",l.classes_[sgd_label_ids])
    print("Predicted category by LightGBM Classifier :   ",l.classes_[lgb_label_ids])

In [None]:
# Ask for a free-text product description and print the category predicted by
# each model (see fun).
# NOTE: input() waits for keyboard input, so this cell stalls an unattended "Run All".
s = input("Give description of a product to be classified : ")
fun(s)

Give description of a product to be classified : nice red saree
Predicted category by Stocastic Gradient Descent Classifier:    ['ethnic wear']
Predicted category by LightGBM Classifier :    ['ethnic wear']


In [None]:
# Disabled Optuna hyper-parameter search. It is kept inside a string literal, so
# nothing below is executed; the cell only evaluates to the string itself.
# The snippet would run a 100-trial study that maximizes validation accuracy
# of a LightGBM multiclass model over the sampled parameters.
# NOTE(review): the snippet calls model.vectorize(...) and reads
# model.train_vector / model.test_vector, none of which are defined on the Model
# class in this notebook (it only has vectorize1 / vectorize2) — update those
# calls before re-enabling this code.
# NOTE(review): lgb.train is called here without a round count, whereas
# Model.train passes 100 — confirm the intended number of boosting rounds.
'''

def objective(trial):
    
    train_x, valid_x, train_y, valid_y = train_test_split(X2, y2, test_size=0.25)
    model.vectorize(train_x, valid_x)
    train_x = model.train_vector
    valid_x = model.test_vector
    train_y = np.array(train_y)
    valid_y = np.array(valid_y)
    dtrain = lgb.Dataset(train_x, label=train_y)

    param = {
            "objective": "multiclass",
            "metric": "multi_logloss",
            "num_class" : 99,
            "boosting_type": "gbdt",
            "gpu_use_dp": True,
            "save_binary": True,
            "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
            "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
            "num_leaves": trial.suggest_int("num_leaves", 2, 256),
            "max_depth": trial.suggest_int("max_depth",5,25),
            "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
            "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 4),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 100)
        }
    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(valid_x)
    pred_labels =[np.argmax(arr) for arr in preds] 
    accuracy = metrics.accuracy_score(valid_y, pred_labels)
    return accuracy


if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))
        
'''

'\n\ndef objective(trial):\n    \n    train_x, valid_x, train_y, valid_y = train_test_split(X2, y2, test_size=0.25)\n    model.vectorize(train_x, valid_x)\n    train_x = model.train_vector\n    valid_x = model.test_vector\n    train_y = np.array(train_y)\n    valid_y = np.array(valid_y)\n    dtrain = lgb.Dataset(train_x, label=train_y)\n\n    param = {\n            "objective": "multiclass",\n            "metric": "multi_logloss",\n            "num_class" : 99,\n            "boosting_type": "gbdt",\n            "gpu_use_dp": True,\n            "save_binary": True,\n            "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),\n            "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),\n            "num_leaves": trial.suggest_int("num_leaves", 2, 256),\n            "max_depth": trial.suggest_int("max_depth",5,25),\n            "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),\n            "bagging_fraction": trial.suggest_uniform("ba

In [None]:
'''
Using the tuned parameters obtained from Optuna
Training with LightGBM
We can use this part after running the optuna instance
'''
# params = trial.params
# params['boosting_type'] = 'gbdt'
# params['objective'] = 'multiclass'
# params['num_class'] = 99
# params['metric'] = 'multi_logloss'

# clf3 = model.train(train_vec2, y2_train, params)
# y_pred3 = model.predict(clf3, test_vec2)
# y_pred3 = [np.argmax(arr) for arr in y_pred3]
# model.analyze(y_pred3, y2_test)

'\nUsing the tuned poarameters obtained from optuna\nTraining with LightGBM\nWe can use this part after running the optuna instance\n'