Group 5 <br>
Python Implementation => Deliverable


## Implementing an ML model to predict the category of a product from its product name

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from gensim.models import Word2Vec,KeyedVectors
from sklearn.preprocessing import LabelEncoder
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Data Input

In [None]:
# Labelled training data. Later cells read its 'product_name' column (features) and
# 'category' column (target).
# NOTE(review): the absolute path looks like a Colab Google Drive mount - confirm it
# exists in the environment running this notebook.
mapping_data = pd.read_csv("/content/drive/MyDrive/QUANTUM ANALYTICA- CANNABIS/product_category_mappings_new_1.csv")

### Data Cleaning

In [None]:
# Strip every run of digits from the product names. regex=True is passed explicitly:
# pandas >= 2.0 treats the pattern as a literal string by default, which would leave
# the digits in place.
mapping_data['product_name'] = mapping_data['product_name'].str.replace(r'\d+', '', regex=True)

# Build the stop-word set once; calling stopwords.words('english') for every token
# rebuilds the list each time and makes every membership test a linear scan.
english_stopwords = set(stopwords.words('english'))


def clean_product_name(name):
    """Return `name` reduced to lower-cased English, non-stop-word tokens.

    A token is kept when WordNet has at least one synset for it (looked up with the
    token's original casing) and its lower-cased form is not an English stop word.
    The surviving tokens are joined with single spaces.

    Missing or non-string names produce an empty string instead of raising, so the
    row is simply filtered out later when its token list is empty.
    """
    if not isinstance(name, str):
        return ''
    english_tokens = [w.lower() for w in name.split() if wordnet.synsets(w)]
    return ' '.join(w for w in english_tokens if w not in english_stopwords)


# One pass over the column replaces the two row-by-row iterrows() loops.
mapping_data['product_name'] = mapping_data['product_name'].apply(clean_product_name)

# Tokenization: whitespace-split the cleaned name into a list of tokens.
mapping_data['product_names_token'] = mapping_data['product_name'].apply(lambda x: x.split())

### Loading Pre Trained Word Embedding Model

In [None]:
# Download the pre-trained GoogleNews word2vec archive (about 1.5 GB).
# `-c` resumes a partial download and does nothing when the file is already complete.
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

# Load the pre-trained word2vec model used to look up an embedding for each token.
# limit=1000000 reads only the first one million vectors from the file.
model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz',binary=True,limit=1000000)

--2021-06-27 13:44:28--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.199.8
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.199.8|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2021-06-27 13:44:51 (70.2 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



### Data Preparation

In [None]:
def token_check(x,model):
  """
  Return the distinct tokens of `x` that exist in the embedding model's vocabulary.

  Parameters
  ----------
  x : iterable of str
      Tokens of one product name.
  model : container of str
      Anything supporting `token in model`; gensim KeyedVectors implement this in
      both the 3.x and 4.x series (the `.vocab` attribute was removed in gensim 4).

  Returns
  -------
  list of str
      In-vocabulary tokens, de-duplicated, in order of first appearance.
      No minimum-length filter is applied.
  """
  seen = set()
  token_list = []
  for token in x:
      # Skip repeats and tokens the model has no vector for.
      if token in seen or token not in model:
          continue
      seen.add(token)
      token_list.append(token)
  return token_list

def word_vector(tokens,size,model):
  """
  Average the embedding vectors of `tokens`.

  Each token is looked up with `model[token]` and reshaped to (1, size). The result
  is the element-wise mean as a (1, size) float array; when `tokens` is empty the
  all-zero array is returned unchanged.
  """
  accumulator = np.zeros((1, size))
  n_tokens = 0
  for n_tokens, token in enumerate(tokens, start=1):
    accumulator = accumulator + np.reshape(model[token], (1, size))
  if n_tokens == 0:
    return accumulator
  return accumulator / float(n_tokens)

In [None]:
# Keep only the distinct tokens that the embedding model has a vector for.
mapping_data['product_names_token'] = mapping_data['product_names_token'].apply(lambda x: token_check(x, model))
mapping_data['length_of_tokens'] = mapping_data['product_names_token'].apply(len)

# Drop products left with no usable token. The explicit .copy() makes new_data an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning; reset_index gives it a clean 0..n-1 index that lines up
# with the rows of the embedding matrix built below.
new_data = mapping_data[mapping_data['length_of_tokens'] > 0].copy().reset_index(drop=True)

# One 300-dimensional averaged embedding per remaining product (GoogleNews vectors are 300-d).
wordvec_arrays = np.zeros((len(new_data), 300))
for i, tokens in enumerate(new_data['product_names_token']):
  wordvec_arrays[i, :] = word_vector(tokens, 300, model)

vectorized_df = pd.DataFrame(wordvec_arrays)

In [None]:
vectorized_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,-0.017822,-0.007568,0.176758,0.070618,0.094482,-0.149536,0.11412,-0.095215,0.040161,0.036133,0.039429,-0.05835,-0.059143,-0.153625,-0.291016,0.138184,0.03418,0.25293,-0.038086,-0.077393,-0.161011,0.331055,0.026123,-0.283691,-0.084656,0.210693,-0.012939,-0.013184,0.138672,-0.073578,0.098633,0.104492,0.182617,0.079895,0.029358,-0.016327,0.147949,0.056549,-0.08313,0.109741,...,0.035034,-0.075562,-0.082031,0.169159,0.114624,0.186279,-0.066406,-0.077759,-0.099609,-0.099365,0.070526,-0.051636,0.08252,0.052612,-0.044434,-0.026855,-0.162598,-0.119751,-0.069092,0.226562,0.111572,-0.003174,0.05957,0.025146,-0.015137,0.099487,-0.041016,-0.015869,-0.086792,-0.053467,-0.067017,-0.016846,-0.158447,-0.073517,0.01709,-0.124512,0.053955,-0.021606,0.035706,-0.252441
1,-0.112305,-0.285645,0.063354,0.031982,-0.238281,0.169922,0.28125,-0.088867,0.307617,-0.044922,-0.10083,-0.033081,0.162109,0.269043,-0.216309,0.286133,-0.035645,0.081299,0.119629,-0.287842,0.264648,0.139343,0.046265,0.098816,-0.123047,-0.051544,-0.041199,0.192871,0.075684,-0.124023,-0.020386,-0.047852,-0.126709,0.20752,0.224609,0.194824,-0.102661,0.237305,-0.06311,-0.205322,...,0.086914,-0.107178,0.125488,0.188965,0.057617,0.317871,-0.134521,-0.082031,-0.227051,-0.090942,0.18457,-0.00354,0.191406,0.166748,-0.053772,0.118042,0.010742,-0.139648,0.208252,0.165771,0.096924,-0.220215,-0.032227,0.237305,-0.15918,-0.040283,-0.14624,0.145508,0.155029,-0.057251,0.049805,-0.055904,0.132812,0.190308,0.030273,0.049316,-0.091675,0.006836,-0.165039,0.044312
2,-0.018311,0.018433,0.126526,0.141357,-0.094727,-0.134766,0.079941,-0.159912,-0.057007,0.104492,-0.043213,0.020874,-0.068176,-0.031067,-0.224854,0.055664,-0.195312,0.185547,-0.151123,-0.202637,-0.013123,0.102661,0.054932,-0.28418,-0.148926,0.012085,-0.139893,0.176025,0.050415,-0.010498,0.019531,0.168213,0.17041,0.022278,-0.020508,-0.029541,0.199219,0.045715,-0.084351,0.05188,...,0.081543,0.002197,-0.206787,-0.004974,0.137451,0.007812,-0.1521,-0.096313,0.154297,-0.063461,0.035645,0.040283,-0.043213,0.069092,0.099609,0.035095,-0.132324,-0.188721,-0.055786,0.143059,-0.071533,0.154785,0.249512,0.010925,0.059021,-0.029053,0.069481,0.007568,-0.202637,-0.039062,-0.009399,-0.076843,-0.135986,0.028412,0.081375,-0.18457,0.155762,-0.002441,0.082092,-0.076973
3,-0.007324,0.156738,0.156738,-0.111694,-0.036316,-0.186401,0.054688,-0.072754,0.213867,0.057861,0.049774,-0.065674,-0.008789,0.024963,-0.257324,-0.178223,-0.0271,0.165039,-0.033123,-0.118835,0.080566,0.055298,0.030396,-0.072083,-0.038818,-0.013184,-0.134766,0.104721,0.044922,-0.064453,-0.161621,0.134521,-0.044861,0.041077,-0.093018,-0.095062,0.06665,-0.001845,0.05957,0.055176,...,-0.039062,0.05127,-0.111328,-0.077881,0.323242,-0.087402,-0.247559,-0.042236,-0.142822,-0.224609,-0.012228,0.104004,0.133789,0.095703,0.018311,-0.012207,-0.246094,-0.410156,-0.019043,0.187866,0.017578,-0.022949,0.203613,-0.000488,-0.057617,-0.105225,-0.250488,-0.024414,-0.146973,-0.06543,-0.040649,-0.06897,-0.103516,0.040771,-0.079834,-0.018555,0.019287,-0.062561,0.144287,0.009277
4,0.121094,-0.046875,0.035889,0.283203,-0.166016,-0.080566,0.061768,-0.484375,-0.094727,-0.004883,-0.134766,0.113281,0.212891,0.099121,-0.134766,-0.225586,-0.242188,0.15625,-0.069824,-0.21875,-0.147461,0.298828,-0.010193,-0.22168,-0.225586,-0.031738,-0.173828,0.046875,0.025024,-0.22168,-0.043945,0.241211,0.125977,0.016235,-0.108398,0.068359,0.363281,-0.004944,-0.088379,0.222656,...,0.160156,0.039551,-0.096191,0.24707,0.066895,-0.038574,-0.061035,0.255859,0.105957,-0.00267,0.085938,0.08252,-0.10791,0.010986,0.408203,-0.026733,-0.117188,-0.230469,0.040527,0.165039,-0.032227,0.137695,0.28125,-0.050781,0.007111,0.01178,0.069824,-0.18457,-0.049316,-0.15625,0.105469,-0.091797,-0.120117,-0.21582,0.15918,-0.065918,0.061523,-0.040039,0.259766,0.25


### Data PreProcessing

In [None]:
# One-hot encoding of the target product categories.
target_one_hot_encoded = pd.get_dummies(new_data['category'])

# Integer-encode the categories (one code per distinct category).
labelencoder = LabelEncoder()

# assign() returns a new frame, so the column is never written onto a slice of
# mapping_data (the cause of pandas' SettingWithCopyWarning).
new_data = new_data.assign(category_code=labelencoder.fit_transform(new_data['category']))

# Attach the integer label to the embedding features; both frames share the same
# row index, so the merge is a row-aligned join.
dataset_final = pd.merge(vectorized_df,new_data['category_code'],left_index=True, right_index=True)

# Shuffle the rows before splitting. A seeded generator keeps the row order, and
# therefore every downstream train/test split and score, reproducible across runs.
shuffle_rng = np.random.RandomState(101)
dataset_final_shuffled = dataset_final.reindex(shuffle_rng.permutation(dataset_final.index))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
dataset_final_shuffled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,category_code
3586,-0.232422,0.237305,0.179688,-0.004761,0.183594,-0.025513,0.039062,-0.482422,-0.292969,0.099609,-0.213867,-0.124512,0.005188,0.100586,-0.230469,0.150391,-0.177734,0.103516,-0.070801,-0.227539,0.048584,0.267578,-0.126953,-0.146484,0.027588,-0.120605,-0.086426,0.429688,-0.386719,0.169922,-0.10498,0.105469,0.271484,-0.051514,-0.137695,0.207031,-0.117676,0.036865,0.013,0.133789,...,0.15918,-0.189453,0.220703,0.130859,-0.086426,-0.148438,-0.018921,-0.149414,-0.25,0.119141,0.402344,-0.095703,0.053955,0.000835,-0.000238,-0.004425,-0.100586,-0.152344,0.238281,-0.003738,0.070312,0.125,0.289062,-0.125,-0.067383,0.166016,0.025391,-0.139648,0.060303,0.098145,-0.139648,-0.181641,0.004974,0.080078,-0.138672,-0.076172,-0.097656,0.196289,-0.114746,5
2968,-0.157251,-0.003418,0.09502,0.328711,0.058789,-0.023419,0.090479,-0.230371,-0.150391,0.179492,-0.014893,-0.049121,-0.099561,0.021423,-0.165869,0.168811,-0.278027,0.046894,-0.150586,-0.138379,-0.002899,0.059259,0.114941,-0.041101,0.012671,-0.004858,-0.137109,0.084888,0.005072,0.066211,0.008276,-0.004053,-0.108435,0.045264,-0.194104,0.008276,0.034831,-0.051117,0.100781,0.152057,...,0.159521,-0.26149,0.135156,-0.019678,-0.066455,-0.087183,0.110547,-0.103955,-0.191357,0.003882,0.267578,0.079761,-0.152246,0.017383,-0.134082,-0.191748,-0.086401,-0.0627,0.123157,-0.053479,0.142993,0.139789,0.061646,-0.241504,-0.203369,-0.173999,0.205249,-0.085156,0.18877,-0.228516,-0.17793,0.067969,-0.028632,0.117853,-0.070178,-0.037891,0.2375,0.107593,-0.015039,5
9461,-0.115804,0.092794,0.142253,0.015747,-0.115234,0.046967,0.027018,-0.101318,-0.006836,0.251546,-0.070312,-0.081543,-0.196802,-0.004395,-0.196045,0.100586,-0.16569,0.205078,-0.187826,-0.158773,-0.038086,0.050598,-0.097656,0.037882,-0.136434,-0.015828,-0.255941,0.160156,-0.084473,-0.038167,0.035807,-0.124512,0.081055,0.103516,-0.094076,0.173991,-0.055216,0.060343,-0.01416,0.112305,...,0.191732,-0.150553,-0.050537,0.07373,-0.072835,0.046224,-0.050456,-0.201823,-0.047852,0.158529,-0.045247,-0.134277,0.080892,0.05778,-0.197428,-0.092285,-0.172852,-0.054138,0.154622,-0.141357,0.076335,0.202881,-0.100708,-0.096842,-0.2052,-0.013509,-0.051351,-0.053223,0.043294,0.015462,-0.019043,-0.027669,0.009115,0.094727,-0.096354,-0.076497,0.022298,-0.123861,0.155599,3
4896,-0.107962,0.001011,0.042201,0.179984,0.055246,-0.035889,-0.031642,-0.195871,-0.111747,0.113926,-0.019409,-0.166643,-0.07693,0.101772,-0.144357,0.094831,-0.187709,0.065465,-0.040109,-0.161272,0.025844,0.082493,0.015206,-0.117083,0.076416,-0.083984,-0.177734,0.209734,0.020015,-0.050694,0.043039,-0.042557,-0.046038,-0.011754,-0.109358,-0.063198,-0.133079,-0.097203,-0.014256,0.238669,...,-0.023891,-0.189638,0.020054,-0.001212,-0.099124,-0.138375,0.005598,-0.188177,-0.216343,-0.012835,0.199044,-0.017795,0.114432,0.078768,-0.04234,-0.11911,-0.039934,-0.148856,0.115234,-0.057401,-0.012451,0.16762,0.099269,-0.163853,-0.124329,-0.000174,0.108555,-0.066616,0.183629,0.001055,-0.106916,-0.039342,0.067169,0.059082,-0.067387,-0.020595,0.122201,0.190988,-0.087306,5
5096,-0.462891,0.02417,0.223633,0.253906,0.365234,-0.093262,0.143555,-0.049316,-0.24707,0.061523,-0.337891,0.100586,-0.271484,0.038086,-0.12793,-0.105469,-0.507812,0.072754,0.185547,-0.402344,-0.178711,-0.073242,-0.165039,-0.402344,-0.069824,-0.285156,-0.382812,0.431641,0.001434,-0.244141,0.084961,0.194336,-0.511719,0.05249,-0.574219,-0.116211,-0.125,-0.020508,0.032227,-0.00412,...,-0.072754,0.003098,-0.660156,0.083496,0.054688,0.099121,-0.106934,0.038574,-0.285156,-0.205078,0.316406,0.147461,0.09375,0.191406,-0.369141,-0.40625,0.185547,0.212891,0.211914,-0.367188,0.257812,0.310547,0.010864,-0.494141,-0.390625,0.062012,0.640625,-0.236328,0.087402,-0.100586,-0.349609,0.257812,0.285156,0.109863,0.004822,-0.419922,0.140625,0.400391,-0.039551,2


## Modeling

### Support Vector Machine

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

target = target_one_hot_encoded.columns
# Features are the 300 embedding columns; the label is the integer category code.
X = dataset_final_shuffled.drop('category_code', axis=1)
y = dataset_final_shuffled['category_code']

# 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.75,test_size=0.25,random_state=101)

# The estimator gets its own name: binding it to `model` would overwrite the word2vec
# embeddings that token_check / word_vector receive as `model`, so the
# vectorisation cells could no longer be re-run after this one.
svc_estimator = SVC()
param_grid = { 'C':[0.1,1],'kernel':['rbf'],'gamma': [1, 0.1]}
grid = GridSearchCV(svc_estimator,param_grid)
grid.fit(X_train,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1], 'gamma': [1, 0.1], 'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [None]:
print(grid.best_params_)
print(grid.score(X_test,y_test))

{'C': 1, 'gamma': 1, 'kernel': 'rbf'}
0.7546481954064892


In [None]:
print(grid.best_params_)
print(grid.score(X_train,y_train))

{'C': 1, 'gamma': 1, 'kernel': 'rbf'}
0.9212253829321663


In [None]:
print(grid)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1], 'gamma': [1, 0.1], 'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)


In [None]:
# Summarize how well the tuned SVM fits the data it was trained on.
from sklearn import metrics
# Predictions are made on X_train and compared with y_train, so the report and
# confusion matrix below describe training-set performance, not held-out performance.
expected = y_train
predicted = grid.predict(X_train)

print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))


              precision    recall  f1-score   support

           0       0.96      0.78      0.86       415
           1       0.95      0.62      0.75       145
           2       0.92      0.43      0.59        76
           3       0.95      0.80      0.87       387
           4       0.89      0.95      0.92      1695
           5       0.98      0.99      0.99      2342
           6       0.84      0.93      0.88       804
           7       0.89      0.98      0.94      1041
           8       1.00      0.40      0.57        53
           9       0.90      0.86      0.88       912
          10       1.00      0.93      0.97        46
          11       0.95      0.50      0.66        40
          12       0.96      0.95      0.95       157
          13       0.99      0.86      0.92       113

    accuracy                           0.92      8226
   macro avg       0.94      0.79      0.84      8226
weighted avg       0.92      0.92      0.92      8226

[[ 323    4    0    0    

### Linear Discriminant Analysis

In [None]:
# evaluate a lda model on the dataset
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
# grid search solver for lda
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
#splitting of data
from sklearn.model_selection import train_test_split

target = target_one_hot_encoded.columns
X = dataset_final_shuffled.drop('category_code', axis=1)
y = dataset_final_shuffled['category_code']

# Same split parameters (75/25, random_state=101) as the other model sections.
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.75,test_size=0.25,random_state=101)

# define model
# Named lda_estimator rather than `model`, so the word2vec embeddings bound to
# `model` are not overwritten by a scikit-learn estimator.
lda_estimator = LinearDiscriminantAnalysis()

# define model evaluation method: 10-fold stratified CV repeated 10 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)

# define grid
# Named lda_param_grid rather than `grid`, so the fitted SVM GridSearchCV object
# stored in `grid` stays available to the SVM reporting cells.
lda_param_grid = {'solver': ['svd', 'lsqr', 'eigen']}
# define search
search = GridSearchCV(lda_estimator, lda_param_grid, scoring='accuracy', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X_train, y_train)

In [None]:
# summarize
print('Mean Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

Mean Accuracy: 0.664
Config: {'solver': 'lsqr'}


In [None]:
results.score(X_test,y_test)

0.6616842872767044

### Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV

# Base LightGBM classifier. learning_rate, n_estimators and num_leaves are also listed
# in param_grid below, so the grid values are the ones actually tried for each candidate.
estimator = lgb.LGBMClassifier(boosting_type='gbdt', 
                               learning_rate = 0.125, 
                               metric = 'multiclass', n_estimators = 20,
                               num_leaves = 38)

# Search space: 2 n_estimators x 3 learning_rate x 2 num_leaves = 12 candidates;
# every other entry is a single fixed value.
param_grid = {
    'n_estimators': [x for x in [75,100]],
    'learning_rate': [0.01,0.1,0.2],
    'num_leaves': [5,10],
    'boosting_type' : ['gbdt'],
    'objective' : ['multiclass'],
    'metric': ['multiclass'],
    'lambda_l1' : [0.10],
    'lambda_l2' : [0.20],
    'random_state' : [501]}

gridsearch = GridSearchCV(estimator, param_grid)

# NOTE(review): the test split (X_test, y_test) is passed as the early-stopping
# evaluation set for every candidate fit, so the test data influences training and
# model selection - consider a separate validation split.
gridsearch.fit(X_train, y_train.values.ravel(),eval_set = [(X_test, y_test)],eval_metric = ['multi_logloss'],early_stopping_rounds = 10)

print('Best parameters found by grid search are:', gridsearch.best_params_)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[57]	valid_0's multi_logloss: 1.68057	valid_0's multi_logloss: 1.68057
[58]	valid_0's multi_logloss: 1.67649	valid_0's multi_logloss: 1.67649
[59]	valid_0's multi_logloss: 1.67253	valid_0's multi_logloss: 1.67253
[60]	valid_0's multi_logloss: 1.66843	valid_0's multi_logloss: 1.66843
[61]	valid_0's multi_logloss: 1.66453	valid_0's multi_logloss: 1.66453
[62]	valid_0's multi_logloss: 1.66065	valid_0's multi_logloss: 1.66065
[63]	valid_0's multi_logloss: 1.65679	valid_0's multi_logloss: 1.65679
[64]	valid_0's multi_logloss: 1.65279	valid_0's multi_logloss: 1.65279
[65]	valid_0's multi_logloss: 1.64893	valid_0's multi_logloss: 1.64893
[66]	valid_0's multi_logloss: 1.64498	valid_0's multi_logloss: 1.64498
[67]	valid_0's multi_logloss: 1.64096	valid_0's multi_logloss: 1.64096
[68]	valid_0's multi_logloss: 1.63719	valid_0's multi_logloss: 1.63719
[69]	valid_0's multi_logloss: 1.6333	valid_0's multi_logloss: 1.6333
[70]	valid_0's

In [None]:
# Final LightGBM classifier with hard-coded hyperparameters
# (learning_rate=0.2, n_estimators=100, num_leaves=10, L1/L2 regularisation 0.1/0.2).
gbm = lgb.LGBMClassifier(boosting_type= 'gbdt', lambda_l1= 0.1, lambda_l2= 0.2,learning_rate= 0.2, metric= 'multiclass', n_estimators = 100, num_leaves = 10,objective= 'multiclass', random_state= 501)

# Training stops early once the evaluation score has not improved for 20 rounds.
# NOTE(review): the evaluation set is the test split, so the test accuracy reported
# afterwards is not independent of training.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=['multiclass'],
early_stopping_rounds=20)


[1]	valid_0's multi_logloss: 1.84723
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's multi_logloss: 1.73949
[3]	valid_0's multi_logloss: 1.65169
[4]	valid_0's multi_logloss: 1.58072
[5]	valid_0's multi_logloss: 1.51868
[6]	valid_0's multi_logloss: 1.46674
[7]	valid_0's multi_logloss: 1.41964
[8]	valid_0's multi_logloss: 1.37827
[9]	valid_0's multi_logloss: 1.34367
[10]	valid_0's multi_logloss: 1.31045
[11]	valid_0's multi_logloss: 1.28046
[12]	valid_0's multi_logloss: 1.25366
[13]	valid_0's multi_logloss: 1.22946
[14]	valid_0's multi_logloss: 1.20829
[15]	valid_0's multi_logloss: 1.18746
[16]	valid_0's multi_logloss: 1.16869
[17]	valid_0's multi_logloss: 1.15324
[18]	valid_0's multi_logloss: 1.13798
[19]	valid_0's multi_logloss: 1.12339
[20]	valid_0's multi_logloss: 1.10764
[21]	valid_0's multi_logloss: 1.09362
[22]	valid_0's multi_logloss: 1.08091
[23]	valid_0's multi_logloss: 1.07094
[24]	valid_0's multi_logloss: 1.05961
[25]	valid_0's multi_logloss: 1.048

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', lambda_l1=0.1, lambda_l2=0.2,
               learning_rate=0.2, max_depth=-1, metric='multiclass',
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=10,
               objective='multiclass', random_state=501, reg_alpha=0.0,
               reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [None]:
gbm.score(X_train,y_train)

0.9337466569414054

In [None]:
gbm.score(X_test,y_test)

0.7305869485964273

### Random Forest Model

In [None]:
#splitting of data
from sklearn.model_selection import train_test_split

target = target_one_hot_encoded.columns
# Features: every column except the integer label; label: 'category_code'.
X = dataset_final_shuffled.drop('category_code', axis=1)
y = dataset_final_shuffled['category_code']

# Same split parameters (75/25, random_state=101) as the SVM and LDA sections.
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.75,test_size=0.25,random_state=101)

In [None]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier with 100 trees.
# NOTE(review): no random_state is set, so the fitted forest can differ between runs.
clf=RandomForestClassifier(n_estimators=100)

# Train the model on the training split.
clf.fit(X_train,y_train)

# Predict category codes for the held-out test split.
y_pred=clf.predict(X_test)

In [None]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7149106817353262


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.13      0.22       141
           1       0.42      0.22      0.29        45
           2       0.00      0.00      0.00        31
           3       0.68      0.32      0.44       134
           4       0.62      0.82      0.70       571
           5       0.83      0.96      0.89       779
           6       0.64      0.50      0.56       269
           7       0.72      0.86      0.78       355
           8       0.75      0.19      0.30        16
           9       0.69      0.63      0.66       300
          10       1.00      0.14      0.25        14
          11       0.25      0.11      0.15         9
          12       0.77      0.70      0.73        43
          13       1.00      0.31      0.47        36

    accuracy                           0.71      2743
   macro avg       0.64      0.42      0.46      2743
weighted avg       0.70      0.71      0.69      2743



In [None]:
 print(confusion_matrix(y_test, y_pred))

[[ 19   3   0   0  17  16   3  76   0   1   0   0   6   0]
 [  3  10   0   0   7   4   0  21   0   0   0   0   0   0]
 [  0   0   0   0  23   4   0   0   0   3   0   1   0   0]
 [  1   0   0  43  69  17   2   2   0   0   0   0   0   0]
 [  1   0   1  15 467  36  15   5   1  28   0   2   0   0]
 [  0   0   0   0  22 746   4   1   0   5   0   0   1   0]
 [  0   2   0   3  38  36 135   6   0  48   0   0   1   0]
 [  7   9   0   0  16   8   9 304   0   1   0   0   1   0]
 [  0   0   0   0  12   0   1   0   3   0   0   0   0   0]
 [  0   0   0   2  51  14  40   3   0 190   0   0   0   0]
 [  0   0   0   0   8   3   0   1   0   0   2   0   0   0]
 [  0   0   0   0   8   0   0   0   0   0   0   1   0   0]
 [  4   0   0   0   2   7   0   0   0   0   0   0  30   0]
 [  0   0   1   0  15   3   1   4   0   1   0   0   0  11]]


#### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Forest sizes to sample from: ten evenly spaced values from 200 to 2000 trees
n_estimators = list(map(int, np.linspace(start = 200, stop = 2000, num = 10)))

# Strategies for the number of features examined at each split
max_features = ['auto', 'sqrt']

# Tree depth limits: 10, 20, ..., 110, followed by None
max_depth = list(map(int, np.linspace(10, 110, num = 11))) + [None]

# Smallest node size that may still be split
min_samples_split = [2, 5, 10]

# Smallest number of samples allowed in a leaf
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# Collect every candidate list under the RandomForestClassifier parameter it tunes
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [None]:
# Use the random grid to search for the best hyperparameters.
# Random search with 3-fold cross validation over n_iter = 5 sampled parameter
# combinations (15 fits in total), using all available cores (n_jobs = -1).
rf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 5, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train,y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 16.1min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [None]:
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 60,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 600}

In [None]:
### Best parameters and fit in the model
best_random = rf_random.best_estimator_

In [None]:
# NOTE(review): rf_random was created without a refit argument; with scikit-learn's
# default (refit=True) best_estimator_ is already fitted on X_train, so this second
# fit looks redundant and, with no random_state on the forest, may yield a slightly
# different model - confirm it is intended.
best_random.fit(X_train,y_train)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=60, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=600,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred = best_random.predict(X_test)

# Model Accuracy, how often is the classifier correct..
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7276704338315713


In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.15      0.23       141
           1       0.44      0.24      0.31        45
           2       0.00      0.00      0.00        31
           3       0.70      0.33      0.45       134
           4       0.65      0.83      0.73       571
           5       0.84      0.97      0.90       779
           6       0.67      0.54      0.60       269
           7       0.72      0.85      0.78       355
           8       0.60      0.19      0.29        16
           9       0.71      0.65      0.68       300
          10       1.00      0.07      0.13        14
          11       0.20      0.11      0.14         9
          12       0.79      0.72      0.76        43
          13       1.00      0.28      0.43        36

    accuracy                           0.73      2743
   macro avg       0.63      0.42      0.46      2743
weighted avg       0.71      0.73      0.70      2743



In [None]:
y_pred

array([ 5,  5,  7, ...,  4, 13,  5])

### Predictions over Products Data

In [None]:
# Load the product data to run predictions on.
# This rebinds `mapping_data`, replacing the labelled training frame loaded earlier;
# the cells below read its 'product_name' column.
mapping_data = pd.read_csv("/content/drive/MyDrive/QUANTUM ANALYTICA- CANNABIS/northeastern_product_data.csv")

In [None]:
# Remove every run of digits from the product names. regex=True is explicit because
# pandas >= 2.0 treats the pattern as a literal string by default, which would leave
# the digits untouched.
mapping_data['product_name'] = mapping_data['product_name'].str.replace(r'\d+', '', regex=True)

In [None]:
# Cast every cell to str so each product name can be split like text.
mapping_data = mapping_data.applymap(str)

# Build the stop-word set once; looking it up per token via stopwords.words('english')
# rebuilds the list each time and makes every membership test a linear scan.
english_stopwords = set(stopwords.words('english'))

# Single pass per name (replaces two row-by-row iterrows() loops):
#   1. keep tokens WordNet has a synset for (checked with the original casing),
#   2. lower-case them,
#   3. drop English stop words.
mapping_data['product_name'] = [
    ' '.join(
        token.lower()
        for token in name.split()
        if wordnet.synsets(token) and token.lower() not in english_stopwords
    )
    for name in mapping_data['product_name']
]

# Tokenization: whitespace-split the cleaned name into a list of tokens.
mapping_data['product_names_token'] = mapping_data['product_name'].apply(lambda x: x.split())

#vectorization
def token_check(x,model):
  """
  Return the distinct tokens of `x` that exist in the embedding model's vocabulary.

  Parameters
  ----------
  x : iterable of str
      Tokens of one product name.
  model : container of str
      Anything supporting `token in model`; gensim KeyedVectors implement this in
      both the 3.x and 4.x series (the `.vocab` attribute was removed in gensim 4).

  Returns
  -------
  list of str
      In-vocabulary tokens, de-duplicated, in order of first appearance.
      No minimum-length filter is applied.
  """
  seen = set()
  token_list = []
  for token in x:
      # Skip repeats and tokens the model has no vector for.
      if token in seen or token not in model:
          continue
      seen.add(token)
      token_list.append(token)
  return token_list

def word_vector(tokens,size,model):
  """
  Average the embedding vectors of `tokens`.

  Each token is looked up with `model[token]` and reshaped to (1, size). The result
  is the element-wise mean as a (1, size) float array; when `tokens` is empty the
  all-zero array is returned unchanged.
  """
  accumulator = np.zeros((1, size))
  n_tokens = 0
  for n_tokens, token in enumerate(tokens, start=1):
    accumulator = accumulator + np.reshape(model[token], (1, size))
  if n_tokens == 0:
    return accumulator
  return accumulator / float(n_tokens)


# Fetch the GoogleNews word2vec archive; `-c` makes wget a no-op when the file is
# already fully downloaded.
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

# (Re)load the pre-trained word2vec vectors into `model`: the vectorisation below
# needs `model` to be the embedding lookup, not a scikit-learn estimator.
# limit=1000000 reads only the first one million vectors from the file.
model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz',binary=True,limit=1000000)

# Keep only the distinct tokens that the embedding model has a vector for.
mapping_data['product_names_token'] = mapping_data['product_names_token'].apply(lambda x:token_check(x,model))

--2021-06-27 04:17:26--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.72.102
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.72.102|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [None]:
# Number of usable tokens left for each product after vocabulary filtering.
mapping_data['length_of_tokens'] = mapping_data['product_names_token'].apply(len)

# Products without any usable token cannot be vectorized, so they are dropped.
# The explicit copy makes new_data an independent frame, so later modifications
# of new_data never act on a slice of mapping_data.
new_data = mapping_data[mapping_data['length_of_tokens'] > 0].copy()

In [None]:
# Give the filtered frame a clean 0..n-1 index so its rows line up with the rows
# of vectorized_df. Reassigning (rather than inplace=True) keeps the cell re-runnable.
new_data = new_data.reset_index(drop=True)

# Take the embedding width from the loaded model instead of hard-coding 300.
embedding_dim = model.vector_size
wordvec_arrays = np.zeros((len(new_data), embedding_dim))

# One averaged embedding per product. enumerate() walks the rows in order,
# so the loop does not depend on label-based lookups.
for row_idx, tokens in enumerate(new_data['product_names_token']):
  wordvec_arrays[row_idx, :] = word_vector(tokens, embedding_dim, model)

vectorized_df = pd.DataFrame(wordvec_arrays)

In [None]:
# Preview the first rows: one row per product, one column per embedding dimension.
vectorized_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,0.10498,0.018433,0.008972,-0.012817,-0.028809,-0.059326,-0.089844,-0.068359,0.172852,0.019043,-0.039551,0.160156,0.053955,-0.004364,-0.080566,0.085938,-0.181641,0.24707,-0.240234,-0.261719,-0.122559,-0.143555,-0.078125,-0.361328,-0.151367,0.099121,-0.392578,0.09668,0.217773,0.103516,0.081543,0.055176,0.025757,-0.088379,-0.023682,0.131836,0.064453,0.003021,-0.148438,0.147461,...,-0.043213,0.043213,-0.040039,0.160156,0.074219,0.053711,-0.036865,0.074707,-0.103516,-0.040527,0.039551,0.04248,-0.194336,0.220703,0.179688,-0.119141,-0.103516,-0.275391,-0.106445,0.108398,0.141602,-0.081543,0.122559,-0.081543,-0.238281,-0.055176,-0.15332,0.00647,-0.078125,-0.443359,-0.014648,-0.024292,-0.28125,0.083496,-0.086426,-0.004333,0.175781,0.092285,0.061035,-0.132812
1,0.088867,0.077555,0.028158,0.045634,0.013346,-0.12679,0.007772,-0.32487,-0.021322,0.377604,-0.062581,0.001383,0.046305,-0.078776,-0.102539,0.083944,0.023763,0.315755,0.011068,-0.273926,-0.091227,-0.068726,0.058431,-0.201823,0.040609,-0.027751,-0.107259,0.144857,0.007161,-0.134562,-0.000814,-0.003255,0.177734,0.077311,-0.078491,-0.027181,0.003703,-0.143066,0.192708,0.095622,...,0.096761,-0.100993,-0.09464,-0.0896,0.143392,0.004567,-0.020182,-0.078786,0.041504,0.060221,-0.020345,0.173991,0.000936,0.051351,0.153768,-0.023885,-0.159912,-0.134928,-0.17981,0.176107,0.070312,-0.006671,0.078623,0.013102,-0.098958,-0.043294,-0.038086,-0.269531,0.101115,0.063314,0.091634,-0.084717,0.132202,0.012533,-0.103841,-0.073242,0.059733,0.040609,0.234029,-0.09549
2,0.001465,0.006775,0.201843,0.09668,-0.053833,-0.112305,0.033691,-0.077881,-0.135132,0.212036,0.068115,0.02002,-0.053726,-0.199219,-0.053711,0.057373,-0.31543,0.04303,0.123444,0.095306,-0.085693,0.068359,0.061707,-0.013062,0.130127,-0.220215,-0.045532,0.057373,0.00708,-0.145752,-0.172363,-0.114178,-0.158203,0.26123,-0.081543,-0.049072,0.065063,0.080933,0.051514,0.096375,...,0.13501,-0.334961,-0.250488,0.060364,0.478516,0.116699,0.019775,-0.21582,-0.109619,-0.059204,0.230469,-0.157715,0.108398,-0.080566,-0.136662,-0.083374,0.097412,-0.027649,-0.099335,-0.013062,-0.029785,-0.111328,0.272949,0.049561,-0.03894,-0.566406,-0.09642,0.126404,0.06105,0.165283,-0.004395,-0.017822,-0.199707,0.237793,0.076172,-0.077148,-0.041016,0.11676,0.062134,-0.127441
3,0.012492,0.031169,0.193522,0.154297,-0.15918,0.01476,0.006185,-0.224365,-0.070475,0.157227,-0.086589,-0.022502,-0.161296,-0.001322,-0.228027,0.18988,-0.173991,0.124462,-0.174967,-0.130371,0.015951,0.026194,0.092163,-0.052673,0.093058,0.004842,0.054281,0.030518,-0.039714,0.009277,-0.083984,0.108805,-0.055705,-0.137207,-0.085775,0.076945,0.040365,-0.017161,0.191406,0.058187,...,0.076233,0.265625,-0.282552,0.233887,-0.033691,-0.015096,-0.178385,0.103353,-0.111898,-0.178955,0.030884,0.281576,0.15686,-0.151693,-0.056478,-0.097061,-0.217936,-0.168945,-0.184578,-0.023214,0.032369,0.161336,0.15388,0.012044,-0.341471,-0.134928,-0.101359,0.053548,-0.043701,0.219727,-0.288411,-0.121867,0.054301,-0.092927,0.086589,-0.143717,0.037272,0.258464,-0.175496,-0.076335
4,0.059692,0.122803,0.234863,0.016785,0.04657,0.113525,0.044556,-0.34082,0.229004,0.147827,0.219238,-0.052734,-0.083252,0.063354,0.011108,0.186615,0.009537,0.072021,0.057861,0.145264,-0.032959,-0.014648,-0.114746,-0.03009,-0.002563,-0.203857,0.057922,-0.080078,0.093994,-0.00769,0.000397,-0.007812,-0.042542,-0.002441,0.030884,0.081604,0.133789,0.001953,-0.070469,0.055115,...,-0.066895,-0.094482,-0.091675,0.1427,0.066895,0.114746,-0.023026,0.107056,0.122314,0.013344,-0.045532,-0.067017,-0.151611,-0.147217,0.033691,0.192383,-0.003479,-0.022461,-0.104004,-0.09137,0.077637,-0.062744,0.058472,0.133301,-0.043762,0.015869,-0.186035,-0.059875,0.07605,0.057007,0.163574,-0.139404,-0.141388,0.01123,-0.044525,-0.058594,-0.044189,-0.066895,-0.090088,0.150787


In [None]:
# Predict a class id for every vectorized product.
# NOTE(review): `clf` is defined outside this section - presumably the classifier
# fitted on the labelled mapping data; confirm it was trained on the same 300-dim features.
final_pred = clf.predict(vectorized_df)
# Echo the predicted class ids.
final_pred

array([ 6,  5, 10, ...,  5,  5,  6])

In [None]:
# csv.QUOTE_ALL is used below, but `csv` is not part of the notebook's top import
# cell; importing it here keeps this cell runnable on a fresh kernel.
import csv

# Translate the predicted class ids back to category names with the label encoder.
# assign() returns a new frame, so no column is written into a filtered frame in place.
new_data = new_data.assign(product_category_predicted=labelencoder.inverse_transform(final_pred))

# Export all columns, quoting every field, without the index.
new_data.to_csv('/content/drive/MyDrive/northeastern_product_data_predicted.csv', columns=new_data.columns,sep=",",quotechar='"',index=False,quoting=csv.QUOTE_ALL)