In [None]:
import pandas as pd
import numpy as np
import matplotlib
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import pickle

In [None]:
# Fetch the NLTK data this notebook relies on:
#   - 'stopwords' backs stopwords.words('english')
#   - the Punkt tokenizer models back word_tokenize(); they are published as
#     'punkt' in older NLTK releases and 'punkt_tab' in newer ones, so both ids
#     are requested (NOTE(review): an id unknown to the installed NLTK is
#     expected to be reported and skipped rather than raise - confirm).
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

def preprocess_text(text):
    """Normalise a piece of raw text.

    The text is lower-cased, every ASCII punctuation character (as listed in
    ``string.punctuation``) is replaced by a single space, surrounding
    whitespace is trimmed, and exactly one trailing space is appended.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string; it always ends with one space, so an empty or
        punctuation-only input yields ``" "``.
    """
    # Translation table mapping each punctuation character to a blank.
    blank_punctuation = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    cleaned = text.lower().translate(blank_punctuation).strip()
    return cleaned + " "

# STOPWORD REMOVAL
def stopword(string):
    """Drop English stop words from a whitespace-separated string.

    Args:
        string: text to filter; it is split on whitespace. (The parameter name
            shadows the ``string`` module inside this function only.)

    Returns:
        The remaining tokens joined by single spaces, in their original order.
    """
    # Read the stop-word list once per call and keep it in a set; the previous
    # version re-read the list for every token and searched it linearly.
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(token for token in string.split() if token not in english_stopwords)

def tokenizeIngredients(text):
    """Tokenise ``text`` and reduce every token to its English Snowball stem.

    Args:
        text: the string to tokenise with ``word_tokenize``.

    Returns:
        A string in which each stem is followed by one space (so any non-empty
        result ends with a space); an input with no tokens yields ``""``.
    """
    stemmer = SnowballStemmer(language='english')
    stems = [stemmer.stem(token) for token in word_tokenize(text)]
    return "".join(stem + " " for stem in stems)

In [None]:
# Load the product table, keep only the columns used in this notebook, replace
# missing values with the literal string 'NA', and order rows by the length of
# their ingredients text (shortest first).
columns = ['eanCode','name','categories','ingredients','nutrient_data','url','swap_cat']
food_items = (
    pd.read_csv('test.csv', delimiter=",")
    .loc[:, columns]
    .fillna('NA')
    .sort_values('ingredients', key=lambda col: col.str.len())
)

start = "text"  # not referenced in the visible cells; kept in case another cell reads it

# Build one cleaned ingredients string per product: split on commas, normalise
# each piece with preprocess_text, and append a space after every piece.
#
# The values are iterated in row order. The earlier version looked rows up by
# label (food_items['ingredients'][i]) after the frame had been sorted, so the
# list was built in label order but then assigned by position, attaching each
# cleaned string to the wrong product.
ingred_list = [
    ''.join(preprocess_text(ingred) + ' ' for ingred in raw_ingredients.split(','))
    for raw_ingredients in food_items['ingredients']
]
food_items['clean_ingredients'] = ingred_list

# One document per product: "name; categories; cleaned ingredients".
combined = (
    food_items['name'] + "; " + food_items['categories'] + "; " + food_items['clean_ingredients']
)
food_items_docs = combined.to_frame().iloc[:, 0].astype('U')

# Normalise and stem every document; the result is the classifier input corpus.
food_items_text = [tokenizeIngredients(preprocess_text(doc)) for doc in food_items_docs]

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Features and labels shared by every model below.
X = food_items_text
y = food_items['swap_cat']

# Bag-of-words counts over the whole corpus, ignoring NLTK's English stop words.
english_stop_words = list(stopwords.words('english'))
count_vect_cv = CountVectorizer(stop_words=english_stop_words)
X_counts = count_vect_cv.fit_transform(X)

# use_idf=False: the counts are only normalised, no inverse-document-frequency weighting.
tf_transformer_cv = TfidfTransformer(use_idf=False)
tf_transformer_cv.fit(X_counts)
X_tfidf = tf_transformer_cv.transform(X_counts)

# One row per vocabulary term, holding the value vocabulary_ stores for that term.
word_df = pd.DataFrame(count_vect_cv.vocabulary_, index=[0]).T


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import svm

# Base estimators plugged into the pipelines below; each uses random_state=42.
lin_clf = svm.LinearSVC(
    C=1.0,
    max_iter=10000,
    random_state=42,
)
sgd_clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=3,
    tol=None,
)
svc = svm.SVC(random_state=42)

In [None]:
# Grid search over a CountVectorizer -> TfidfTransformer -> SVC pipeline.
svc_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svc),
])
parameters = {
    # vectorizer hyper-parameters
    'vect__ngram_range': [(1,1),(1,3)],
    'vect__max_df': [0.4],
    'vect__max_features': [300,5000],
    # 'english' (the string) selects scikit-learn's built-in English stop list.
    # The previous {'english'} was a one-element set, i.e. a custom list that
    # removes only the word "english", and a set is not an accepted type.
    'vect__stop_words': ['english', ['food','id','base', 'and']],
    # classifier hyper-parameters
    'clf__kernel': ['linear', 'sigmoid'],
    'clf__C': [0.5,1,1.5,1.8],
    'clf__gamma': ['auto','scale'],
    'clf__tol': [0.1,0.01,0.001],
}

# create grid search object, and use the pipeline as an estimator
svcgrid_search = GridSearchCV(svc_pipeline, parameters, n_jobs=-1)

# fit the grid search on the training data
svcgrid_search.fit(X, y)

# keep the full results table for inspection and show the best parameter set
df = pd.DataFrame(svcgrid_search.cv_results_)
svcgrid_search.best_params_

In [None]:
# Show the grid-search results ordered by rank_test_score, ascending.
df.sort_values(by='rank_test_score')

In [None]:
from sklearn.model_selection import GridSearchCV
# Grid search over a CountVectorizer -> TfidfTransformer -> LinearSVC pipeline.
# stop_words uses the string 'english' (scikit-learn's built-in stop list); the
# previous {'english'} was a one-element set that only removed the word
# "english" and is not an accepted parameter type.
lin_svc_pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', lin_clf),
])
parameters = {
    # vectorizer hyper-parameters
    'vect__stop_words': ['english', ['food','id','base', 'and', 'beverage']],
    'vect__ngram_range': [(1,1),(1, 3)],
    'vect__max_df': [0.2,0.4],
    'vect__max_features': [300,700,1000],
    # classifier hyper-parameters
    'clf__C': [0.5,1,1.5,1.8],
    'clf__tol': [0.1,0.01,0.001],
    'clf__loss': ['hinge', 'squared_hinge'],
    'clf__max_iter': [1000,5000,7000],
}

# create grid search object, and use the pipeline as an estimator
grid_search = GridSearchCV(lin_svc_pipeline, parameters, n_jobs=-1)

# fit the grid search on the training data
grid_search.fit(X, y)

# keep the full results table for inspection and show the best parameter set
df = pd.DataFrame(grid_search.cv_results_)
grid_search.best_params_

In [None]:
# Show the grid-search results ordered by rank_test_score, ascending.
df.sort_values(by='rank_test_score')

In [None]:
# Grid search over a CountVectorizer -> TfidfTransformer -> SGDClassifier pipeline.
# stop_words uses the string 'english' (scikit-learn's built-in stop list); the
# previous {'english'} was a one-element set that only removed the word
# "english" and is not an accepted parameter type.
sgd_pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', sgd_clf),
])
parameters = {
    # vectorizer hyper-parameters
    'vect__ngram_range': [(1,1),(1, 3)],
    'vect__max_df': [0.2,0.4],
    'vect__max_features': [300,5000],
    'vect__stop_words': ['english', ['food','id','base', 'and', 'beverage']],
    # classifier hyper-parameters
    'clf__loss': ['hinge','modified_huber','perceptron','squared_hinge'],
    'clf__alpha': [0.0001,0.001,0.01],
    'clf__tol': [0.00001,0.0001],
    'clf__max_iter': [5000],
}

# create grid search object, and use the pipeline as an estimator;
# error_score='raise' makes a failing fit raise instead of being scored.
grid_search = GridSearchCV(sgd_pipeline, parameters, n_jobs=-1, error_score='raise')

# fit the grid search on the training data
grid_search.fit(X, y)

# keep the full results table for inspection and show the best parameter set
df = pd.DataFrame(grid_search.cv_results_)
grid_search.best_params_

In [None]:
# Show the grid-search results ordered by rank_test_score, ascending.
df.sort_values(by='rank_test_score')

In [None]:
# Final pipeline with fixed hyper-parameters.
# stop_words='english' selects scikit-learn's built-in English stop list; the
# previous {'english'} was a one-element set that only removed the word
# "english" and is not an accepted parameter type.
# NOTE(review): the other settings presumably came from a search run with the
# old stop_words value - re-check them after this change.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english', ngram_range=(1,3), max_features=300, max_df=0.4)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='modified_huber', alpha=0.01, random_state=42, tol=1e-05, max_iter=5000)),
])

# Fit on the full data set (this fitted model is what later cells predict with),
# then report the mean score over 5 stratified folds.
text_clf.fit(X, y)
scores = cross_val_score(text_clf, X, y, cv=StratifiedKFold(n_splits=5))
print(scores.mean())

#filename = 'item_category_svc_model.sav'
#pickle.dump(text_clf, open(filename, 'wb'))

In [None]:
import plotly.express as px
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Confusion matrix of the fitted pipeline, evaluated on the same X it was
# fitted on (so it reflects training fit, not held-out performance).
#
# The class order is fixed explicitly and reused for the tick labels. Before,
# the ticks came from .unique() (order of first appearance) while
# confusion_matrix orders classes by sorted label, so rows and columns could
# be labelled with the wrong category.
class_labels = sorted(food_items['swap_cat'].unique())
conf_mat = confusion_matrix(y, text_clf.predict(X), labels=class_labels)

fig, ax = plt.subplots(figsize=(8,8))
sns.heatmap(
    conf_mat,
    annot=True,
    cmap="Blues",
    fmt='d',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set_xlabel('Predicted swap_cat')
ax.set_ylabel('True swap_cat')
ax.set_title('Confusion matrix (training data)');