In [1]:
import json
import os
import pickle
import re

import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#Import train data (parsed JSON; later cells index it by position, one record per entry)
# Build the path with os.path.join so the notebook also runs outside Windows;
# on Windows this still resolves to the same Data\train.json location as before.
jsonfile = os.path.join('.', 'Data', 'train.json')
# JSON text is UTF-8 by specification. Without an explicit encoding, open()
# uses the platform default and can mis-decode non-ASCII characters.
with open(jsonfile, encoding='utf-8') as train_json:
    json_dict = json.load(train_json)

In [3]:
# Converting train data into three parallel lists (same order as json_dict),
# ready to be assembled into a data frame
sno = [record['id'] for record in json_dict]
cuisine = [record['cuisine'] for record in json_dict]
ingredients = [record['ingredients'] for record in json_dict]

In [4]:
# Assemble the parallel lists into one frame; column order is id, cuisine, ingredients
df = pd.DataFrame(dict(id=sno, cuisine=cuisine, ingredients=ingredients))

In [6]:
#Lemmatize the ingredients field
# Build the lemmatizer once instead of once per ingredient.
lemmatizer = WordNetLemmatizer()

# Each recipe's ingredient list becomes one space-separated string:
#   1. every non-letter character is replaced by a space,
#   2. the ingredient is split into words, and
#   3. each word is lemmatized on its own.
# WordNetLemmatizer.lemmatize() expects a single word, so passing a whole
# multi-word ingredient (e.g. 'green onions') would leave it unlemmatized.
# split()/join also collapses repeated whitespace, so no strip() is needed.
df['ingredients_mod'] = [
    ' '.join(
        lemmatizer.lemmatize(word)
        for ingredient in recipe
        for word in re.sub('[^A-Za-z]', ' ', ingredient).split()
    )
    for recipe in df['ingredients']
]

In [7]:
# Adding Veg/Non-Veg information into train data set
# Keywords whose presence in the ingredient text marks a recipe as non-vegetarian.
# 'mackerel' is added next to the original (misspelt) 'mackrel', which cannot
# match the correctly spelt ingredient.
mylist = ['fish', 'goat', 'chicken', 'beef', 'pork', 'prawn', 'egg', 'Katsuobushi',
          'mackrel', 'mackerel', 'fillet', 'lamb', 'steak', 'salmon', 'shrimp',
          'bacon', 'ham', 'turkey', 'duck', 'seafood', 'squid']
pattern = '|'.join(mylist)
# NOTE(review): this is a plain substring match, so 'egg' also hits 'eggplant'
# and 'ham' hits 'graham' -- confirm whether word-boundary matching is wanted.
# case=False lets capitalised keywords such as 'Katsuobushi' match lower-case
# ingredient text and vice versa.
# Mapping the boolean mask straight to labels avoids writing strings into a
# boolean column through .loc, an incompatible-dtype assignment that newer
# pandas versions deprecate.
df['veg'] = (
    df.ingredients_mod.str.contains(pattern, case=False)
    .map({True: 'non-vegetarian', False: 'vegetarian'})
)

In [8]:
# Vectorizing the lemmatized ingredient text with TF-IDF, dropping English stop words
vectorizer = TfidfVectorizer(
    analyzer="word",
    token_pattern=r'\w+',
    stop_words='english',
    ngram_range=(1, 1),
    max_df=0.57,
    binary=False,
    sublinear_tf=False,
)
train_tfidf = vectorizer.fit_transform(df['ingredients_mod'])

# Feature matrix plus the two label columns used for training
X = train_tfidf
Y = df['cuisine']
Z = df['veg']

In [9]:
#Using a linear SVM to classify recipes as vegetarian / non-vegetarian
# X is the TF-IDF matrix and Z the 'veg' label column built in the cells above.
# random_state pins the solver's pseudo-random number generator so that
# repeated runs of the notebook produce the same model.
lsvc_veg = svm.LinearSVC(C=1, random_state=0)
lsvc_veg.fit(X, Z)

# Portable path; on Windows this still resolves to the same Model\model.pkl
# location as before.
filename = os.path.join('.', 'Model', 'model.pkl')
# Make sure the output directory exists so the trained model is not lost to a
# FileNotFoundError at write time.
os.makedirs(os.path.dirname(filename), exist_ok=True)
# NOTE(review): only the classifier is pickled here. Scoring new text also needs
# the fitted `vectorizer`; confirm it is persisted elsewhere.
with open(filename, 'wb') as f:
    pickle.dump(lsvc_veg, f)