In [3]:
import os
import string

import numpy as np
import pandas as pd
from nltk import word_tokenize, PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Load the training and test sets from JSON files in the working directory.
# Later code reads the 'ingredients' column of both frames, the 'cuisine'
# column of `train`, and the 'id' column of `test`.
train = pd.read_json('cooking_train.json')
test = pd.read_json('cooking_test.json')

def joining_preprocessor(line):
    """Merge an iterable of strings into one lowercase string.

    The pieces are separated by a single space; an empty iterable yields ''.
    """
    merged = ' '.join(line)
    return merged.lower()

def get_word_freq(corpus,n_gram=(1,1)):
    """Count how often every n-gram occurs across the whole corpus.

    Parameters
    ----------
    corpus : iterable
        Documents to count. Each document is handed to `joining_preprocessor`,
        which joins it with spaces and lowercases it.
    n_gram : tuple of (int, int), default (1, 1)
        Passed to CountVectorizer as `ngram_range`.

    Returns
    -------
    list of (str, int-like)
        One `(term, total_count)` pair per vocabulary entry, in the iteration
        order of the vectorizer's `vocabulary_` mapping (not sorted).
    """
    vec = CountVectorizer(preprocessor=joining_preprocessor, ngram_range=n_gram)
    # fit_transform learns the vocabulary and builds the count matrix in a
    # single pass instead of tokenizing the corpus once for fit() and again
    # for transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column sums, flattened so they can be indexed directly by column id.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    return [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]

def get_top_n_words(corpus, n=None, n_gram=(1,1)):
    """Return the `n` most frequent terms of `corpus` with their counts.

    Frequencies come from `get_word_freq(corpus, n_gram)`. The pairs are
    ordered by count, highest first; `n=None` keeps every pair because the
    result is produced by slicing with `[:n]`.
    """
    ranked = get_word_freq(corpus, n_gram)
    # Stable in-place sort on the freshly built list, highest count first.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

def get_not_used_again(train,test):
    """List the stems that occur in `train` ingredients but not in `test`.

    Both arguments are indexed with ['ingredients']; the vocabulary of each is
    obtained through `get_top_n_words` and every term is reduced with a
    PorterStemmer before the two sets are subtracted.

    Returns a list whose order is whatever the set difference yields.
    """
    stemmer = PorterStemmer()

    def stemmed_vocabulary(frame):
        # Only the terms matter here; the frequency of each pair is ignored.
        return set(stemmer.stem(term) for term, _count in get_top_n_words(frame['ingredients']))

    train_stems = stemmed_vocabulary(train)
    test_stems = stemmed_vocabulary(test)
    return list(train_stems - test_stems)

def get_counting_information(pd):
    """Compute per-row size features from the 'ingredients' column.

    Parameters
    ----------
    pd : DataFrame-like
        Must expose an 'ingredients' column. NOTE: this parameter name shadows
        the module-level pandas alias inside the function; it is kept so that
        existing callers keep working.
        Assumes every row of 'ingredients' is a list of strings -- TODO confirm.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        First the number of ingredients in each row, then the number of
        whitespace-separated words across all ingredients of that row.
    """
    ingredients = pd['ingredients']
    ingredient_counts = np.array(ingredients.str.len())
    # split() without arguments ignores repeated whitespace and returns [] for
    # an empty string, so an empty ingredient list counts 0 words (splitting on
    # a literal ' ' reported 1) and double spaces no longer add phantom words.
    word_counts = np.array(ingredients.apply(lambda items: len(' '.join(items).split())))
    return ingredient_counts, word_counts

# Stems that appear in the training ingredients but not in the test
# ingredients; LemmaTokenizer filters these stems out of its output.
not_used_again_word = get_not_used_again(train,test)

def preprocessor(line):
    """Turn a list of ingredient strings into one cleaned, lowercase string.

    The pieces are joined with spaces and lowercased. Every ASCII punctuation
    character, the registered sign (U+00AE) and the trademark sign (U+2122)
    is replaced by a space, and every ASCII digit is removed.
    """
    spaced_out = string.punctuation + '\u00ae' + '\u2122'
    # Three-argument maketrans: characters of the first string map to the
    # matching character of the second, characters of the third are deleted.
    table = str.maketrans(spaced_out, ' ' * len(spaced_out), string.digits)
    return ' '.join(line).lower().translate(table)

# def get_most_popular_ingredients(n):
#     unique_ingredients = []
#     for arr in train.ingredients:
#         unique_ingredients = unique_ingredients + list(set(arr) - set(unique_ingredients))
#     
#     ingredients = {}
#     
#     for ing in unique_ingredients:
#         ingredients[ing] = 0
#     
#     for l in train.ingredients:
#         for element in l:
#             ingredients[element] += 1
#     most_ingredients = [(k, ingredients[k]) for k in sorted(ingredients, key=ingredients.get, reverse=True)]
#     return most_ingredients[:n]

# IDEA: add whole recipe entries (unsplit ingredient strings) to the tokens

# class CustomAnalyzer:
#     def __init__(self):
#        self.wnl = WordNetLemmatizer()
#     def __call__(self, doc):
#         tokens =[]
#         rest = []
#         for receipe in doc:
#             if receipe in most_ingredients:
#                 tokens.append(receipe)
#             else:
#                 rest.append(receipe)
#         clean_line = preprocessor(rest)
#         return tokens +  [self.wnl.lemmatize(t) for t in word_tokenize(clean_line)]
    
class LemmaTokenizer:
    """Callable tokenizer that stems tokens and drops unwanted ones.

    Despite the name, tokens are reduced with a PorterStemmer; the
    WordNetLemmatizer instance is created but not used by `__call__`.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.ps = PorterStemmer()
        self.stopwords = stopwords.words('english')
        # Hash-based copies so the per-token membership tests in __call__ are
        # O(1) instead of a linear scan over a list for every token.
        self._stopword_set = frozenset(self.stopwords)
        # Snapshot of the module-level stem list taken at construction time.
        self._dropped_stems = frozenset(not_used_again_word)

    def __call__(self, doc):
        """Tokenize `doc`, skip English stopwords, stem, then drop excluded stems.

        Stopwords are matched against the raw tokens (before stemming); the
        exclusion list is matched against the stems.
        """
        stems = (self.ps.stem(token) for token in word_tokenize(doc)
                 if token not in self._stopword_set)
        return [stem for stem in stems if stem not in self._dropped_stems]
           
def preparing_data(data):
    """Build the TF-IDF matrix for the 'ingredients' column of `data`.

    A new TfidfVectorizer (using `preprocessor` and `LemmaTokenizer`) is
    fitted on `data['ingredients']` and the resulting matrix is returned.
    The fitted vectorizer is local to this function and is not returned.
    """
    vect = TfidfVectorizer(preprocessor=preprocessor, tokenizer=LemmaTokenizer())
    words = vect.fit_transform(data['ingredients'])
    # get_counting_information returns (ingredient counts, word counts) in
    # that order; the names were previously swapped when unpacking.
    ingredients_number, word_number = get_counting_information(data)
    # Not used yet -- idea for an extra feature column:
    # hstack((words, word_number[:, None]))
    return words

In [4]:
# Features are the raw ingredient lists; the target is the cuisine label.
X_dataset = train['ingredients']
y_dataset = train['cuisine']


# RBF-kernel support vector classifier with hard-coded hyperparameters.
# NOTE(review): per the scikit-learn SVC documentation, decision_function_shape
# only selects the shape of the decision function output -- confirm this is the
# intent, since OneVsRestClassifier is imported but not used in these cells.
svc_model = SVC(C=200, kernel='rbf', gamma=1, shrinking=True, tol=0.01, decision_function_shape='ovr')


# Pipeline: TF-IDF over the cleaned, stemmed ingredient text, then the SVC.
et_pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(preprocessor=preprocessor,tokenizer=LemmaTokenizer())),
        ('classifier', svc_model)
    ])

In [None]:
# Cross-validate the whole pipeline on the training data using two workers.
# `cv` and `scoring` are not passed, so scikit-learn's defaults apply.
res = cross_val_score(et_pipeline, X_dataset, y_dataset,verbose=1,n_jobs=2)
print(res)
# Mean of the per-fold scores, shown as the cell output.
sum(res) / len(res)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


In [5]:
# Reload both data sets so this cell does not depend on frames left over from
# earlier cells, then fit the pipeline on the full training set.
# Each file is read exactly once (the test file used to be parsed twice).
train = pd.read_json('cooking_train.json')
test = pd.read_json('cooking_test.json')
X_dataset = train['ingredients']
y_dataset = train['cuisine']
X_test = test['ingredients']

et_pipeline.fit(X_dataset, y_dataset)

Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=<function preprocessor at 0x7f0a97cc4ae8>,
                                 smooth_idf=True, stop_words=None,
                                 strip_accents=...
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<__main__.LemmaTokenizer object at 0x7f0adc278dd8>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 SVC(C=200, break_ties=False, cache_size=200, class_weight

In [5]:
# Predict a cuisine for every test recipe with the fitted pipeline.
prediction = et_pipeline.predict(X_test)

# Attach the predictions to a copy so the loaded `test` frame stays untouched.
submission = test.copy()
submission['cuisine'] = prediction
# Make sure the target folder exists before writing; exist_ok=True makes
# this a no-op when the folder is already there.
os.makedirs('output', exist_ok=True)
# Only the 'id' and 'cuisine' columns are written, without the index.
submission.to_csv('output/svc_one_vs_rest_submision.csv', index=False, columns=['id', 'cuisine'])
