In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer #
from sklearn.feature_extraction.text import TfidfVectorizer #
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords #
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 

# make sure stopwords are downloaded
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akams\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akams\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akams\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Import and load data

In [8]:
df = pd.read_csv('final_bbc_data.csv', index_col = 0)
print(df.shape)
df.head()

(900, 2)


Unnamed: 0,Ingredients,Cuisine
0,1 ½kg chicken thighs and drumsticks 180g sea s...,1
1,"1 large chicken 5 beef short ribs (about 3kg),...",1
2,"150g unsalted butter , softened 80g golden cas...",1
3,"1 whole chicken , jointed, or 8 bone-in chicke...",1
4,500g macaroni 1l whole milk 2 bay leaves 60g b...,1


# Functions

In [99]:
# grab specified string from dataframe
def testing_str(index_num):
    read = df['Ingredients'][index_num]
    return read

In [130]:
def shallow_cleaning(_corpus):
    _corpus = _corpus.lower() # lowercase
    _corpus = re.sub('\[.*/()]', '', _corpus) # removes data in brackets
    _corpus = re.sub('[%s]' % re.escape(string.punctuation), '', _corpus) # list of punctuation, get rid of any punctuation
    _corpus = re.sub('\w*\d\w*', '', _corpus) # removes all numbers and any words that comtain them
    return _corpus
# apply the function and assign to variable
shallow_clean = lambda x: shallow_cleaning(x)

In [131]:
shallow_df = pd.DataFrame(df['Ingredients'].apply(shallow_clean))

In [132]:
shallow_df

Unnamed: 0,Ingredients
0,½kg chicken thighs and drumsticks sea salt f...
1,large chicken beef short ribs about ribs se...
2,unsalted butter softened golden caster suga...
3,whole chicken jointed or bonein chicken pie...
4,macaroni whole milk bay leaves butter pla...
...,...
895,urid dal black gram vegetable oil tsp fresh...
896,strong white flour tsp salt sachet fastact...
897,chopped rhubarb light soft brown sugar tsp...
898,gram flour selfraising flour ½ tsp red chill...


In [None]:
def plot_fdist(arg, n):
    fdist = FreqDist(arg)
    plt.figure(figsize=(8, 8))
    return fdist.plot(n)

## tokenize

In [84]:
# split documents into tokens (xgrams, stopwords, etc.)
tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
tokenized_read = tokenizer.tokenize(read)
# words have been turned into individual tokens
tokenized_read

['1',
 'kg',
 'chicken',
 'thighs',
 'and',
 'drumsticks',
 '180g',
 'sea',
 'salt',
 'flakes',
 '90g',
 'soft',
 'dark',
 'brown',
 'sugar',
 '2',
 'tbsp',
 'chilli',
 'flakes',
 '2',
 'tbsp',
 'sweet',
 'smoked',
 'paprika',
 '1',
 'tbsp',
 'ground',
 'cumin',
 '1',
 'tbsp',
 'sea',
 'salt',
 'flakes',
 '1',
 'tbsp',
 'dark',
 'brown',
 'sugar',
 '75g',
 'butter',
 '125ml',
 'hot',
 'chilli',
 'sauce',
 '1',
 'tbsp',
 'maple',
 'syrup']

Will probably need custom stop words for the culinary dictionary?

In [114]:
# looking at default english stopwords
# get stopwords module from nltk and set to 'english' to get default english stopwords from nltk
def lst_stopwords(tar):
    stop_words=set(stopwords.words("english"))
    filtered_list = []
    stopwordss = []
    for i in tar:
        stopwordss.append(i)
#         print("Filtered out stopwords:", stopwordss)
        if i not in stop_words:
            filtered_list.append(i)
    print("Filterd List:",filtered_list)

In [112]:
stop_words=set(stopwords.words("english"))

In [105]:
# compare the lengths of filtered and unfiltered
print(len(tokenized_read))
print(len(filtered_read))
# 'and' is the filtered out word

37
47


In [54]:
# Stemming: change = chang
from nltk.stem import PorterStemmer
ps = PorterStemmer()

stemmed_read=[]
for w in filtered_read:
    stemmed_read.append(ps.stem(w))

print(stemmed_read)

['1', 'kg', 'chicken', 'thigh', 'drumstick', '180g', 'sea', 'salt', 'flake', '90g', 'soft', 'dark', 'brown', 'sugar', '2', 'tbsp', 'chilli', 'flake', '2', 'tbsp', 'sweet', 'smoke', 'paprika', '1', 'tbsp', 'ground', 'cumin', '1', 'tbsp', 'sea', 'salt', 'flake', '1', 'tbsp', 'dark', 'brown', 'sugar', '75g', 'butter', '125ml', 'hot', 'chilli', 'sauc', '1', 'tbsp', 'mapl', 'syrup']


In [58]:
# lemmatezization: changer = change
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() 

lemmatized_read=[]
for w in filtered_read:
    lemmatized_read.append(lemmatizer.lemmatize(w))

print(lemmatized_read)

['1', 'kg', 'chicken', 'thigh', 'drumstick', '180g', 'sea', 'salt', 'flake', '90g', 'soft', 'dark', 'brown', 'sugar', '2', 'tbsp', 'chilli', 'flake', '2', 'tbsp', 'sweet', 'smoked', 'paprika', '1', 'tbsp', 'ground', 'cumin', '1', 'tbsp', 'sea', 'salt', 'flake', '1', 'tbsp', 'dark', 'brown', 'sugar', '75g', 'butter', '125ml', 'hot', 'chilli', 'sauce', '1', 'tbsp', 'maple', 'syrup']


# test 2

In [95]:
tokenized_read = tokenizer.tokenize(read)
tokenized_read_2 = tokenizer.tokenize(read_2)
tokenized_read_3 = tokenizer.tokenize(read_3)

In [96]:
tokenized_read_3

['dried',
 'morita',
 'chillies',
 'dried',
 'ancho',
 'chilli',
 'dried',
 'pasilla',
 'mixe',
 'chilli',
 'garlic',
 'cloves',
 'sea',
 'salt',
 'tbsp',
 'balsamic',
 'vinegar',
 'avocado',
 'leaves',
 'grapeseed',
 'oil',
 'plus',
 'more',
 'for',
 'frying',
 'lamb',
 'shanks',
 'banana',
 'leaves',
 'vegetable',
 'or',
 'beef',
 'stock',
 'bulb',
 'garlic',
 'cloves',
 'peeled',
 'red',
 'onions',
 'sliced',
 'tomatoes',
 'thickly',
 'sliced',
 'tomatillos',
 'roughly',
 'chopped',
 'green',
 'jalape',
 'os',
 'garlic',
 'cloves',
 'peeled',
 'small',
 'bunch',
 'of',
 'coriander',
 'lime',
 'juiced',
 'avocado',
 'peeled',
 'with',
 'stone',
 'removed',
 'corn',
 'tortillas',
 'Mexican',
 'crema',
 'or',
 'soured',
 'cream',
 'queso',
 'fresco',
 'or',
 'feta',
 'cos',
 'lettuce',
 'shredded',
 'toothpicks']

In [None]:
# implementing it in python
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# creates bagofwords
# turns documents into bag of words
# Convert a collection of text documents to a matrix of token counts

docs = ['i love dogs','i love cats','i love all animals']

# you dont have to tokenize the words
# jujst create the countvecotizer and pass in parameters
vec = CountVectorizer(stop_words=stop_words, lowercase = True, ngram_range = (1,2)) #give me unigram and end with bigram
X = vec.fit_transform(docs)


df = pd.DataFrame(X.toarray(), columns = vec.get_feature_names())
df