In [1]:
import pandas as pd
import numpy as np
import pickle
import utils
from collections import Counter
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans

In [2]:
data = pd.read_csv('challenge_dat.csv')

In [3]:
data.head()

Unnamed: 0,RESTAURANT_NAME,CITY,COUNTRY,R_PRIMARY_LOC_TAG,R_OTHER_LOC_TAG,R_MENU_CATEGORY,R_OTHER_MENU_TAGS,ITEM_NAME,DESCRIPTION,IS_ALCOHOL
0,Hero Sandwich,Hong Kong,Hong Kong,american,western,Sandwiches,"american, western, burgers, Pasta, Smoothies, ...",Corn Beef Baked Potato,鹹牛肉焗薯,False
1,Hamachi-Ya,Sydney,Australia,japanese,,japanese,"sushi, bento, Cream of the crop, Big flavours,...",Spicy Garlic Edamame,Poached green soy beans tossed with salt fresh...,False
2,Hamachi-Ya,Sydney,Australia,japanese,,japanese,"sushi, bento, Cream of the crop, Big flavours,...",Soft Shell Crab (2pcs),Deep-fried soft shell crab with light ponzu sauce,False
3,Hamachi-Ya,Sydney,Australia,japanese,,japanese,"sushi, bento, Cream of the crop, Big flavours,...",Harumaki (4Pcs),Deep-fried Japanese style vegetable springroll,False
4,Hamachi-Ya,Sydney,Australia,japanese,,japanese,"sushi, bento, Cream of the crop, Big flavours,...",Avocado,,False


In [4]:
ascii_data = data.copy()
#remove non ascii from description, and text items
columns_to_ascii = ['R_OTHER_MENU_TAGS','ITEM_NAME','DESCRIPTION']

In [5]:
ascii_data = ascii_data.dropna(subset=['R_OTHER_MENU_TAGS','DESCRIPTION','ITEM_NAME','R_MENU_CATEGORY','R_PRIMARY_LOC_TAG'])
print(len(ascii_data), ' after na removal')
for col in columns_to_ascii:
    ascii_data[col] = ascii_data[col].astype(str).apply(lambda x: ''.join(['' if ord(i) < 32 or ord(i) > 126 else i for i in x]))
    ascii_data[col].replace('', np.nan, inplace=True)
#remove nan
df = ascii_data.dropna(subset=['R_OTHER_MENU_TAGS','DESCRIPTION','ITEM_NAME','R_MENU_CATEGORY','R_PRIMARY_LOC_TAG'])
print(len(df), ' after ascii')

775109  after na removal
738849  after ascii


In [6]:
#merging the 3 columns
df['food'] = df['R_OTHER_MENU_TAGS'] + "," + df['ITEM_NAME'] + "," + df['DESCRIPTION']
df = df.drop(columns_to_ascii, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [7]:
df = df.loc[:10000]
tok_df = df.copy()
tok_df['food'] = tok_df['food'].apply(utils.tokenize)

In [8]:
#print(df[df['food'].str.contains('13th')]['food'].tolist())

In [9]:
all_words = []
for row in tok_df['food']:
    for word in row.split(','):
        all_words.append(word)

In [10]:
#tok_df['food'].to_csv('10000.csv')

In [11]:
Counter(all_words).most_common(10)

[('for', 1222),
 ('with', 1170),
 ('and', 1117),
 ('lunch', 842),
 ('salad', 747),
 ('onli', 744),
 ('deliveroo', 738),
 ('great', 710),
 ('chicken', 572),
 ('flavour', 560)]

In [12]:
#remove unwanted words
generic_words = ['with', 'and', 'the', 'our', 'your']
for word in generic_words:
    if word in all_words:
        all_words.remove(word)
        
word_set = set(all_words)

In [13]:
encoded_df = tok_df.copy()

In [14]:
#for everyword
for word in word_set:
    #create column with 1 if appears 0 else
    encoded_df[word] = tok_df['food'].str.contains(word)

In [15]:
encoded_df = pd.get_dummies(encoded_df)

In [16]:
encoded_df.head()

Unnamed: 0,IS_ALCOHOL,food,lamsvle,montelliana,involtini,kickstart,sedara,uitstekend,voor,unleaven,...,R_MENU_CATEGORY_noodles,R_MENU_CATEGORY_persian,R_MENU_CATEGORY_pizza,R_MENU_CATEGORY_seafood,R_MENU_CATEGORY_south american,R_MENU_CATEGORY_spanish,R_MENU_CATEGORY_sushi,R_MENU_CATEGORY_thai,R_MENU_CATEGORY_vegetarian,R_MENU_CATEGORY_western
1,False,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
2,False,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
3,False,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
5,False,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
6,False,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0


In [80]:
pca = PCA(n_components=2)
pca.fit(encoded_df)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [81]:
pca_df = pca.fit_transform(encoded_df)
kmeans = KMeans(n_clusters=6, random_state=0).fit(pca_df)

In [None]:
colors = {0: 'y', 1: u'orchid', 2: u'darkcyan', 3: u'grey', 4: u'dodgerblue', 5: u'turquoise', 6: u'darkviolet'}
for point in pca_df:
    plt.scatter(point[0],point[1],c=colors[kmeans.predict(point.reshape(1, -1))[0]])
plt.title('PCA analysis for encoded set of all tokenized words')
plt.scatter([i[0] for i in kmeans.cluster_centers_], [i[1] for i in kmeans.cluster_centers_], c='k')
plt.show()

for i in range(5):
    plt.annotate(encoded_df[i], xy=kmeans.predict(pca_df[i]))