<a href="https://colab.research.google.com/github/YuruHuang/COVID-ZIP/blob/master/Menu_Healthiness_FastAI_ULMfit_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook, I will replicate what Tom B did with the takeaway classifier, except that I will replace the classification header with a regression header. 

The goals for this notebook is two-fold: 

1. Run ULMfit for the regression task using fastai 
2. Resample the data to get a balanced dataset 

# Data preprocessing 


In [2]:
! [ -e /content ] && pip install -Uqq fastai  # upgrade fastai on colab

In [3]:
## Import all the libraries 
!pip install unidecode
from unidecode import unidecode
import string
import pandas as pd
from fastai.vision.all import *
from fastai.text.all import *
from fastai.collab import *
from fastai.tabular.all import *
import string
import string
from sklearn.model_selection import train_test_split

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting unidecode
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 5.1 MB/s 
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.4


In [4]:
train_ds = pd.read_csv('train_smogn.csv')
test_ds = pd.read_csv('test_smogn.csv')

In [5]:
def rest_name_cleaning(rest_name):
  '''
  This function cleans the restaurant name string 
  '''
  rest_name_clean = rest_name.lower()
  rest_name_clean = rest_name_clean.strip()
  ascii_name = unidecode(rest_name_clean)
  ascii_name_fix = ascii_name.replace('`',"'")
  ascii_name_fix = ascii_name_fix.replace('~'," ")
  ascii_name_fix = ascii_name_fix.replace(']'," ")
  ascii_name_fix = ascii_name_fix.replace('['," ")
  ascii_name_fix = ascii_name_fix.replace('\\'," ")
  ascii_name_fix = ascii_name_fix.replace('%'," ")
  ascii_name_fix = ascii_name_fix.replace('*'," ")
  ascii_name_fix = ascii_name_fix.replace('+'," and ")
  ascii_name_fix = ascii_name_fix.replace('&'," and ")
  ascii_name_fix = ascii_name_fix.replace('/'," ") 
  
  return ascii_name_fix

In [6]:
# 0-12 if over 12 then 12
def clean_data(data):
  data.loc[data['predict']>12, 'predict'] = 12
  data.loc[data['predict']<0, 'predict'] = 0
  data['ascii_name_fix'] = data.rest_name.apply(lambda x: rest_name_cleaning(x))
  return data

In [7]:
train_ds = clean_data(train_ds)
test_ds = clean_data(test_ds)

In [8]:
train_ds['is_valid'] = False
test_ds['is_valid'] = True

In [9]:
justeat_df = pd.concat([train_ds,test_ds], axis=0)

In [10]:
justeat_df.head()

Unnamed: 0.1,Unnamed: 0,index,Restaurant ID,Desserts,Vegetables,Salads,Chips,Water,Milk,Special Offers,...,postcode_district,specialOffers,chips,salads,predict,rating2,ascii_name_fix,is_valid,n_review,review
0,0,21615.0,37196,1,18,4,1,0,0,3,...,G12,3,1,4,8.568073,3,la vita spuntini,False,,
1,1,21158.0,38498,1,18,4,2,0,0,3,...,G12,3,1,4,8.556266,2,la vita spuntini,False,,
2,2,20740.0,35972,2,17,5,2,0,0,3,...,G12,2,2,5,8.546312,3,la vita spuntini,False,,
3,3,25787.0,95230,0,17,2,0,0,0,4,...,G12,4,0,2,8.555663,3,la vita spuntini,False,,
4,4,26378.0,94912,0,16,3,0,0,0,4,...,G12,4,0,2,8.538543,3,la vita spuntini,False,,


In [11]:
justeat_df.shape, train_ds.shape, test_ds.shape

((62360, 29), (56902, 27), (5458, 28))

In [12]:
justeat_df.drop(justeat_df.columns[0:2],axis=1,inplace=True)

In [13]:
def split_chars(text):
  return ' '.join(list(text))

In [14]:
justeat_df['rest__name_space'] = justeat_df['ascii_name_fix'].apply(split_chars)

In [15]:
justeat_df = justeat_df[['rest__name_space','predict', 'is_valid']]

In [16]:
justeat_df.tail()

Unnamed: 0,rest__name_space,predict,is_valid
5453,s u p e r s h a w a r m a,6.619293,True
5454,b a h a r c a f e,7.049263,True
5455,a m e r i c a n p h i l l i c h e e s e s t e a k,6.134494,True
5456,v i v a p i z z a,5.910214,True
5457,a l i s h a n t a n d o o r i,7.208977,True


In [17]:
test = 'american philli cheese steak'
split_chars(test)

'a m e r i c a n   p h i l l i   c h e e s e   s t e a k'

# train learnner

In [18]:
import inspect
def print_source(obj):
    for line in inspect.getsource(obj).split("\n"):
        print(line)

print_source(BaseTokenizer)

class BaseTokenizer():
    "Basic tokenizer that just splits on spaces"
    def __init__(self, split_char=' ', **kwargs): self.split_char=split_char
    def __call__(self, items): return (t.split(self.split_char) for t in items)



In [19]:
# two ways to work around this: add ' ' for each character, or redefine the tokenizer 

let's first do it on the word level 

In [20]:
# prepare the data for language model
dls_lm = TextDataLoaders.from_df(justeat_df, is_lm=True, valid_pct=0.1)

In [21]:
dls_lm.show_batch(max_n=5)

Unnamed: 0,text,text_
0,xxbos s u g a r s h a c k xxbos s u g a r s h a c k xxbos l a v i t a s p u n t i n i xxbos l a v i t a s p u n t i n i xxbos l a v i t a s p u n t i n i xxbos s u g a,s u g a r s h a c k xxbos s u g a r s h a c k xxbos l a v i t a s p u n t i n i xxbos l a v i t a s p u n t i n i xxbos l a v i t a s p u n t i n i xxbos s u g a r
1,e r ( s w i n d o n ) xxbos l a v i t a s p u n t i n i xxbos p i z z a p i z z a xxbos k i r k l a n d n e u k ' s c a f e f i s h a n d c h i p s p i z z,r ( s w i n d o n ) xxbos l a v i t a s p u n t i n i xxbos p i z z a p i z z a xxbos k i r k l a n d n e u k ' s c a f e f i s h a n d c h i p s p i z z a
2,a r s h a c k xxbos s u g a r s h a c k xxbos s u g a r s h a c k xxbos s u b w a y - h a y e s xxbos c r o s s r e s t a u r a n t xxbos r i v e r v i e w c a f e,r s h a c k xxbos s u g a r s h a c k xxbos s u g a r s h a c k xxbos s u b w a y - h a y e s xxbos c r o s s r e s t a u r a n t xxbos r i v e r v i e w c a f e xxbos
3,p u n t i n i xxbos p a u s e c o f f e e xxbos l a v i t a s p u n t i n i xxbos a d a m ' s p i z z a a n d g r i l l w e m b l e y xxbos l a v i t a s p u n,u n t i n i xxbos p a u s e c o f f e e xxbos l a v i t a s p u n t i n i xxbos a d a m ' s p i z z a a n d g r i l l w e m b l e y xxbos l a v i t a s p u n t
4,a m ) xxbos j e r i c h o xxbos s u g a r s h a c k xxbos p e r i - p e r i g r i l l c o t t a g e xxbos h o m e n a w a y xxbos r e d c h i l i xxbos g r e g g s - e,m ) xxbos j e r i c h o xxbos s u g a r s h a c k xxbos p e r i - p e r i g r i l l c o t t a g e xxbos h o m e n a w a y xxbos r e d c h i l i xxbos g r e g g s - e d


In [22]:
learn = language_model_learner(dls_lm, AWD_LSTM, metrics=[accuracy, Perplexity()], path='', wd=0.1).to_fp16()

In [24]:
learn.freeze()

In [25]:
lr = 1e-1
cycles = 25

In [26]:
learn.fit_one_cycle(cycles, 1e-2)



epoch,train_loss,valid_loss,accuracy,perplexity,time


KeyboardInterrupt: ignored

In [None]:
learn.unfreeze()
learn.fit_one_cycle(cycles, 1e-3)

In [None]:
TEXT = "BOB"
N_WORDS = 40
N_SENTENCES = 1
preds = [learn.predict(TEXT, N_WORDS, temperature=0.75) 
         for _ in range(N_SENTENCES)]

In [None]:
preds

In [None]:
learn.save_encoder('just_eat_enc_210722')

# Train a regression model 

In [None]:
# loading the data (with both the label and texts)
dls_rn = DataBlock(
    blocks=(TextBlock.from_df('rest__name_space', tok_text_col="rest__name_space"), RegressionBlock),
    get_x=ColReader('rest__name_space'), get_y=ColReader('predict'),
    splitter=ColSplitter())

In [None]:
dls = dls_rn.dataloaders(justeat_df)

In [None]:
dls.show_batch(max_n=3)

In [None]:
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=mae)

In [None]:
learn.load_encoder('just_eat_enc_210722')

In [None]:
learn.fit_one_cycle(1, 2e-2)

In [None]:
learn.freeze_to(-2)
learn.fit_one_cycle(cycles, slice(1e-2/(2.6**4),1e-2))

In [None]:
learn.freeze_to(-3)
learn.fit_one_cycle(cycles, slice(5e-3/(2.6**4),5e-3))


In [None]:
learn.unfreeze()
learn.fit_one_cycle(cycles, slice(1e-3/(2.6**4),1e-3))

In [None]:
learn.save('Letter_FastAI')