## IMDB 电影影评数据集

In [None]:
# Notebook magics: (re)load the autoreload extension, switch it to mode 2,
# and render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from fastai import *
from fastai.text import *

### Preparing the data

In [None]:
# Resolve the IMDB sample dataset through untar_data and list what is under
# the returned path.
path = untar_data(URLs.IMDB_SAMPLE)
path.ls()

In [None]:
# Read texts.csv from the dataset folder into a DataFrame and preview it.
df = pd.read_csv(path/'texts.csv')
df.head()

In [None]:
# Look at the entry labelled 1 in the 'text' column.
df['text'][1]

In [None]:
# Build a TextDataBunch directly from texts.csv.
data_lm = TextDataBunch.from_csv(path, 'texts.csv')
data_lm.save()   # preprocessing is already done; saving means the next load does not have to redo it

In [None]:
# data = TextDataBunch.load(path)  # API deprecated
# Reload the saved data from `path` with load_data instead.
data = load_data(path)

### Tokenization  分词

In [None]:
#data = TextClasDataBunch.load(path) # not usable: the downloaded files contain no tmp/.pkl
# Reload the saved data and display a batch from it.
data = load_data(path)
data.show_batch()

### Numericalization  数值化

In [None]:
# First ten entries of the vocabulary's `itos` list.
data.vocab.itos[:10]  # the list that gathers every distinct token is called the vocabulary, "vocab" for short

In [None]:
# Element 0 of the first training item.
# NOTE(review): presumably the text half of a (text, label) pair — confirm.
data.train_ds[0][0]

In [None]:
# data.train_ds[0][0][:10]
# Slice the first ten entries of that element's `.data` attribute instead of
# slicing the element itself.
data.train_ds[0][0].data[:10]

### With the data block API

In [None]:
# Build the sample data with the data block API.
# The legacy call this replaces addressed the columns by position:
#   TextSplitData.from_csv(path, 'texts.csv', input_cols=1, label_cols=0, valid_col=2)
# so labels come from column 0 and the validation flag from column 2.
# The old `.datasets(...).tokenize().numericalize()` steps belonged to
# TextSplitData and are not part of the TextList chain.
# NOTE(review): later fastai v1 releases spell the keyword `cols` rather than
# `col` — confirm against the installed version.
data = (TextList.from_csv(path, 'texts.csv', col='text')
        .split_from_df(col=2)
        .label_from_df(cols=0)
        .databunch())

### Full datasets

In [None]:
# Resolve the full IMDB dataset. When the automatic download failed, the run
# printed the log below, which includes the manual recovery steps:
#
#   Downloading https://s3.amazonaws.com/fast-ai-nlp/imdb
#   0.00% [0/144440600 00:00<00:00]
#    Download of https://s3.amazonaws.com/fast-ai-nlp/imdb.tgz has failed after 5 retries
#    Fix the download manually:
#   $ mkdir -p C:\Users\xiaop\.fastai\data
#   $ cd C:\Users\xiaop\.fastai\data
#   $ wget -c https://s3.amazonaws.com/fast-ai-nlp/imdb.tgz
#   $ tar -zxvf imdb.tgz
#   And re-run your code once the download is successful
#
# The log is kept as comments rather than a string literal: `\U` and `\x` in
# the Windows path are invalid escape sequences in a normal Python string and
# would stop the cell from parsing.
path = untar_data(URLs.IMDB)
path.ls()

In [None]:
# List the contents of the train subfolder.
(path/'train').ls()

### Language model  创建语言模型

In [None]:
# Language-model data built with TextList (this replaces the deprecated
# TextFileList pipeline): keep only the train and test folders, apply a random
# 0.1 split, and label the items for language modelling.
data_lm = (TextList.from_folder(path)
           .filter_by_folder(include=['train', 'test'])
           .random_split_by_pct(0.1)
           .label_for_lm()
           .databunch())
# Save the processed data and reload it through load_data; the `.load`
# classmethods are the deprecated API this notebook already moved away from.
data_lm.save('tmp_lm')
data_lm = load_data(path, 'tmp_lm')
data_lm.show_batch()

In [None]:
# Create the language-model learner on data_lm with the WT103 pretrained model
# and drop_mult=0.3.
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.3) # RNN neural network
# Run the learning-rate finder and plot what the recorder captured (skip_end=15).
learn.lr_find()
learn.recorder.plot(skip_end=15)

In [None]:
# One fit_one_cycle run (1, 1e-2) with moms=(0.8, 0.7), then save the weights
# as 'fit_head'.
learn.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))
learn.save('fit_head')

In [None]:
# Resume from 'fit_head', unfreeze the model and train again (10, 1e-3).
learn.load('fit_head')
learn.unfreeze()
learn.fit_one_cycle(10, 1e-3, moms=(0.8, 0.7))
# Save the whole model as 'fine_tuned' and the encoder alone as
# 'fine_tuned_enc'; the classifier cell later loads the encoder by that name.
learn.save('fine_tuned')
learn.save_encoder('fine_tuned_enc')

### 预测

In [None]:
# Reload the fine-tuned weights and call predict on a prompt with the
# arguments 100, temperature=1.1 and min_p=0.001.
learn.load('fine_tuned')
learn.predict('I liked this movie because ', 100, temperature=1.1, min_p=0.001)

### Classifier

In [None]:
# Classifier data built with TextList (this replaces the deprecated
# TextFileList pipeline). It reuses the language model's vocabulary, splits by
# folder with 'test' as the validation set, and labels items from their folder
# using the classes 'neg' and 'pos'.
# `classes=` is the keyword here; `class` is a reserved word in Python.
# bs is the batch size; adjust it to the available GPU memory.
data_clas = (TextList.from_folder(path, vocab=data_lm.vocab)
             .split_by_folder(valid='test')
             .label_from_folder(classes=['neg', 'pos'])
             .databunch(bs=50))
# Save the processed data and reload it through load_data; the `.load`
# classmethods are the deprecated API this notebook already moved away from.
data_clas.save('tmp_clas')
data_clas = load_data(path, 'tmp_clas', bs=50)
data_clas.show_batch()

In [None]:
# Create the text classifier learner on data_clas with drop_mult=0.5 and load
# the encoder saved as 'fine_tuned_enc'.
learn = text_classifier_learner(data_clas, drop_mult=0.5)
learn.load_encoder('fine_tuned_enc')
# Freeze the learner, then run the learning-rate finder and plot the result.
learn.freeze()
learn.lr_find()
learn.recorder.plot()

In [None]:
# First classifier stage: one fit_one_cycle run (1, 2e-2), then checkpoint as
# 'first'. moms is the (0.8, 0.7) pair used by every other training cell; the
# original `(0,8, 0.7)` was a comma typo that produced a three-element tuple.
learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7))
learn.save('first')

In [None]:
# Second classifier stage: resume from 'first' and train with a sliced
# learning-rate range, then checkpoint as 'second'.
learn.load('first')
learn.freeze_to(-2)  # unfreeze the last two layers
learn.fit_one_cycle(1, slice(1e-2/(2.6**4), 1e-2), moms=(0.8,0.7))
learn.save('second')

In [None]:
# Third classifier stage: resume from 'second' and train with a lower sliced
# learning-rate range, then checkpoint as 'third'.
learn.load('second')
learn.freeze_to(-3)  # unfreeze the last three layers
learn.fit_one_cycle(1, slice(5e-3/(2.6**4), 5e-3), moms=(0.8,0.7))
learn.save('third')

In [None]:
# Final classifier stage: resume from 'third', unfreeze the model and train
# (2, slice(1e-3/(2.6**4), 1e-3)).
learn.load('third')
learn.unfreeze()
learn.fit_one_cycle(2, slice(1e-3/(2.6**4), 1e-3), moms=(0.8,0.7))

## Tabular example

In [None]:
from fastai import *
from fastai.tabular import *

In [None]:
# Resolve the adult sample dataset and read adult.csv into a DataFrame.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# Hold out the last 2000 rows as validation data and keep the rest for
# training; both are independent copies of the slices.
train_df, valid_df = df[:-2000].copy(), df[-2000:].copy()
train_df.head()

## Tabular example

In [None]:
from fastai import *
from fastai.tabular import *

In [None]:
# Resolve the adult sample dataset and read adult.csv into a DataFrame.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# Target column, categorical columns, continuous columns and the processors
# handed to TabularList below.
dep_var = '>=50k'
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [FillMissing, Categorify, Normalize]
# Test items are a copy of rows 800..999, selected by position with `.iloc`
# (a DataFrame has no `.ilo` attribute).
test = TabularList.from_df(df.iloc[800:1000].copy(), path=path, cat_names=cat_names, cont_names=cont_names)
# Data block pipeline: the same index range 800..999 forms the validation
# split, labels come from dep_var, and the test items are attached with label 0.
data = (TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
       .split_by_idx(list(range(800, 1000)))
       .label_from_df(cols=dep_var)
       .add_test(test, label=0)
       .databunch())
data.show_batch(rows=10)

In [None]:
# Create a tabular learner on `data` with layers=[200, 100] and the accuracy
# metric, then call fit(1, 1e-2).
# NOTE(review): later fastai v1 releases may expose this factory as
# `tabular_learner` — confirm against the installed version.
learn = get_tabular_learner(data, layers=[200, 100], metrics=accuracy)
learn.fit(1, 1e-2)
# Re-read the raw CSV and preview it.
df = pd.read_csv(path/'adult.csv')
df.head()

## Collaborative filtering example  协同过滤

In [None]:
# Resolve the ML_SAMPLE dataset and show the returned path.
path = untar_data(URLs.ML_SAMPLE)
path

In [None]:
# Read ratings.csv into a DataFrame.
ratings = pd.read_csv(path/'ratings.csv')
# Pass the userId and movieId columns through series2cat.
# NOTE(review): presumably converts them to categorical in place, since the
# return value is unused — confirm.
series2cat(ratings, 'userId', 'movieId')
ratings.head()

In [None]:
# Create a collaborative-filtering learner from the ratings DataFrame with
# n_factors=50 and scores bounded by min_score=0. and max_score=5.
# NOTE(review): later fastai v1 releases may expose this factory as
# `collab_learner` with a different signature — confirm against the installed
# version.
learn = get_collab_learner(ratings, n_factors=50, min_score=0., max_score=5.)
learn.fit_one_cycle(4, 5e-3)