In [0]:
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html

Looking in links: https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
Collecting torch_nightly
[?25l  Downloading https://download.pytorch.org/whl/nightly/cu92/torch_nightly-1.2.0.dev20190805%2Bcu92-cp36-cp36m-linux_x86_64.whl (704.8MB)
[K     |████████████████████████████████| 704.8MB 26kB/s 
[?25hInstalling collected packages: torch-nightly
Successfully installed torch-nightly-1.2.0.dev20190805+cu92


In [0]:
!pip install fastai



In [0]:
# import libraries
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os

In [0]:
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [0]:
documents[:2]

["Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n",
 "\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap

In [0]:
type(dataset)

sklearn.utils.Bunch

In [0]:
dataset.target

array([17,  0, 17, 11, ..., 13,  9,  4,  9])

In [0]:
documents.index

<function list.index>

In [0]:
d = {'label':dataset.target, 'text':dataset.data}
df = pd.DataFrame( data=d)

In [0]:
df.head()

Unnamed: 0,label,text
0,17,Well i'm not sure about the story nad it did s...
1,0,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,17,Although I realize that principle is not one o...
3,11,Notwithstanding all the legitimate fuss about ...
4,10,"Well, I will have to change the scoring on my ..."


In [0]:
df.shape

(11314, 2)

In [0]:
# take 1 & 10 to do a binbary classification
df = df[df['label'].isin([1,10])]
df = df.reset_index(drop = True)
#Let's have a quick look at the target distribution.

df['label'].value_counts()

10    600
1     584
Name: label, dtype: int64

In [0]:
print(df.head())

   label                                               text
0     10  Well, I will have to change the scoring on my ...
1      1  Archive-name: graphics/resources-list/part1\nL...
2     10  \nAnd of course, Mike Ramsey was (at one time)...
3     10  As I promised, I would give you the name of th...
4     10  GAME(S) OF 4/15\n---------------\nADIRONDACK 6...


In [0]:
df['text'] = df['text'].str.replace("[^a-zA-Z]", " ")


In [0]:
print(df.head())

   label                                               text
0     10  well  i will have to change the scoring on my ...
1      1  archive name  graphics resources list part  la...
2     10   and of course  mike ramsey was  at one time  ...
3     10  as i promised  i would give you the name of th...
4     10  game s  of                      adirondack   c...


In [0]:
df['text'] = df['text'].apply(lambda x: x.lower())

In [0]:
df.head()

Unnamed: 0,label,text
0,10,well i will have to change the scoring on my ...
1,1,archive name graphics resources list part la...
2,10,and of course mike ramsey was at one time ...
3,10,as i promised i would give you the name of th...
4,10,game s of adirondack c...


In [0]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords 
stop_words = stopwords.words('english')
# tokenization 
tokenized_doc = df['text'].apply(lambda x: x.split())

# remove stop-words 
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

# de-tokenization 
detokenized_doc = [] 
for i in range(len(df)): 
    t = ' '.join(tokenized_doc[i]) 
    detokenized_doc.append(t) 

df['text'] = detokenized_doc

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
df.head()

Unnamed: 0,label,text
0,10,well change scoring playoff pool unfortunately...
1,1,archive name graphics resources list part last...
2,10,course mike ramsey one time captain buffalo pr...
3,10,promised would give name panther president hui...
4,10,game adirondack cdi adirondack leads series fi...


In [0]:
df['text'][0]

'well change scoring playoff pool unfortunately time right certainly post new scoring rules tomorrow matter enter anyway good keith keller let go rangers let go quakers kkeller mail sas upenn edu ivy league champs'

In [0]:
from sklearn.model_selection import train_test_split


In [0]:
df_trn, df_val = train_test_split(df, stratify = df['label'], test_size = 0.4, random_state = 12)

In [0]:
df_trn.shape, df_val.shape


((710, 2), (474, 2))

In [0]:
# Language model data
data_lm = TextLMDataBunch.from_df(train_df = df_trn, valid_df = df_val, path = "")


In [0]:
type(data_lm)

fastai.text.data.TextLMDataBunch

In [0]:
# Classifier model data
data_clas = TextClasDataBunch.from_df(path = "", train_df = df_trn, valid_df = df_val, vocab=data_lm.train_ds.vocab, bs=32)

In [0]:
#learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.7)
#learn = language_model_learner(data_lm, AWD_LSTM, pretrained_model=URLs.WT103_BWD, drop_mult=0.7)
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3,pretrained=True)

Downloading https://s3.amazonaws.com/fast-ai-modelzoo/wt103-fwd.tgz


In [0]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,7.742992,6.689374,0.145045,00:02


In [0]:
learn.save_encoder('ft_enc')

In [0]:
#learn = text_classifier_learner(data_clas, drop_mult=0.7)
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)
learn.load_encoder('ft_enc')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (710 items)
x: TextList
xxbos looks like edmonton oilers decided take european xxunk spring ranford tugnutt benning manson smith buchberger corson playing canada podein weight playing us kravchuk playing xxunk know nagging injuries late season podein interesting case eligible play cape breton ahl playoffs like kovalev zubov andersson obviously sather pocklington total xxunk everyone makes certainly case massively xxunk paramount new york rangers,xxbos xxunk speaking die hard read xxunk hard xxunk toronto cup finals first anyone planet heard team detroit al xxunk however spell idiot name must chicago espn said even close chicago xxunk win norris division playoffs team close everyone picking chicago get says easy choice god chicago wings division point two followed closely toronto also good team leafs beating detroit doubt even going get chicago even xxunk get past hawks would probably face vancouver lose habs reaching finals forget ev

In [0]:
learn.fit_one_cycle(2, 1e-2)


epoch,train_loss,valid_loss,accuracy,time
0,0.348678,0.217137,0.932489,00:05
1,0.333499,0.131748,0.953587,00:05


In [0]:
# get predictions
preds, targets = learn.get_preds()

In [0]:
#preds

In [0]:
predictions = np.argmax(preds, axis = 1)

In [0]:
pd.crosstab(predictions, targets)


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,230,18
1,4,222
