<a href="https://colab.research.google.com/github/marek-bardonski/airev-advanced-workshops/blob/master/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# AI REV LLC - Copyrights 2020
# [Google Colab] Change runtime mode to GPU
# https://docs.rapids.ai/
# https://alraqmiyyat.github.io/2013/01-02.html

# Environment setup cell: prints the CUDA compiler version, installs the
# Python `wget` package, pins pyarrow, then installs Miniconda and the
# RAPIDS packages (cudf, cuml) into /usr/local.
!nvcc --version
!pip3 install wget

# https://github.com/rapidsai/cudf/issues/3390
!pip3 install pyarrow==0.15.0 ## Workaround to cover up for Google Colab bug

#!pip3 install cudf-cuda100
#!pip3 install nvstrings-cuda100
#!pip3 install cuml-cuda100
#!pip3 install nvvm-cuda100

# RAPIDS installation script. Thanks Ritchie Ng and NVIDIA Corporation.
# install miniconda
# (-b: batch mode, -f: no error if the prefix already exists, -p: install prefix)
!wget -c https://repo.continuum.io/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh
!chmod +x Miniconda3-4.5.4-Linux-x86_64.sh
!bash ./Miniconda3-4.5.4-Linux-x86_64.sh -b -f -p /usr/local

# install RAPIDS packages
# NOTE(review): cudf/cuml come from the rapidsai-nightly channel without a
# version pin, so the installed versions can change between runs.
!conda install -q -y --prefix /usr/local -c conda-forge \
  -c rapidsai-nightly/label/cuda10.0 -c nvidia/label/cuda10.0 \
  cudf cuml

**Remember to restart the runtime now, and then run the notebook again from the top.**



In [0]:
# Make the packages installed under /usr/local importable and export the
# NUMBAPRO_* variables that locate the CUDA NVVM library and libdevice.
import sys, os, shutil

sys.path.extend(['/usr/local/lib/python3.6/site-packages/', '/usr/local/'])
os.environ.update({
    'NUMBAPRO_NVVM': '/usr/local/cuda/nvvm/lib64/libnvvm.so',
    'NUMBAPRO_LIBDEVICE': '/usr/local/cuda/nvvm/libdevice/',
})

# The two shared libraries are copied next to the notebook (current working dir).
for fn in ('libcudf.so', 'librmm.so'):
    shutil.copy(os.path.join('/usr/local/lib', fn), os.getcwd())

# Imports for the rest of the notebook. Several modules (os, sys, numpy) are
# imported more than once; the repeats are harmless no-ops.
import wget
from zipfile import ZipFile
import os
import cudf
import sys, os
import nvcategory
import os
import numpy as np
import nvstrings
import nltk
from numba import cuda
import json
import nvtext
import ctypes
import numpy as np
import torch
from torch import nn, optim
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
import cupy
# Last expression of the cell: displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()

**If you get import errors, re-check the installation steps above or ask the instructor.**

In [0]:
# Raw datasets are samples from WebHose.io 
print('Beginning dataset download with wget module')

url = 'https://bardonski.pl/chineese.zip'
wget.download(url)
url = 'https://bardonski.pl/arabic.zip'
wget.download(url)
url = 'https://bardonski.pl/arabic-true-pr.csv'
wget.download(url)
# TODO Arabic-2
# TODO Chineese-1
# TODO Chineese-2

print('Beginning word vectors download with wget module')
url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.vec.gz'
print('Word vectors unzip')
wget.download(url)
!gzip -d cc.ar.300.vec.gz

In [0]:
# Unpack the downloaded archive into the working directory, then unpack the
# WebHose archive (presumably contained in it) into the 'chineese' directory.
# Context managers close the archives, and the builtin `zip` is no longer shadowed.
with ZipFile('chineese.zip') as outer_archive:
    outer_archive.extractall()

# exist_ok keeps the cell re-runnable (`mkdir` complains when the directory exists)
os.makedirs('chineese', exist_ok=True)

with ZipFile('630_webhose-2016-10_20170904084325.zip') as articles_archive:
    articles_archive.extractall('chineese')

In [0]:
# Unpack the downloaded archive into the working directory, then unpack the
# WebHose archive (presumably contained in it) into the 'arabic' directory.
# Context managers close the archives, and the builtin `zip` is no longer shadowed.
with ZipFile('arabic.zip') as outer_archive:
    outer_archive.extractall()

# exist_ok keeps the cell re-runnable (`mkdir` complains when the directory exists)
os.makedirs('arabic', exist_ok=True)

with ZipFile('627_webhose-2016-10_20170904083346.zip') as articles_archive:
    articles_archive.extractall('arabic')



In [0]:
#Expected 236384 Arabic articles
#Expected 316004 Chinese articles
# NOTE(review): `ls -l` normally prints a leading "total N" line, so each count
# below is presumably one higher than the number of files - confirm against
# the expected figures above.
!ls arabic -l | wc -l
!ls chineese -l | wc -l


In [0]:
# Fetch the NLTK stop-word corpus; nltk.corpus.stopwords is read further down.
nltk.download('stopwords')

In [0]:
# Thanks VibhuJawa
def get_text(lines):
    """
        Extract the article text from the lines of one JSON article file.

        Only the first line is parsed as JSON; its 'text' field is returned
        wrapped in a single-element list.

        Returns None when there are no lines, when the first line is not
        valid JSON, or when the decoded value has no 'text' field, so that
        the caller can skip the file instead of crashing.
    """
    if not lines:
        return None
    try:
        decoded = json.loads(lines[0])
    except ValueError:
        # json.JSONDecodeError is a subclass of ValueError
        return None
    if not isinstance(decoded, dict) or decoded.get('text') is None:
        return None
    return [decoded['text']]

def get_txt_lines(data_dir):
    """
        Read the article text of every file in data_dir.

        Each regular file is opened as UTF-8 (undecodable bytes are ignored)
        and handed to get_text; files for which get_text returns None are
        skipped.

        returns (text_ls, fname_ls) where
        text_ls = list of article texts and
        fname_ls = the source file name, without its extension, for each text
    """
    text_ls = []
    fname_ls = []
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # (and therefore any later head() sample) reproducible between runs.
    for fn in sorted(os.listdir(data_dir)):
        full_fn = os.path.join(data_dir, fn)
        if not os.path.isfile(full_fn):
            # sub-directories cannot be opened as article files
            continue
        with open(full_fn, encoding="utf-8", errors="ignore") as f:
            content = get_text(f.readlines())
        if content is not None:
            text_ls += content
            # strip the extension whatever its length (.txt, .json, ...)
            fname_ls += [os.path.splitext(fn)[0]] * len(content)

    return text_ls, fname_ls
    
# Read every article in the 'arabic' directory on the host, then move the
# texts into a single-column cuDF DataFrame; %time reports how long each step takes.
print("File Read Time:")
%time txt_ls,fname_ls = get_txt_lines('arabic')
df = cudf.DataFrame()

print("\nCUDF  Creation Time:")
%time df['text'] = nvstrings.to_device(txt_ls)

In [0]:
# Report how many rows were loaded, with thousands separators.
print(f"Number of lines in the DF = {len(df):,}")

In [0]:
# Preview the first 10 raw articles (converted to pandas for display).
df.head(10).to_pandas()

In [0]:
# Arabic stop-word list from the NLTK corpus; default for preprocess_text's `stopwords`.
STOPWORDS = nltk.corpus.stopwords.words('arabic')

# Characters / character sequences that preprocessing replaces with a space.
# '\\~' (backslash + tilde) and '\\n' (backslash + letter n) are two-character
# literals; the former is written with an explicit double backslash because
# '\~' is not a valid escape sequence in Python.
filters = [ '!', '"', '#', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/',  '\\', ':', ';', '<', '=', '>',
           '?', '@', '[', ']', '^', '_', '`', '{', '|', '}', '\\~', '\t','\\n',"'",",",'~' , '—']

def preprocess_text(input_strs, filters=None, stopwords=STOPWORDS):
    """
        Clean a string column:

        * replace every entry of `filters` with a space (skipped when
          `filters` is None or empty, which is the default)
        * to_lower
        * replace stop words with a space (default: the module-level
          STOPWORDS list; skipped when `stopwords` is None or empty)
        * replace runs of whitespace with a single space
        * strip leading/trailing spaces

        Returns the cleaned strings as a new object; the input column is
        not modified.
    """

    # filter punctuation and case conversion
    if filters:
        # the default filters=None must not be handed to replace_multi
        input_strs = input_strs.str.replace_multi(filters, ' ', regex=False)
    input_strs = input_strs.str.lower()

    # remove stopwords
    if stopwords:
        stopwords_gpu = nvstrings.to_device(stopwords)
        # replace_tokens works on the underlying nvstrings data, so the result
        # is wrapped back into a cudf Series afterwards
        input_strs = nvtext.replace_tokens(input_strs.data, stopwords_gpu, ' ')
        input_strs = cudf.Series(input_strs)

    # replace multiple spaces with single one and strip leading/trailing spaces
    input_strs = input_strs.str.replace(r"\s+", ' ', regex=True)
    input_strs = input_strs.str.strip(' ')

    return input_strs

def preprocess_text_df(df, text_cols=['text'], **kwargs):
    """
        Run preprocess_text over each column of df listed in text_cols.

        Each cleaned column replaces the original one in df itself, and the
        same df object is returned. Extra keyword arguments are passed
        through to preprocess_text.
    """
    for column_name in text_cols:
        raw_column = df[column_name]
        df[column_name] = preprocess_text(raw_column, **kwargs)
    return df

# Clean the 'text' column of the whole corpus with the punctuation filters
# defined above; %time reports the duration. `df` is rebound to the result.
%time df = preprocess_text_df(df, filters=filters)

# TASK #1 - Remove stopwords keeping the arabic symbols. Hint: ^[\u0621-\u064A0-9 ]+$
# TASK #2 - Shuffle the DataFrame df

In [0]:
# Preview the first 5 articles after preprocessing.
df.head(5).to_pandas()

In [0]:
# Work on a small sample to keep the following steps fast.
# NOTE: head() takes the first SAMPLE_SIZE rows, not a random sample; the
# frame is only shuffled if TASK #2 above has been implemented.
SAMPLE_SIZE = 1000
df2 = df.head(SAMPLE_SIZE)

In [0]:
# Display the sampled articles (converted to pandas for display).
df2.to_pandas()

In [0]:
# How many articles contain the word bitcoin?
# str.find returns the position of the first match, or -1 when the word is
# absent, so summing its result does not give a count. Count the rows whose
# position is non-negative instead.
bitcoin_positions = df2['text'].str.find('بيتكوين')
int((bitcoin_positions >= 0).astype('int32').sum())

In [0]:
# Load the pretrained fastText Arabic vectors (cc.ar.300.vec) into a cuDF
# DataFrame. Without a header the columns get positional names; column '0'
# is used as the word column when joining with the vocabulary later.
pre_df = cudf.read_csv("cc.ar.300.vec",
                       header=None,
                       delim_whitespace=True,
                       quoting=3,  # ignore quoting
                       skiprows=1)  # presumably skips the .vec header line - confirm
print(pre_df.head())

In [0]:
# Read the file with cudf
names = ['query', 'title', 'text', 'link', 'desc','other']
# All six columns are read as strings.
dtypes = ['str', 'str', 'str', 'str', 'str', 'str']
# skiprows=1 together with explicit names: presumably skips the file's own
# header row - confirm against the CSV.
df_pos = cudf.read_csv('arabic-true-pr.csv', delimiter=',',
                   names=names, dtype=dtypes,
                   skiprows=1)
df_pos.head(15).to_pandas()

In [0]:
# Keep only the 'text' column of df_pos and label its rows 1; the sample df2
# is labelled 0. Rows of df_pos with missing values are dropped.
# NOTE(review): add_column/drop act on the existing frames, so this cell is
# presumably not safe to run twice without re-creating df_pos and df2 - confirm.
df_pos = df_pos.drop(['query', 'title', 'link', 'desc', 'other'])
df_pos.add_column('target', 1)
df_pos = df_pos.dropna()
df2.add_column('target', 0)
df2

In [0]:
# Combine the label-1 frame and the label-0 sample into one frame (the second
# argument 0 is presumably the axis, i.e. row-wise concatenation - confirm).
# NOTE: this rebinds `df`, which until now held the full preprocessed corpus.
df = cudf.core.reshape.concat([df_pos, df2], 0)

In [0]:
# Thanks Ayush Kumar
# Every article is represented by exactly MAX_LEN tokens.
MAX_LEN = 200
num_sents = df['text'].data.size()

# Tokenise on single spaces: one token collection per article.
seq = df['text'].data.split_record(' ')

# Articles longer than MAX_LEN are cut down; all others are filled up
# with the literal token 'PAD'.
for i in range(len(seq)):
    tokens = seq[i]
    n_tokens = tokens.size()
    if n_tokens > MAX_LEN:
        surplus_positions = list(range(MAX_LEN, n_tokens))
        seq[i] = tokens.remove_strings(surplus_positions)
    else:
        padding = nvstrings.to_device(['PAD'] * (MAX_LEN - n_tokens))
        seq[i] = tokens.add_strings(padding)

In [0]:
# Spot-check two of the padded sequences, then report how many there are.
for sample_idx in (40, 4):
    print(seq[sample_idx])
print(len(seq))

In [0]:
# generating the indices corresponding to each token
c = nvcategory.from_strings_list(seq)
print(c.keys_size())   # number of unique tokens (vocabulary size)
print(c.size())       # total number of token occurrences across all sequences

In [0]:
# creating gdf using unique tokens
# TASK more preprocessing - that can be tricky in Arabic
# One row per category key; this frame is joined with the pretrained vectors later.
sent_df = cudf.DataFrame({'tokens':c.keys()})
sent_df.head(10)

In [0]:
# preparing the X_train
# Allocate an int32 device array of shape (num_sents, MAX_LEN) and pass its raw
# device pointer to the category object, which presumably writes the per-token
# key indices into it - confirm against the nvcategory API.
X_train = cuda.device_array((num_sents, MAX_LEN), dtype=np.int32)
c.values(X_train.device_ctypes_pointer.value)
print(X_train.shape)

In [0]:
# preparing the y_train
# The 'target' labels are cast to float32 and exported with to_gpu_array().
y_train = df['target'].astype('float32').to_gpu_array()
print(y_train.shape)

In [0]:
# creating embedding matrix
# Left-join the unique tokens with the pretrained vectors (column '0' of pre_df
# is the join key on the vector side), so every token of sent_df is kept.
# NOTE(review): verify that the merge keeps the row order of sent_df - the
# embedding rows need to line up with the token indices stored in X_train.
vocab_df = sent_df.merge(pre_df,
                         left_on='tokens',
                         right_on='0',
                         how='left')

In [0]:
# Row count of the merged vocabulary frame; used as the length of the random
# fill vectors when the missing embeddings are filled in.
all_token = vocab_df.shape[0]
print(all_token)

In [0]:
# Drop the two join-key columns so that only the vector columns remain.
vocab_df.drop_column('0')
vocab_df.drop_column('tokens')

# filling the not found tokens with random vector
# The loop variable is deliberately not called `c`: that name is bound to the
# nvcategory object created earlier and must not be overwritten by a column name.
for col in vocab_df.columns:
  vocab_df[col] = vocab_df[col].fillna(cupy.random.normal(size=all_token)).astype(np.float32)

# embedding matrix
vocab = vocab_df.as_gpu_matrix(order='C')

In [0]:
# open issue #23067
def devndarray2tensor(dev_arr, dtyp='float32'):
    """
        Copy a Numba device array into a newly allocated CUDA torch tensor.

        dev_arr: device array whose shape defines the shape of the result
        dtyp:    'float32' or 'int32' (both are 4-byte element types)

        A DeviceNDArray view is built over the tensor's memory and dev_arr is
        copied into that view; the tensor is returned.
    """
    torch_dtypes = {'float32':torch.float32, 'int32':torch.int32}
    ITEM_BYTES = 4  # constant value of #bytes in float32 = 4

    out = torch.empty(size=dev_arr.shape, dtype=torch_dtypes[dtyp]).cuda()
    context = cuda.cudadrv.driver.driver.get_context()

    # Describe the tensor's memory to Numba: base pointer, total byte size,
    # and strides converted from element counts to bytes.
    byte_strides = [stride*ITEM_BYTES for stride in out.stride()]
    pointer = cuda.cudadrv.driver.MemoryPointer(
        context, ctypes.c_ulong(out.data_ptr()), out.numel()*ITEM_BYTES)
    tensor_view = cuda.cudadrv.devicearray.DeviceNDArray(
        out.size(), byte_strides, np.dtype(dtyp),
        gpu_data=pointer, stream=torch.cuda.current_stream().cuda_stream)

    tensor_view.copy_to_device(dev_arr)
    return out

In [0]:
# Workshop goal -> Find as many articles as possible that indicate the bitcoin price going up or down and that would be useful for a trader.

# Recommendations
'''
Literature:
1. https://roywrightme.wordpress.com/2017/11/16/positive-unlabeled-learning/
2. http://mlg.eng.cam.ac.uk/yarin/blog_3d801aa532c1ce.html

1. First, try to clean the positive dataset. You can use the free Google Translate API to understand the articles.
2. Using the clean dataset, try to train a simple model, such as logistic regression over GloVe embeddings, with the PU (positive-unlabeled) technique.
3. Using the above model, try to find additional positive articles that it ranks highly.
4. With the additional data, train a more complex model that can be extended with Variational Dropout uncertainty estimation. The recommended example is a 1D convolution.
5. By performing inference on the unlabelled dataset, try to find additional positive samples with low aleatoric uncertainty.
'''

In [0]:
#@title
