<a href="https://colab.research.google.com/github/marek-bardonski/airev-advanced-workshops/blob/master/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# AI REV LLC - Copyrights 2020
# [Google Colab] Change runtime mode to GPU
# https://docs.rapids.ai/
#
# This cell prepares the runtime: it prints the CUDA compiler version, installs
# the Python `wget` module, pins pyarrow, installs Miniconda into /usr/local and
# then installs the RAPIDS packages (cudf, cuml) with conda.

!nvcc --version
!pip3 install wget

# https://github.com/rapidsai/cudf/issues/3390
!pip3 install pyarrow==0.15.0 ## Workaround to cover up for Google Colab bug

#!pip3 install cudf-cuda100
#!pip3 install nvstrings-cuda100
#!pip3 install cuml-cuda100
#!pip3 install nvvm-cuda100

# RAPIDS installation script. Thanks Ritchie Ng and NVIDIA Corporation.
# install miniconda (-b: batch mode, -f: no error if the prefix exists, -p: install prefix)
!wget -c https://repo.continuum.io/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh
!chmod +x Miniconda3-4.5.4-Linux-x86_64.sh
!bash ./Miniconda3-4.5.4-Linux-x86_64.sh -b -f -p /usr/local

# install RAPIDS packages
!conda install -q -y --prefix /usr/local -c conda-forge \
  -c rapidsai-nightly/label/cuda10.0 -c nvidia/label/cuda10.0 \
  cudf cuml

Remember to restart the runtime at this point, before running the cells below, so that the freshly installed packages are picked up.

In [0]:
# set environment vars
# The path/env/library setup below has to happen BEFORE the RAPIDS packages and
# numba are imported: the conda install from the first cell targets
# /usr/local, and its site-packages directory is only importable once it has
# been appended to sys.path.
import sys, os, shutil

conda_site_packages = '/usr/local/lib/python3.6/site-packages/'
if conda_site_packages not in sys.path:
  sys.path.append(conda_site_packages)
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'

# copy .so files to current working dir
for fn in ['libcudf.so', 'librmm.so']:
  shutil.copy('/usr/local/lib/'+fn, os.getcwd())

# standard library
import json
from zipfile import ZipFile

# third-party packages available in the Colab runtime / installed with pip
import nltk
import numpy as np
import wget
from numba import cuda

# RAPIDS packages installed with conda in the first cell
import cudf
import nvcategory
import nvstrings
import nvtext
#import cuml (bug https://gitmemory.com/issue/rapidsai/cuml/404/477551898)

In [0]:
# Raw datasets are samples from WebHose.io
print('Beginning dataset download with wget module')

# Download each archive only when it is not already in the working directory,
# so re-running this cell does not fetch the data a second time.
for url in ['https://bardonski.pl/chineese.zip',
            'https://bardonski.pl/arabic.zip']:
    filename = os.path.basename(url)
    if not os.path.exists(filename):
        wget.download(url, out=filename)

In [0]:
# Unpack the downloaded archive into the working directory, then unpack the
# inner archive into the `chineese` directory. Context managers close the zip
# handles, and the local names no longer shadow the builtin `zip`.
with ZipFile('chineese.zip') as outer_archive:
    outer_archive.extractall()

# exist_ok keeps the cell re-runnable (a plain mkdir errors when the dir exists)
os.makedirs('chineese', exist_ok=True)

with ZipFile('630_webhose-2016-10_20170904084325.zip') as articles_archive:
    articles_archive.extractall('chineese')

In [0]:
# Unpack the downloaded archive into the working directory, then unpack the
# inner archive into the `arabic` directory. Context managers close the zip
# handles, and the local names no longer shadow the builtin `zip`.
with ZipFile('arabic.zip') as outer_archive:
    outer_archive.extractall()

# exist_ok keeps the cell re-runnable (a plain mkdir errors when the dir exists)
os.makedirs('arabic', exist_ok=True)

with ZipFile('627_webhose-2016-10_20170904083346.zip') as articles_archive:
    articles_archive.extractall('arabic')

In [0]:
# Sanity check: count the entries of each extracted directory.
#Expected 236384 Arabic articles
#Expected 316004 Chinese articles (stored in the `chineese` directory)
# NOTE(review): `ls -l` prints a leading "total" line, so `wc -l` presumably
# reports one more than the number of files — confirm whether the expected
# numbers above already include that extra line.
!ls arabic -l | wc -l
!ls chineese -l | wc -l


In [0]:
# Thanks VibhuJawa
def get_text(lines):
    """
        Extract the article text from the lines of one JSON file.

        Only the first line is parsed: it is expected to hold a JSON object
        with a 'text' key.

        lines: list of strings, as returned by file.readlines()

        returns a one-element list [text], or None when there is nothing to
        parse (no lines at all, or a blank first line) so the caller can skip
        the file instead of crashing

        raises json.JSONDecodeError if the first line is not valid JSON and
        KeyError if the decoded object has no 'text' key
    """
    # An empty file yields no lines; lines[0] would raise IndexError.
    if not lines or not lines[0].strip():
        return None
    decoded = json.loads(lines[0])
    return [decoded['text']]

def get_txt_lines(data_dir):
    """
        Read the article text of every file in data_dir.

        Each regular file is opened as UTF-8 (undecodable bytes are ignored)
        and handed to get_text; files for which get_text returns None are
        skipped.

        returns (text_ls,fname_ls) where
        text_ls = list of article texts and fname_ls = list of the matching
        file names with their extension removed
    """
    text_ls = []
    fname_ls = []
    # os.listdir gives no ordering guarantee; sorting makes the result (and
    # every later head()/slice of it) reproducible between runs.
    for fn in sorted(os.listdir(data_dir)):
        full_fn = os.path.join(data_dir, fn)
        # sub-directories cannot be opened as files
        if not os.path.isfile(full_fn):
            continue
        with open(full_fn, encoding="utf-8", errors="ignore") as f:
            content = get_text(f.readlines())
        if content is not None:
            text_ls += content
            # strip whatever extension the file has instead of assuming that
            # it is exactly four characters long
            fname_ls += [os.path.splitext(fn)[0]] * len(content)

    return text_ls, fname_ls
    
# Load the Arabic articles from disk and time the read.
print("File Read Time:")
%time txt_ls,fname_ls = get_txt_lines('arabic')
# Start from an empty cuDF DataFrame; the 'text' column is added below.
df = cudf.DataFrame()

# Copy the Python strings to the device via nvstrings and time the transfer.
print("\nCUDF  Creation Time:")
%time df['text'] = nvstrings.to_device(txt_ls)

In [0]:
# Report how many rows were loaded (thousands separated by commas).
print("Number of lines in the DF = {:,}".format(len(df)))

In [0]:
# Preview the first 10 rows, converted to pandas for display.
df.head(10).to_pandas()

In [0]:
# The stopword corpus is a separate nltk data package; without it the lookup
# below raises LookupError on a fresh runtime. The download is a no-op when
# the corpus is already up to date.
nltk.download('stopwords', quiet=True)
STOPWORDS = nltk.corpus.stopwords.words('arabic')

# Literal strings that preprocess_text replaces with a space.
# '\\~' is spelled with an explicit backslash: the previous '\~' is an invalid
# escape sequence (a deprecation warning today) that evaluates to the same two
# characters, so the list contents are unchanged.
# NOTE(review): '\\n' is a backslash followed by the letter n, not a newline —
# confirm that this is intended.
filters = [ '!', '"', '#', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/',  '\\', ':', ';', '<', '=', '>',
           '?', '@', '[', ']', '^', '_', '`', '{', '|', '}', '\\~', '\t','\\n',"'",",",'~' , '—']

def preprocess_text(input_strs , filters=None , stopwords=STOPWORDS):
    """
        Clean a string column:

        * filter punctuation (skipped when no filters are given)
        * to_lower
        * remove stop words (from nltk corpus by default; skipped when empty)
        * replace multiple spaces with one
        * remove leading/trailing spaces

        input_strs: string Series (it is accessed through `.str` and `.data`)
        filters: list of literal strings to replace with a space, or None
        stopwords: list of tokens to replace with a space

        returns the cleaned strings as a cudf.Series
    """

    # filter punctuation and case conversion
    # The default filters=None means "no punctuation filtering"; it must not
    # be handed to replace_multi as a pattern list.
    if filters is not None and len(filters) > 0:
        input_strs = input_strs.str.replace_multi(filters, ' ', regex=False)
    input_strs = input_strs.str.lower()

    # remove stopwords
    if stopwords is not None and len(stopwords) > 0:
        stopwords_gpu = nvstrings.to_device(stopwords)
        input_strs = nvtext.replace_tokens(input_strs.data, stopwords_gpu, ' ')
        # wrap the result in a Series again so the `.str` calls below keep working
        input_strs = cudf.Series(input_strs)

    # replace multiple spaces with single one and strip leading/trailing spaces
    input_strs = input_strs.str.replace(r"\s+", ' ', regex=True)
    input_strs = input_strs.str.strip(' ')

    return input_strs

def preprocess_text_df(df, text_cols=['text'], **kwargs):
    """
        Run preprocess_text over each column named in text_cols.

        The columns are overwritten in df itself, one after another in the
        order given; extra keyword arguments are forwarded to preprocess_text.

        returns the same df object that was passed in
    """
    for column_name in text_cols:
        cleaned_column = preprocess_text(df[column_name], **kwargs)
        df[column_name] = cleaned_column
    return df

# Clean the 'text' column (the default of text_cols) and time the run.
# Note: this overwrites df['text'], so re-running the cell re-processes
# already cleaned text.
%time df = preprocess_text_df(df, filters=filters)

In [0]:
# Preview the first 5 rows after preprocessing.
df.head(5).to_pandas()

In [0]:
# Work on the first 100,000 rows only.
df2 = df.head(100000)

In [0]:
# Convert the whole subset to pandas for display.
df2.to_pandas()

In [0]:
# Count the rows whose text contains the Arabic word for "bitcoin".
# str.find gives the position of the first match, or -1 when there is none, so
# the positions must be turned into a found/not-found flag before summing;
# adding up the raw positions does not yield a count.
(df2['text'].str.find('بيتكوين') >= 0).sum()

In [0]:
#Arabic word vectors (fasttext.cc)
url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.vec.gz'
# Skip the download when the file is already in the working directory, so
# re-running the notebook does not fetch it again.
vectors_filename = os.path.basename(url)
if not os.path.exists(vectors_filename):
    wget.download(url, out=vectors_filename)


In [0]:
# Read a whitespace-separated word-vector file without a header row into a
# cuDF DataFrame and print its first rows.
# NOTE(review): "glove.6B.50d.txt" is not downloaded by any cell visible here
# (the previous cell fetches cc.ar.300.vec.gz) — confirm where this file is
# expected to come from.
pre_df = cudf.read_csv("glove.6B.50d.txt",
                       header=None,
                       delim_whitespace=True,
                       quoting=3)  #ignore quoting
print(pre_df.head())