In [1]:
import collections
from typing import Dict, List, Tuple


def text2bow(words: List[str], dictionary: Dict[str, int]) -> List[Tuple[int, int]]:
    word_frequences = collections.defaultdict(int)
    for word in words:
        if word not in dictionary:
            dictionary[word] = len(dictionary)
        word_frequences[dictionary[word]] += 1
 
    return list(word_frequences.items())

In [2]:
with open("C:\\Users\\znowak\Anaconda3\\Lib\\site-packages\\pattern\\text\\en\\en-lexicon.txt", "r", encoding = "utf8") as f:
    print(f.readlines()[0])

;;;   



In [3]:
%load_ext Cython

In [4]:
%%cython -a

def fibonacci(n):    
    if n < 0:
        print("1st fibonacci number = 0")    
    elif n == 1:
        return 0    
    elif n == 2:
        return 1    
    else:
        return fibonacci(n-1) + fibonacci(n-2)

In [5]:
%%cython -a
with open("C:\\Users\\znowak\Anaconda3\\Lib\\site-packages\\pattern\\text\\en\\en-lexicon.txt", "r", encoding = "utf8") as f:
    print(f.readlines()[0])

;;;   



In [6]:
%%cython -f
# distutils: extra_compile_args = -fopenmp
# distutils: extra_link_args = -fopenmp
# cython: language_level=3
# cython: embedsignature=True
# cython: profile=True
# cython: boundscheck=False
# coding: utf8

from libc.stdlib cimport malloc, realloc, free
from libc.stdio cimport fopen, fclose, FILE, EOF, fseek, SEEK_END, SEEK_SET
from libc.stdio cimport ftell, fgetc, fgets, getc, gets, feof, fread, getline
from libc.string cimport strlen, memcpy, strcpy, strtok, strchr, strncpy
from cython.parallel import prange, parallel, threadid

# - C structure that is set to readonly
cdef readonly struct FileContents:
    char *contents
    
cdef class CyReadFile:
    """Read in the contents of a file."""
    cdef:
        FileContents *File
        FILE *fp
        char *filename
        char *delimiter
        long file_size
        bint is_open
        bint EO_STR
    
    def __init__(self, char *delimiter, char *filename):
        self.File = <FileContents*>malloc(sizeof(CyReadFile))
        self.delimiter = delimiter
        self.filename = filename
        self.File.contents = NULL
        self.is_open = 0
        self.EO_STR = 0
        self.file_size = 0
        self.fp = NULL
        
    def open_file(self):
        """Open the file for reading."""
        self.fp = fopen(self.filename, "r")
        if self.fp == NULL:
            raise FileNotFoundError(2, "No such file or directory: '%s'" % self.filename)
        else:
            # file is now open
            self.is_open = 1
    
    def read_in_file(self):
        """Read in the entire file."""
        if self.is_open == 1:
            # get the length of the file
            fseek(self.fp, 0, SEEK_END)
            self.file_size = ftell(self.fp)
            fseek(self.fp, 0, SEEK_SET)
            # allocate memory for reading in the file
            self.File.contents = <char*>malloc(self.file_size*sizeof(char))
            # read entire file into the struct
            fread(self.File.contents, 1, self.file_size, self.fp)
            # close the file once it's read into the char array
            fclose(self.fp)
            # set is_open to 0
            self.is_open = 0
              
    def read_file_in_parallel(self):
        """Bypass the gil and read in the file."""
        if self.is_open == 1:
            with nogil:
                # get the length of the file
                fseek(self.fp, 0, SEEK_END)
                self.file_size = ftell(self.fp)
                fseek(self.fp, 0, SEEK_SET)
                # allocate memory for reading in the file
                self.File.contents = <char*>malloc(self.file_size*sizeof(char))
                # read entire file into the struct
                fread(self.File.contents, 1, self.file_size, self.fp)
                # close the file once it's read into the char array
                fclose(self.fp)
                # set is_open to 0
                self.is_open = 0
    
    def __dealloc__(self):
        """Deallocate memory"""
        free(self.File.contents)
        free(self.File)
        free(self.fp)
        free(self.filename)
        free(self.delimiter)
        
            
# - To use the cython class, we must create a python subclass that inherits from it.
# - I will set the cython variables concretely in the Python subclass

# test data
emlFile = "en-lexicon.txt"

class PyReadFile(CyReadFile):
    """A python wrapper around a cython class."""
    def __init__(self):
        super().__init__(b',', emlFile)
    
        
def py_read_file(filename):
    with open(filename, "r", encoding="utf-8") as f:
        return f.read()

In [7]:
import os
print(os.getcwd())


C:\Users\znowak\Documents\Projects\testNLP\dev


In [8]:
%%cython 
from __future__ import print_function
from libc.stdio cimport FILE, fopen, fseek, fclose, SEEK_END, SEEK_SET, ftell, fread
from libc.stdlib cimport malloc, free
from cpython.exc cimport PyErr_SetFromErrnoWithFilenameObject


def open_file_cy():
    cdef FILE* p
    p = fopen("en-lexicon.txt", "r")
    if p is NULL:
        PyErr_SetFromErrnoWithFilenameObject(OSError, "spam.txt")
        return None
    else:
        # get the length of the file
        fseek(p, 0, SEEK_END)
        file_size = ftell(p)
        fseek(p, 0, SEEK_SET)
        # allocate memory for reading in the file
        contents = <char*>malloc(file_size*sizeof(char))
        # read entire file into the struct
        fread(contents, 1, file_size, p)
        # close the file once it's read into the char array
        fclose(p)
        return contents
        
def open_file_py():
    with open("en-lexicon.txt", "r", encoding="utf-8") as f:
        return f.read()

In [9]:
%timeit open_file_cy()
%timeit open_file_py()

5.94 ms ± 496 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
28.4 ms ± 4.37 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
open_file_cy()



In [11]:
%%cython
from __future__ import print_function
from libc.stdio cimport FILE, fopen, fseek, fclose, SEEK_END, SEEK_SET, ftell, fread, getc
from libc.stdlib cimport malloc, free
from cpython.exc cimport PyErr_SetFromErrnoWithFilenameObject
# Count the number of NN
def count_lines_cy(char* string):
    cdef int count = 0
    for c in string:
        if (c == b'\n'):
            count = count + 1;
    return count

def count_lines_py(string):
    return string.count(b'\n')

In [12]:
string = open_file_cy()
%timeit count_lines_cy(string)
%timeit count_lines_py(string)

#94258

4.12 ms ± 583 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.93 ms ± 32.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
%timeit string.split(b"\n")
string_list = string.split(b'\n')

11.2 ms ± 494 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
%%cython
from __future__ import print_function
cimport cython
from libc.stdio cimport FILE, fopen, fseek, fclose, SEEK_END, SEEK_SET, ftell, fread, fgets, rewind
from libc.stdlib cimport malloc, free
from cpython.exc cimport PyErr_SetFromErrnoWithFilenameObject
from libc.string cimport strcpy, strlen


def read_lines():
    cdef:
        char** array        # Array of lines */
        int    i                   # Loop counter */
        char   line[100]           # Buffer to read each line */
        int    line_count          # Total number of lines */
        int    line_length         # Length of a single line */
        FILE* txt   #file of interest */
        
    # Open the file
    txt = fopen("en-lexicon.txt", "r")
    if txt is NULL:
        PyErr_SetFromErrnoWithFilenameObject(OSError, "en-lexicon.txt")
        return

    # Clear output parameter. */
    array = NULL
    arr = []
    
    # Get the count of lines in the file */
    line_count = 0;
    while (fgets(line, sizeof(line), txt) != NULL):                                     
        line_count+=1;

    # Move to the beginning of file. */
    rewind(txt);

    # Allocate an array of pointers to strings 
     # (one item per line). */
    array = <char**>malloc(line_count * sizeof(char*));
    if (array == NULL):
        return; 


    # Read each line from file and deep-copy in the array. */
    for i in range(line_count):
        # Read the current line. */
        fgets(line, sizeof(line), txt);

        # Remove the ending '\n' from the read line. */
        line_length = strlen(line);        
        line[line_length - 1] = b'\0';
        line_length-=1; # update line length */

        # Allocate space to store a copy of the line. +1 for NUL terminator */
        array[i] = <char*>malloc(line_length + 1);

        # Copy the line into the newly allocated space. */
        strcpy(array[i], line);
        arr.append(<object>line)
    return arr

In [15]:
%%cython
from __future__ import print_function
cimport cython
from libc.stdio cimport FILE, fopen, fseek, fclose, SEEK_END, SEEK_SET, ftell, fread, fgets, rewind
from libc.stdlib cimport malloc, free
from cpython.exc cimport PyErr_SetFromErrnoWithFilenameObject
from libc.string cimport strcpy, strlen
import re


def read_lines():
    cdef:
        int    i                   # Loop counter */
        char   line[100]           # Buffer to read each line */
        int    line_count          # Total number of lines */
        int    line_length         # Length of a single line */
        FILE* txt                  # File of interest */
        
    # Open the file
    txt = fopen("en-lexicon.txt", "r")
    
    # Clear output parameter. */
    arr = []
    
    # Get the count of lines in the file */
    line_count = 94258


    # Read each line from file and deep-copy in the array. */
    for i in range(line_count):
        # Read the current line. */
        fgets(line, sizeof(line), txt);
        length = strlen(line)
        if line[length-3] == b'N':
            if line[length-4] == b' ':
                temp = line[:length-4]
                arr.append(temp)
            else:
                temp = line[:length-5]
                arr.append(temp)
        
    fclose(txt)
    return arr
def read_lines_py():
    cdef FILE* txt                  # File of interest */
    
    # Open the file
    p = fopen("en-lexicon.txt", "r")
    # get the length of the file
    fseek(p, 0, SEEK_END)
    file_size = ftell(p)
    fseek(p, 0, SEEK_SET)
    # allocate memory for reading in the file
    contents = <char*>malloc(file_size*sizeof(char))
    # read entire file into the struct
    fread(contents, 1, file_size, p)
    # close the file once it's read into the char array
    fclose(p)
    temp = (<bytes>contents).decode('utf-8')
    # temp = <bytes>contents
    # return temp.split(b"\n") #15 ms
    return re.findall(r'\w+(?= NN)', temp)

In [16]:
%timeit read_lines()
%timeit read_lines_py()

23.9 ms ± 999 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
191 ms ± 1.74 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
read_lines()

[b'#GLEE',
 b'#Glee',
 b'#fail',
 b'#fb',
 b'#glee',
 b'#shoutout',
 b'#win',
 b'%',
 b'%CHG',
 b'&0C.',
 b"''.",
 b"'40s",
 b"'50s",
 b"'60s",
 b"'70s",
 b"'80s",
 b"'89s",
 b"'90s",
 b"'A",
 b"'BS",
 b"'Channel",
 b"'Chief",
 b"'D",
 b"'Dividend",
 b"'Do",
 b"'Em",
 b"'God",
 b"'Goodison",
 b"'Guesstimates",
 b"'Hagura",
 b"'Happy",
 b"'Here's",
 b"'I've",
 b"'Il",
 b"'K",
 b"'Lady",
 b"'MTV",
 b"'Ma",
 b"'N",
 b"'Nightline",
 b"'Ounce",
 b"'People",
 b"'Poltergeist",
 b"'R",
 b"'Son",
 b"'Sweets",
 b"'Tahiti",
 b"'The",
 b"'Three's",
 b"'Tide",
 b"'To",
 b"'Today",
 b"'Unsolved",
 b"'War",
 b"'Which",
 b"'Who",
 b"'X",
 b"'You",
 b"'d.",
 b"'pache",
 b"'thirties",
 b',..',
 b'--Boca',
 b'--Bordeaux',
 b'--China',
 b'--Dell',
 b'--Dorothy',
 b'--George',
 b'--Hitachi',
 b'--Mrs',
 b'--Thailand',
 b'--Tokyo',
 b'--William',
 b'--agreed',
 b'--dividends',
 b'--in',
 b'--meal',
 b'--players',
 b'--products',
 b'--subjects',
 b'--wines',
 b'-16-degrees CD',
 b'-20-degrees CD',
 b'-20-deg

In [18]:
import fast
words = read_lines()
fast.t2bow(words)

[(8733587623715143684, 1),
 (4842599486780342283, 1),
 (11512552109288456204, 1),
 (4513208660830060557, 1),
 (14662381588442906641, 1),
 (15536272596882161682, 1),
 (16455515915026432021, 1),
 (16742990409685663767, 1),
 (16173034612666007580, 1),
 (1255273752809308191, 1),
 (1151439422270603295, 1),
 (17553938765897924639, 1),
 (14651438934890774562, 1),
 (9760615298323578916, 1),
 (2189164929004470309, 1),
 (358151967250579495, 1),
 (17127722596980097064, 1),
 (10487657834148528169, 1),
 (443513706543317033, 1),
 (5635023407846260779, 1),
 (1031767000519409708, 1),
 (8411414691203252267, 1),
 (6981348057124306991, 1),
 (9183295508616970288, 1),
 (16237875781125275697, 1),
 (6100905980968304697, 1),
 (529943897327272000, 1),
 (6765000088045224007, 1),
 (14283686172931391561, 1),
 (14036817388615106640, 1),
 (3933033240984420435, 1),
 (4117679061207089239, 1),
 (6828737245181378649, 1),
 (18181585195071570012, 1),
 (16360840177772396637, 1),
 (6244336202041262180, 1),
 (18443083261855

In [32]:
%%cython
from __future__ import print_function
cimport cython
from libc.stdio cimport FILE, fopen, fseek, fclose, SEEK_END, SEEK_SET, ftell, fread, fgets, rewind
from libc.stdlib cimport malloc, free
from cpython.exc cimport PyErr_SetFromErrnoWithFilenameObject
from libc.string cimport strcpy, strlen
import string

def read_file_py(filename):
    cdef FILE* txt                  # File of interest */
    
    # Open the file
    p = fopen(filename, "r")
    # get the length of the file
    fseek(p, 0, SEEK_END)
    file_size = ftell(p)
    fseek(p, 0, SEEK_SET)
    # allocate memory for reading in the file
    contents = <char*>malloc(file_size*sizeof(char))
    # read entire file into the struct
    fread(contents, 1, file_size, p)
    # close the file once it's read into the char array
    fclose(p)
    #temp = (<bytes>contents).decode('utf-8')
    temp = <bytes>contents
    punc = string.punctuation.encode()
    temp = temp.translate(None, punc)
    return temp.lower().split()

In [31]:
%timeit read_file_py(b"scripts\\10-Things-I-Hate-About-You.txt")

2.88 ms ± 166 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
def clean(list words, dict nouns):
    # convert to hash
    # check to see if it is there
    # if it is not remove from list

In [38]:
from collections import Counter
words = read_file_py(b"scripts\\10-Things-I-Hate-About-You.txt")

%timeit fast.t2bow(words)
%timeit Counter(words)

1.97 ms ± 41.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.28 ms ± 58.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Counter({b'if': 35,
         b'toplocationhreflocationhref': 1,
         b'written': 1,
         b'by': 49,
         b'karen': 1,
         b'mccullah': 1,
         b'lutz': 1,
         b'amp': 1,
         b'kirsten': 1,
         b'smith': 1,
         b'based': 1,
         b'on': 125,
         b'taming': 2,
         b'of': 215,
         b'the': 698,
         b'shrew': 3,
         b'william': 9,
         b'shakespeare': 8,
         b'revision': 1,
         b'november': 1,
         b'12': 1,
         b'1997': 1,
         b'welcome': 2,
         b'to': 437,
         b'padua': 4,
         b'high': 8,
         b'school': 23,
         b'your': 74,
         b'typical': 1,
         b'urbansuburban': 1,
         b'in': 208,
         b'portland': 1,
         b'oregon': 1,
         b'smarties': 1,
         b'skids': 1,
         b'preppies': 1,
         b'granolas': 1,
         b'loners': 1,
         b'lovers': 1,
         b'and': 401,
         b'out': 114,
         b'crowd': 12,
         b'rub': 1

In [95]:
%%cython
from __future__ import print_function
import numpy as np
cimport numpy as cnp

cnp.import_array()  # needed to initialize numpy-API

#cpdef return_empty():
#    cdef cnp.npy_intp dim = 10
#    return cnp.PyArray_ZEROS(1, &dim, cnp.NPY_INT32,1)

def return_empty():
    cdef cnp.ndarray[cnp.int_t, ndim=1, mode='c'] data
    data = np.zeros(10, dtype=int)

    cdef unsigned int* buff = <unsigned int*>data
    print(data)
    


Error compiling Cython file:
------------------------------------------------------------
...

def return_empty():
    cdef cnp.ndarray[cnp.int_t, ndim=1] data
    data = np.zeros(10, dtype=int)

    cdef unsigned int* buff = <unsigned int*>data
                             ^
------------------------------------------------------------

C:\Users\znowak\.ipython\cython\_cython_magic_dce949eaebc93616c84d1bf1a55d8f0c.pyx:15:30: Python objects cannot be cast to pointers of primitive types


In [94]:
return_empty()

<class 'numpy.ndarray'>
