In [1]:
import spacy

In [2]:
nlp = spacy.load('en')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [3]:
nlp = spacy.load('en')
tokens = nlp(u'dog cat banana')

for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

dog dog 1.0
dog cat 0.53906965
dog banana 0.28761008
cat dog 0.53906965
cat cat 1.0
cat banana 0.48752153
banana dog 0.28761008
banana cat 0.48752153
banana banana 1.0


In [5]:
doc = nlp(u'I love coffee')
for word in doc:
    lexeme = doc.vocab[word.text]
    print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
          lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)

I 4690420944186131903 X I I True False True en
love 3702023516439754181 xxxx l ove True False False en
coffee 3197928453018144401 xxxx c fee True False False en


In [6]:
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Hello, world. Here are two sentences.')
print([t.text for t in doc])

['Hello', ',', 'world', '.', 'Here', 'are', 'two', 'sentences', '.']


In [7]:
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
apple = doc[0]
print('Fine-grained POS tag', apple.pos_, apple.pos)
print('Coarse-grained POS tag', apple.tag_, apple.tag)
print('Word shape', apple.shape_, apple.shape)
print('Alphanumeric characters?', apple.is_alpha)
print('Punctuation mark?', apple.is_punct)

billion = doc[10]
print('Digit?', billion.is_digit)
print('Like a number?', billion.like_num)
print('Like an email address?', billion.like_email)

Fine-grained POS tag PROPN 95
Coarse-grained POS tag NNP 15794550382381185553
Word shape Xxxxx 16072095006890171862
Alphanumeric characters? True
Punctuation mark? False
Digit? False
Like a number? True
Like an email address? False


In [8]:
doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

San Francisco 0 13 GPE


In [11]:
from spacy import displacy


doc_dep = nlp(u'This is a sentence.')
displacy.serve(doc_dep, style='dep')

doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google '
              u'in 2007, few people outside of the company took him seriously.')
displacy.serve(doc_ent, style='ent')


[93m    Serving on port 5000...[0m
    Using the 'dep' visualizer


    Shutting down server on port 5000.


[93m    Serving on port 5000...[0m
    Using the 'ent' visualizer


    Shutting down server on port 5000.



In [13]:
import spacy

nlp = spacy.load('en')
doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")

apple = doc[0]
banana = doc[2]
pasta = doc[6]
hippo = doc[8]

print('apple <-> banana', apple.similarity(banana))
print('pasta <-> hippo', pasta.similarity(hippo))
print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector)

apple <-> banana 0.37047377
pasta <-> hippo 0.40645224
True True True True


In [14]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"When Sebastian Thrun started working on self-driving cars at Google "
          u"in 2007, few people outside of the company took him seriously.")

dep_labels = []
for token in doc:
    while token.head != token:
        dep_labels.append(token.dep_)
        token = token.head
print(dep_labels)

['advmod', 'advcl', 'compound', 'nsubj', 'advcl', 'nsubj', 'advcl', 'advcl', 'xcomp', 'advcl', 'prep', 'xcomp', 'advcl', 'npadvmod', 'amod', 'pobj', 'prep', 'xcomp', 'advcl', 'punct', 'amod', 'pobj', 'prep', 'xcomp', 'advcl', 'amod', 'pobj', 'prep', 'xcomp', 'advcl', 'pobj', 'prep', 'xcomp', 'advcl', 'prep', 'xcomp', 'advcl', 'pobj', 'prep', 'xcomp', 'advcl', 'prep', 'xcomp', 'advcl', 'pobj', 'prep', 'xcomp', 'advcl', 'punct', 'amod', 'nsubj', 'nsubj', 'prep', 'nsubj', 'prep', 'prep', 'nsubj', 'det', 'pobj', 'prep', 'prep', 'nsubj', 'pobj', 'prep', 'prep', 'nsubj', 'dobj', 'advmod', 'punct']


In [17]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


In [16]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

Apple apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. u.k. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [19]:
doc

Autonomous cars shift insurance liability toward manufacturers

In [24]:
for token in doc:
    print(token.text, token.lemma_)

Autonomous autonomous
cars car
shift shift
insurance insurance
liability liability
toward toward
manufacturers manufacturer


In [3]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'I love my love but my love doesn\'t love me as I love my love')

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_)

I -PRON- PRON PRP nsubj
love love VERB VBP ROOT
my -PRON- ADJ PRP$ poss
love love NOUN NN dobj
but but CCONJ CC cc
my -PRON- ADJ PRP$ poss
love love NOUN NN nsubj
does do VERB VBZ aux
n't not ADV RB neg
love love VERB VB conj
me -PRON- PRON PRP dobj
as as ADP IN mark
I -PRON- PRON PRP nsubj
love love VERB VBP advcl
my -PRON- ADJ PRP$ poss
love love NOUN NN dobj


In [14]:
import codecs, csv
import pandas as pd
dat = pd.read_csv('AI_peopleschina.csv')
dat

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x92 in position 16: invalid start byte

In [15]:
import codecs, csv
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en')
STOP_WORDS = list(STOP_WORDS)

In [4]:
news = []
counter = 1
with open('AI_peopleschina.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        news.append(row)
        if counter > 5: break
        counter += 1

In [None]:
news[0]['Text']

In [16]:
counter = 1
poss = ['NOUN','ADJ','ADV']
tags = []
lemmatize = True
outnews = []
poss.append('PUNCT')

for new in news:
    doc = nlp(new['Text'])
    filtered_words  = []
    for token in doc:
        if (token.pos_ in poss or token.tag_ in tags) and token.lemma_ not in STOP_WORDS + ['-PRON-']:
            outword = token.lemma_ if lemmatize else token.text
            filtered_words.append(outword)
    print('Document %s is processed' % counter)
    new['Text'] = ' '.join(filtered_words)
    outnews.append(new)
    counter += 1

Document 1 is processed
Document 2 is processed
Document 3 is processed
Document 4 is processed
Document 5 is processed
Document 6 is processed


In [18]:
outnews[0]['Text']



## SpacyCleaner

In [1]:
import codecs, csv
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en')
STOP_WORDS = list(STOP_WORDS)

In [None]:
news = []
counter = 1
with open('AI_peopleschina.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        news.append(row)
        if counter > 5: break
        counter += 1

In [22]:
def spacy_cleaner(news_list,poss = ['NOUN','ADJ','ADV'],tags = [],lemmatize = True, cleansws = True, filterpunct=True):
    outnews = []
    if not filterpunct: poss.append('PUNCT')
    counter = 1
    sws = STOP_WORDS + ['-PRON-'] if cleansws else ['-PRON-']
    for new in news:
        doc = nlp(new['Text'])
        filtered_words  = []
        for token in doc:
            if (token.pos_ in poss or token.tag_ in tags) and token.lemma_ not in STOP_WORDS + ['-PRON-']:
                outword = token.lemma_ if lemmatize else token.text
                filtered_words.append(outword)
        print('Document %s is processed' % counter)
        new['Text'] = ' '.join(filtered_words)
        outnews.append(new)
        counter += 1
    return(outnews)

## csv_cleaner

In [3]:
def csv_cleaner(csvdest,poss = ['NOUN','ADJ','ADV'],tags = [],lemmatize = True, cleansws = True, filterpunct=True):
    # initializing
    csvname = csvdest.replace('.csv','')
    if not filterpunct: poss.append('PUNCT')
    sws = STOP_WORDS + ['-PRON-'] if cleansws else ['-PRON-']
    counter = 1
    # opening csv files, both input and output
    writefile = open(csvname + '_processed.csv','w') # output file
    with open(csvdest) as readfile:
        reader = csv.DictReader(readfile)
        writer = csv.DictWriter(writefile, fieldnames=reader.fieldnames)
        writer.writeheader()
        # reading, processing and writing files
        for news in reader:
            doc = nlp(news['Text'])
            filtered_words  = []
            filtered_lemmas = []
            for token in doc: # select words if conditions hold
                if (token.pos_ in poss or token.tag_ in tags) and token.lemma_ not in STOP_WORDS + ['-PRON-']:
                    filtered_words.append(token.text)
                    filtered_lemmas.append(token.lemma_)
            print('Document %s is processed' % counter)
            news['Text'] = ' '.join(filtered_lemmas) if lemmatize else ' '.join(filtered_words)
            writer.writerow(news)
            counter += 1
    writefile.close()

In [40]:
csv_cleaner('AI_peopleschina.csv')

Document 1 is processed
Document 2 is processed
Document 3 is processed
Document 4 is processed
Document 5 is processed


## JSON cleaner

In [4]:
import json, os


def json_cleaner(jsondest,poss = ['NOUN','ADJ','ADV'],tags = [],lemmatize = True, cleansws = True, filterpunct=True):
    # initializing
    jsonname = jsondest.replace('.json','')
    if not filterpunct: poss.append('PUNCT')
    sws = STOP_WORDS + ['-PRON-'] if cleansws else ['-PRON-']
    counter = 1
    # opening json files, both input and output
    writefile = open(jsonname + '_processed.json','w') # output file
    writefile.write('[')
    with open(jsondest) as readfile:
        for news in readfile:
            if len(news) < 10: continue
            news = news.strip()
            if news.endswith(','): news = news[:-1]
            news = json.loads(news)
            doc = nlp(news['Text'])
            filtered_words  = []
            filtered_lemmas = []
            for token in doc: # select words if conditions hold
                if (token.pos_ in poss or token.tag_ in tags) and token.lemma_ not in STOP_WORDS + ['-PRON-']:
                    filtered_words.append(token.text)
                    filtered_lemmas.append(token.lemma_)
            print('Document %s is processed' % counter)
            news['Text'] = ' '.join(filtered_lemmas) if lemmatize else ' '.join(filtered_words)
            # Writing
            writefile.write('\n')
            json.dump(news, writefile)
            writefile.write(',')
            counter += 1
    # Son iterasyonda eklenen gereksiz virgul'u sil
    writefile.seek(writefile.tell()-1)
    writefile.truncate()
    # Bir satir asagiya
    writefile.write('\n]')
    writefile.close()

## Apply on test data 

In [9]:
csv_cleaner('test.csv')

Document 1 is processed
Document 2 is processed
Document 3 is processed
Document 4 is processed
Document 5 is processed
Document 6 is processed


In [6]:
json_cleaner('test.json')

Document 1 is processed
Document 2 is processed
Document 3 is processed
Document 4 is processed


In [7]:
import pandas as pd
pd.read_csv('test_processed.csv')

Unnamed: 0.1,Unnamed: 0,source,Date,link,Text,queryword
0,0,peopleschina,00:00.0,http://en.people.cn/n/2014/1126/c202936-881422...,machine stake decade people rise advanced comp...,'% artificial intelligence%'
1,1,peopleschina,00:00.0,http://en.people.cn/n3/2017/0821/c202936-92580...,photo artificial intelligence treatment center...,'% artificial intelligence%'
2,2,peopleschina,00:00.0,http://en.people.cn/business/n3/2017/0817/c907...,second left executive director chairman execut...,'% artificial intelligence%'
3,3,peopleschina,00:00.0,http://en.people.cn/business/n3/2017/0810/c907...,telecom service provider high profit growth ha...,'% artificial intelligence%'
4,4,peopleschina,00:00.0,http://en.people.cn/n3/2017/0805/c90000-925136...,scanner traditional chinese painting time big ...,'% artificial intelligence%'
5,5,peopleschina,00:00.0,http://en.people.cn/n3/2017/0527/c90000-922154...,world weiqi player contest artificial intellig...,'% artificial intelligence%'


In [8]:
pd.read_json('test_processed.json')

Unnamed: 0,Unnamed: 1,Date,Text,link,queryword,source
0,0,2018-06-05,machine stake decade people rise advanced comp...,http://en.people.cn/n/2014/1126/c202936-881422...,'% artificial intelligence%',peopleschina
1,1,2018-06-05,photo artificial intelligence treatment center...,http://en.people.cn/n3/2017/0821/c202936-92580...,'% artificial intelligence%',peopleschina
2,2,2018-06-05,second left executive director chairman execut...,http://en.people.cn/business/n3/2017/0817/c907...,'% artificial intelligence%',peopleschina
3,3,2018-06-05,telecom service provider high profit growth ha...,http://en.people.cn/business/n3/2017/0810/c907...,'% artificial intelligence%',peopleschina
