# Psychoinformatics - Week 13 (Examples)
by Tsung-Ren (Tren) Huang (trhuang@g.ntu.edu.tw)

In [1]:
%config IPCompleter.greedy=True 
%matplotlib inline
import numpy as np
from matplotlib.pyplot import *
from IPython.display import *
import re
import warnings
warnings.filterwarnings("ignore") 

## 1 Regular Expression

### 1.0 Basic String Operations

In [2]:
a='This is a built-in module'
print(a.lower(),a.upper())
print(a.split(' '),a.split('-'))
print(a.find('built'),a[10:15])
print(a.replace('module','library'))
print(a.count('is'))
print(a.center(50,'='))

this is a built-in module THIS IS A BUILT-IN MODULE
['This', 'is', 'a', 'built-in', 'module'] ['This is a built', 'in module']
10 built
This is a built-in library
2


### 1.1 Verification

In [3]:
import re
print(bool(re.match('[A-Za-z]\d{9}','S123456789')))

True


In [4]:
print(bool(re.match('[^@]+@[^@]+\.[^@]+','a@b.c')))

True


### 1.2 Search, Match, & Find

In [5]:
regex=re.compile('abc',re.IGNORECASE)
for txt in ['abc','hello abc','hi AbC aBc']:
    print('-'*50)
    out=regex.search(txt) #inexact match
    if(out): print('search() found:',out.string)
    out=regex.match(txt) #exact match
    if(out): print('match() found:',out.string)
    out=regex.findall(txt) #search into a list
    if(out): print('findall() found:',out)

--------------------------------------------------
search() found: abc
match() found: abc
findall() found: ['abc']
--------------------------------------------------
search() found: hello abc
findall() found: ['abc']
--------------------------------------------------
search() found: hi AbC aBc
findall() found: ['AbC', 'aBc']


### 1.3 Analysis of retweets

In [6]:
tweets=['RT @spiketren  No class tomorrow','No class tomorrow (via @spiketren)']
rt=re.compile('(RT|via) (@\w+)')
#rt=re.compile('(RT|\(via) (@\w+\)*)')
for t in tweets:
    m=rt.search(t)
    print(m.group(0),m.group(1),m.group(2));
    #print(m[0],m[1],m[2]);
    #print(m[2])

RT @spiketren RT @spiketren
via @spiketren via @spiketren


### 1.4 Collection of email addresses

In [7]:
html='<body><b>test</b><img src=test.jpg></body>'
print(re.sub('<[^<]*>','',html))
e='@mail.ncku.edu.tw'
t='pichun_huang,chendy'
print(re.sub('\w+','\g<0>'+e,t))

test
pichun_huang@mail.ncku.edu.tw,chendy@mail.ncku.edu.tw


## 2 Traditional Natural Language Processing (NLP)

### 2.0 Download text corpora

### 2.1 Lexical diversity & Big words

In [None]:
import nltk
nltk.download('all')

In [9]:
a='cat'
print(len(a))

3


In [10]:
from nltk.book import text4 
print(len(set(text4))/len(text4)) # lexical diversity
long_words=[w for w in set(text4) if len(w)>10] 
print(len(long_words)) # number of big words

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus


LookupError: 
**********************************************************************
  Resource [93mwebtext[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('webtext')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/webtext[0m

  Searched in:
    - '/Users/tren/nltk_data'
    - '/Users/tren/opt/anaconda3/nltk_data'
    - '/Users/tren/opt/anaconda3/share/nltk_data'
    - '/Users/tren/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


### 2.2 Tokenization & Word distributions

In [11]:
import nltk
mytxt='This is a cat. That is a dog.'
print(nltk.sent_tokenize(mytxt))
text=nltk.Text(nltk.word_tokenize(mytxt))
text.plot()
text4.dispersion_plot(['democracy','freedom','duties'])
dist=nltk.FreqDist(text4)
print(dist['freedom'])
print([w for w in dist.keys() if dist[w]>1000])

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/tren/nltk_data'
    - '/Users/tren/opt/anaconda3/nltk_data'
    - '/Users/tren/opt/anaconda3/share/nltk_data'
    - '/Users/tren/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


### 2.3 Term Frequency–Inverse Document Frequency (TF-IDF)

#### 2.3.1 Functions

In [12]:
import math
def tf(word, count): #count是一個word count的dictionary
    return count[word] / sum(count.values())
def nDoc_have(word, count_list):
    return sum(1 for count in count_list if word in count)
def idf(word, count_list):
    return math.log(len(count_list)) / (1 + nDoc_have(word, count_list))
def tfidf(word, count, count_list):
    return tf(word, count) * idf(word, count_list)

#### 2.3.2 Testing data

In [13]:
doc=['']*3
doc[0]=['a','a','b','c','d']
doc[1]=['a','b','b','c','d']
doc[2]=['a','b','c','c','d']

#construct word counts:
count = [{}, {}, {}]

for d in range(3):
    for word in doc[d]:
        if word not in count[d]:
            count[d][word]=1
        else:
            count[d][word]+=1
            
print(count)

[{'a': 2, 'b': 1, 'c': 1, 'd': 1}, {'a': 1, 'b': 2, 'c': 1, 'd': 1}, {'a': 1, 'b': 1, 'c': 2, 'd': 1}]


#### 2.3.3 TF-IDF

In [14]:
print(tfidf('a',count[0],count)) # "a" in the first document

0.10986122886681099


In [15]:
#print the tf-idf of each word in each documetes
keywords=[]
for d in range(3):
    print("Top words in document {}".format(d+1))
    scores={word: tfidf(word,count[d],count) for word in count[d]}
    sorted_words=sorted(scores.items(),key=lambda x:x[1],reverse=True)
    for word, score in sorted_words[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word,round(score,5)))
        #create a list of keywords:
        keywords+=word

Top words in document 1
	Word: a, TF-IDF: 0.10986
	Word: b, TF-IDF: 0.05493
	Word: c, TF-IDF: 0.05493
Top words in document 2
	Word: b, TF-IDF: 0.10986
	Word: a, TF-IDF: 0.05493
	Word: c, TF-IDF: 0.05493
Top words in document 3
	Word: c, TF-IDF: 0.10986
	Word: a, TF-IDF: 0.05493
	Word: b, TF-IDF: 0.05493


#### 2.3.4 TF-IDF-based document vectors

In [16]:
keywords=list(set(keywords)) #remove duplicates
print(keywords)

#create a feature vector for each document:
fv=['']*3
for d in range(3):
    idx=0 #feature index
    fv[d]=[tfidf(word,count[d],count) for word in keywords]
print(fv)

['c', 'a', 'b']
[[0.054930614433405495, 0.10986122886681099, 0.054930614433405495], [0.054930614433405495, 0.054930614433405495, 0.10986122886681099], [0.10986122886681099, 0.054930614433405495, 0.054930614433405495]]


### 2.4 Chinese "word" segementation

In [18]:
#!pip install jieba
import jieba
text='結巴到底會不會成功地分解這行字呢?'
wordlist=jieba.cut(text,cut_all=False)
print("|".join(wordlist))
wordlist=jieba.cut(text,cut_all=True)
print("|".join(wordlist))

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/9p/gv8q6rsj23960sv_fdkvh0jm0000gn/T/jieba.cache
Loading model cost 0.536 seconds.
Prefix dict has been built successfully.


結巴|到底|會不會|成功|地|分解|這行|字|呢|?
結|巴|到底|會|不|會|成功|地|分解|這|行|字|呢|?


### 2.5 Topic Modeling

#### 2.5.0 Word ID & Dictionary

In [22]:
import gensim
import gensim.downloader as api
corpus=api.load('text8')
dictionary=gensim.corpora.Dictionary(corpus) # generate a dictionary from the text corpus
print(dictionary)

Dictionary<253854 unique tokens: ['a', 'abacus', 'abilities', 'ability', 'able']...>


In [23]:
print(dictionary.doc2bow(['a', 'abacus', 'abilities', 'ability']))
corpus2=[dictionary.doc2bow(word) for word in corpus]

[(0, 1), (1, 1), (2, 1), (3, 1)]


#### 2.5.1 Latent Dirichlet Allocation (LDA)

In [24]:
model=gensim.models.ldamodel.LdaModel(corpus2, num_topics=5, id2word=dictionary) #LDA training

In [25]:
model.print_topics(num_words=5)

[(0, '0.057*"the" + 0.035*"of" + 0.031*"one" + 0.024*"and" + 0.021*"a"'),
 (1, '0.062*"the" + 0.024*"of" + 0.023*"a" + 0.023*"one" + 0.023*"and"'),
 (2, '0.072*"the" + 0.035*"of" + 0.023*"one" + 0.021*"to" + 0.020*"in"'),
 (3, '0.054*"the" + 0.034*"of" + 0.029*"one" + 0.028*"and" + 0.022*"in"'),
 (4, '0.056*"the" + 0.043*"of" + 0.028*"and" + 0.025*"in" + 0.020*"one"')]

## 3 Modern Natural Language Processing (NLP)

### 3.1 Word2Vec
Check <a href="https://github.com/RaRe-Technologies/gensim-data">this</a> & <a href="https://github.com/3Top/word2vec-api#where-to-get-a-pretrained-models">this</a> for more text datasets & pretained models.

#### 3.1.0 Train a model

In [26]:
import gensim
import gensim.downloader as api
corpus=api.load('text8')

In [27]:
model = gensim.models.word2vec.Word2Vec(corpus) # training

In [28]:
model.wv.most_similar("car") # testing

[('driver', 0.7849211692810059),
 ('motorcycle', 0.7434033751487732),
 ('cars', 0.7229942679405212),
 ('taxi', 0.7160823941230774),
 ('vehicle', 0.6912055611610413),
 ('truck', 0.686586320400238),
 ('cab', 0.6670401096343994),
 ('racing', 0.6656779646873474),
 ('passenger', 0.6519773006439209),
 ('sidecar', 0.6474007964134216)]

#### 3.1.1 Man:King :: Woman:?

In [29]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.6857359409332275),
 ('prince', 0.6322767734527588),
 ('princess', 0.6159293055534363),
 ('empress', 0.6010196805000305),
 ('throne', 0.6002517342567444),
 ('regent', 0.5805041193962097),
 ('elizabeth', 0.5800679922103882),
 ('kings', 0.5714446902275085),
 ('emperor', 0.5714309811592102),
 ('mary', 0.5664592385292053)]

#### 2.6.2 Man:Doctor :: Woman:?

In [32]:
model.wv.most_similar(positive=['woman', 'doctor'], negative=['man'])

[('child', 0.5809516310691833),
 ('teacher', 0.5554264187812805),
 ('nurse', 0.5533174872398376),
 ('murderer', 0.5325186848640442),
 ('detective', 0.5162506103515625),
 ('lesbian', 0.5101373791694641),
 ('priestess', 0.5079809427261353),
 ('prostitute', 0.5074547529220581),
 ('helen', 0.5060341358184814),
 ('herself', 0.49865785241127014)]

#### 2.6.3 (Good + Best)/2 = Better ?

In [33]:
model.wv.word_vec('Good')

KeyError: "Key 'Good' not present"

In [34]:
what=(model.wv.word_vec('good')+model.wv.word_vec('best'))/2

In [35]:
model.wv.similar_by_vector(what)

[('best', 0.8559709787368774),
 ('good', 0.7881002426147461),
 ('better', 0.6068601608276367),
 ('bad', 0.5423668026924133),
 ('fun', 0.5169402360916138),
 ('poor', 0.4934327006340027),
 ('little', 0.49075865745544434),
 ('helpful', 0.4829798936843872),
 ('finest', 0.47662466764450073),
 ('what', 0.47124817967414856)]

In [None]:
model.wv.most_similar(positive=['good', 'best'])

[('better', 0.588021993637085),
 ('bad', 0.5557962656021118),
 ('fair', 0.5461308360099792),
 ('fun', 0.5098727941513062),
 ('little', 0.4782398045063019),
 ('leisure', 0.4754738211631775),
 ('greatest', 0.4739540219306946),
 ('worst', 0.46178722381591797),
 ('excellent', 0.4616394340991974),
 ('practical', 0.4583337903022766)]

### 3.1 Sentence/Document Vector

In [9]:
# !pip install spacy or conda install -c conda-forge spacy for M1 Macs
# !python -m spacy download en_core_web_sm
import spacy

In [2]:
# Load English tokenizer, tagger, parser and NER
import en_core_web_sm
nlp = en_core_web_sm.load()

# Process whole documents
doc1 = nlp("I love you!")
doc2 = nlp("I like you!")
doc3 = nlp("I hate you!")

ModuleNotFoundError: No module named 'en_core_web_sm'

In [11]:
print(doc1.vector.shape)
print(doc1.vector)

(96,)
[-0.09983218  0.35353643  0.5265776  -2.0017223   1.4662154  -0.33170778
  1.0827445   0.74185747  0.9684267   0.15458488 -0.83901423 -0.31761074
  1.1670883   1.6654768  -0.30134588 -1.393422   -0.5176586  -0.44548252
  0.64707303 -0.56519    -0.09047961 -0.969736   -0.44712326 -0.91941214
 -1.680759   -0.18274733  0.32576013 -2.6253097  -0.12723407  1.2189727
  0.46241376 -0.96291625  2.1276584   1.8101883   1.7117918  -2.8340235
 -0.4939808  -0.21001339  0.29979473 -1.0554987  -0.12981123  0.0149786
  0.43961585 -1.8888178   1.8597324  -0.53544307  1.5854046  -0.6399481
 -2.6180916   1.1852651  -1.167548   -0.86542356  0.54269934 -0.6894389
 -1.5810661  -1.0903887  -0.35865986  0.41884246 -0.2837396   2.520783
  0.6547807  -0.6294886   0.58366    -0.8731143  -1.3666099  -0.71271324
 -0.6313154  -0.38491994  0.41647953  0.07151747 -0.38954067  1.3536111
 -0.02793354  0.21484032 -0.19864672 -1.3100643   1.9951842   0.8023096
  1.7919707  -0.4781978  -0.8473945   0.06738667 -0.14

In [12]:
print(doc1.similarity(doc2))
print(doc1.similarity(doc3))

0.7934198520902852
0.9119906074696802


  "__main__", mod_spec)
  "__main__", mod_spec)


## 4 ChatBots

### 4.2 BackEnd

#### 4.2.1 ChatterBot: Write your own QAs

In [13]:
#!pip install chatterbot chatterbot_corpus
#!python -m spacy link en_core_web_sm en
import chatterbot
from chatterbot import ChatBot
from chatterbot.trainers import ListTrainer

In [14]:
chatbot = ChatBot("Psychoinformatics")

conversation = [
    "Hello",
    "Hi there!",
    "How are you doing?",
    "I'm doing great.",
    "That is good to hear",
    "Thank you.",
    "You're welcome.",
    "帥喔!",
    "帥只有一個字，卻跟了我一輩子~"
]

trainer = ListTrainer(chatbot)

trainer.train(conversation)

List Trainer: [####################] 100%


In [21]:
print(chatbot.get_response("Hi there"))
print(chatbot.get_response("帥喔!"))

How are you doing?
帥只有一個字，卻跟了我一輩子~
