# Getting Started

# Classifying Text

In [1]:
import sys
import csv
import random
import string
import pandas as pd
import numpy as np
import nltk
import sklearn.model_selection as modsel
import sklearn.naive_bayes as nb

DEBUGGING = False

try:
    df = pd.read_csv( '../data/SMSSpamCollection.csv', na_filter=False, encoding='latin-1' )
except Exception as x:
    print('error> reading data file' + str( x ))
    sys.exit()
M = len( df.values )
N = len( df.columns )
if ( DEBUGGING ):
    print('number of instances = ' + str( M ))
    print('number of columns = ' + str( N ))

ham = 0
spam = 0
for i in range (M):
    if(df.values[i][0] =='ham'):
        ham += 1
    elif(df.values[i][0] == 'spam'):
        spam += 1;
print("ham number is " + str(ham))
print("spam number is " + str(spam))

msgs   = []
y_raw = []
for rec in df.values:
    try:
        label = rec[0].strip()
        msg   = rec[1].strip()
        words = nltk.word_tokenize(msg)
        msgs.append(msg)
        y_raw.append(label)
    except Exception as x:
        print('error> parsing raw data file: ' + str( x ))
        
if ( DEBUGGING ):
    print('msgs shape = ', np.shape( msgs ), 'y_raw shape = ', np.shape( y_raw ))
    
msgs_train,msgs_test,y_train,y_test = modsel.train_test_split(msgs,y_raw,test_size = 0.5)
if ( DEBUGGING ):
    print('size of data sets:')
    print('msgs:   training={} test={}'.format( len( msgs_train ), len( msgs_test )))
    print('labels: training={} test={}'.format( len( y_train ), len( y_test )))

#--STEP 3: CREATE BALANCED TRAINING SET
ham = []
spam = []
numerr = 0
for (msg, label) in zip(msgs_train,y_raw):
    if(label == 'ham'):
        ham.append(msg)
    elif(label == 'spam'):
        spam.append(msg)
    else:
        numerr += 1
if ( DEBUGGING):
    print('number of: ham={} spam={} errors={} total={}'.format( len( ham ), len( spam ), numerr, ( len( ham ) + len( spam ) + numerr )))
    
while(len(spam) < len(ham)):
    i = random.randint(0,len(spam)-1)
    spam.append(spam[i])
    
if ( DEBUGGING ):
    print('balanced! number of: ham={} spam={}'.format( len(ham), len(spam) ))
    
msgs_bal_train = []
y_bal_train = []
for i in range(len(spam)):
    msgs_bal_train.append(spam[i])
    y_bal_train.append("spam")
for i in range(len(ham)):
    msgs_bal_train.append(ham[i])
    y_bal_train.append("ham")
    
if ( DEBUGGING ):
    print('size of balanced training: msgs={} labels={}'.format( len( msgs_bal_train ), len( y_bal_train )))
    
#STEP 4 CHARACTERISE THE TRAINING SET
# attributes:
# num_words
# msg_len
# num_digits
# num_punct
# num_upper
num_attributes = 5
X = []

for msg in (msgs_bal_train):
    num_digits = 0
    num_punct  = 0
    num_upper  = 0
    
    for ch in msg:
        if(ch.isdigit()):
            num_digits += 1
        elif(ch in string.punctuation):
            num_punct += 1
        elif(ch.isupper()):
            num_upper += 1
    msg_len = len(msg)
    words = nltk.word_tokenize(msg)
    num_words = len(words)
    X.append((num_words, msg_len, num_digits, num_punct, num_upper))
    
X = np.array(X)
y_bal_train  = np.array(y_bal_train)

#--STEP 5: TRAIN CLASSIFIER

clf = nb.MultinomialNB()
clf.fit(X, y_bal_train)

#test classfier
num_TP = 0
num_TN = 0
num_FP = 0
num_FN = 0

# attributes:
# num_words
# msg_len
# num_digits
# num_punct
# num_upper

A = []
for (msg,label) in zip(msgs_test,y_test):
#     A = []
    words = nltk.word_tokenize(msg)
    num_words = len(words)
    
    num_digits = 0
    num_punct  = 0
    num_upper  = 0
    msg_len = len(msg)
    for ch in msg:
        if(ch.isdigit()):
            num_digits +=1
        if(ch.isupper()):
            num_upper += 1
        if(ch in string.punctuation):
            num_punct += 1
    A.append((num_words, msg_len, num_digits, num_punct, num_upper))


#     pred_label = clf.predict(A)
#     if ( label == 'spam' ):
#         if ( pred_label == 'spam' ):
#             num_TP += 1
#         elif ( pred_label == 'ham' ):
#             num_FN += 1
#         else:
#             num_err += 1
#     elif ( label == 'ham' ):
#         if ( pred_label == 'spam' ):
#             num_FP += 1
#         elif ( pred_label == 'ham' ):
#             num_TN += 1
#         else:
#             num_err += 1
#     else:
#         num_err += 1

y_hat = clf.predict(A)
for i in range (len(y_hat)):
    if(y_test[i] == 'spam'):
        if(y_hat[i] == 'spam'):
            num_TP += 1
        else:
            num_FN += 1
    else:
        if ( y_hat[i] == 'spam' ):
            num_FP += 1
        else:
            num_TN += 1
            
print('TP={} FP={} TN={} FN={}'.format( num_TP, num_FP, num_TN, num_FN ))
precision = num_TP / float( num_TP + num_FP )
recall = num_TP / float( num_TP + num_FN )
f1 = 2 * precision * recall / ( precision + recall )
print('precision={} recall={} f1={}'.format( precision, recall, f1 ))

ham number is 4824
spam number is 747
TP=47 FP=2287 TN=155 FN=297
precision=0.020137103684661525 recall=0.13662790697674418 f1=0.0351008215085885


# Clustering Text

## setp 1-4

In [2]:
import textblob
import requests

TOP_MOST = 10

with open("../data/el-owl-cat.txt") as f:
    raw_verse = f.read()
f.close()

verse = textblob.TextBlob(raw_verse)
# print(verse.words)
print(len(verse.words))
# print(verse.word_counts)

# top most frequent
sorted_words = sorted(verse.word_counts,key=verse.word_counts.__getitem__, reverse=True)
for (i ,w ) in zip(range(TOP_MOST),sorted_words):
    print('{} = {}, {}'.format(( i+1) , w, verse.word_counts[w] ))
    
#get the stopwords
stopwords = requests.get("https://raw.githubusercontent.com/fozziethebeat/S-Space/master/data/english-stop-words-large.txt" ).content.decode("UTF-8").split("\n")

words = {}
for key in sorted_words:
    if (len(words) >10):
        break;
    if(key not in stopwords):
        words[key] = verse.word_counts[key]
        
print('{} most frequent words, after removing stopwords:'.format( TOP_MOST ))
for ( i, w ) in zip( range( TOP_MOST ), words ):
    print('{} = {}, {}'.format(( 1+i) , w, verse.word_counts[w] ))
    


221
1 = the, 20
2 = a, 13
3 = and, 8
4 = they, 7
5 = of, 7
6 = you, 7
7 = to, 6
8 = pussy, 5
9 = are, 5
10 = in, 4
10 most frequent words, after removing stopwords:
1 = pussy, 5
2 = ring, 4
3 = nose, 4
4 = moon, 4
5 = owl, 3
6 = beautiful, 3
7 = married, 2
8 = day, 2
9 = end, 2
10 = hand, 2


## step 5-7

In [16]:
import textblob
import requests
import numpy as np

TOP_MOST = 10

DATA_DIR = '../data/'
DATA_FILES = [ 'a-3kittens.txt', 'bp-tom-kitten.txt', 'el-owl-cat.txt', 'rc-cat-fiddle.txt', 'rk-cat.txt' ]

stop_words = requests.get("https://raw.githubusercontent.com/fozziethebeat/S-Space/master/da\
ta/english-stop-words-large.txt").content.decode("UTF-8").split("\n")

freq_words = [{} for i in range (len(DATA_FILES))]
for i in range(len(DATA_FILES)):
    with open(DATA_DIR+DATA_FILES[i]) as f:
        raw_verse = f.read()
    f.close()
    verse = textblob.TextBlob(raw_verse)
    sorted_words = sorted(verse.word_counts,key = verse.word_counts.__getitem__,reverse= True)
    
    for key in sorted_words:
        if(len(freq_words[i])==10):
            break;
        if(key in stop_words):
            continue;
        freq_words[i][key] = verse.word_counts[key]

        
#step 6 Boolean term-document
terms = []
for i in range (len(DATA_FILES)):
    for key in freq_words[i]:
        if(key not in terms):
            terms.append(key)
            
#empty matrix
termdoc = [ [0 for key in range(len(terms))] for j in range (len(DATA_FILES))]

#build the matrix
for i in range (len(DATA_FILES)):
    for j in range(len(terms)):
        if(terms[j] in freq_words[i]):
            termdoc[i][j] = 1

#step 7 Euclidean distance
def euc_dist(j0,j1,num_t,termdoc):
    dist = 0.0
    for i in range (num_t):
        dist += np.square(termdoc[j0][i] - termdoc[j1][i])
    dist = np.sqrt(dist)
    return dist

min_d = euc_dist(0,1,len(terms),termdoc)
min_0 = 0
min_1 = 1
for j0 in range(len(DATA_FILES)):
    for j1 in range(j0+1,len(DATA_FILES)):
        dist = euc_dist(j0,j1,len(terms),termdoc)
        print('Euclidean distance from {} to {} = {}'.format( j0, j1, dist ))
        if(dist< min_d):
            min_d = dist
            min_0 = j0
            min_1 = j1
print('{},{}, dist is {}'.format(min_0,min_1,min_d))

Euclidean distance from 0 to 1 = 4.0
Euclidean distance from 0 to 2 = 4.47213595499958
Euclidean distance from 0 to 3 = 4.47213595499958
Euclidean distance from 0 to 4 = 4.47213595499958
Euclidean distance from 1 to 2 = 4.47213595499958
Euclidean distance from 1 to 3 = 4.47213595499958
Euclidean distance from 1 to 4 = 4.47213595499958
Euclidean distance from 2 to 3 = 4.242640687119285
Euclidean distance from 2 to 4 = 4.47213595499958
Euclidean distance from 3 to 4 = 4.0
0,1, dist is 4.0


45

## step 8-10

In [38]:
import textblob
import requests
import numpy as np

TOP_MOST = 10
DATA_DIR = '../data/'
DATA_FILES = [ 'a-3kittens.txt', 'bp-tom-kitten.txt', 'el-owl-cat.txt', 'rc-cat-fiddle.txt', 'rk-cat.txt' ]

#get the stop words
stopwords = requests.get("https://raw.githubusercontent.com/fozziethe\
beat/S-Space/master/data/english-stop-words-large.txt").content.decode("UTF-8").split("\n")

#get all the frequent words
freq_words = [{} for i in range(len(DATA_FILES))]

for i in range (len(DATA_FILES)):
    #open the file
    with open(DATA_DIR+DATA_FILES[i]) as f:
        raw_verse = f.read()
        verse = textblob.TextBlob(raw_verse)
    f.close()
    #get the frequent word in file
    sorted_words = sorted(verse.word_counts,key= verse.word_counts.__getitem__, reverse=True)
    for key in sorted_words:
        if(len(freq_words[i]) == 10):
            break
        if(key in stopwords):
            continue;
        if(key not in freq_words[i]):
            freq_words[i][key] = verse.word_counts[key]

#build all the terms
terms = []
for i in range(len(DATA_FILES)):
    for key in freq_words[i]:
        if(key not in terms):
            terms.append(key)

# step 8 build the frequency term-document matrix
termdoc = [[0 for i in range(len(terms))]  for i in range(len(DATA_FILES))]
for i in range(len(DATA_FILES)):
    for j in range(len(terms)):
        if(terms[j] in freq_words[i]):
            termdoc[i][j] = freq_words[i][terms[j]]

#step 9 Euclidean distance 
def euc_dist(j0,j1,termdoc):
    length = len(termdoc[0])
    dist = 0.0
    for i in range (length):
        dist += np.square(termdoc[j0][i] - termdoc[j1][i])
    dist = np.sqrt(dist)
    return dist

min_d = euc_dist(0,1,termdoc)
min_0 = 0
min_1 = 1 
for i in range(len(DATA_FILES)):
    for j in range(i+1,len(DATA_FILES)):
        dist = euc_dist(i,j,termdoc)
        print('Euclidean distance from {} to {} = {}'.format( i, j, dist ))
        if(dist< min_d):
            min_d = dist
            min_0 = i
            min_1 = j
        
print('{},{}, dist is {}'.format(min_0,min_1,min_d))


# step 10 Cosine similarity  with frequency term-document matrix
def cos_sim( j0, j1, termdoc ):
    num_t = len(termdoc[0])
    sim_top = 0
    sim_bottom_0 = 0
    sim_bottom_1 = 0
    for t in range (num_t):
        sim_top += termdoc[j0][t] * termdoc[j1][t]
        sim_bottom_0 += np.square( termdoc[j0][t] )
        sim_bottom_1 += np.square( termdoc[j1][t] )
    sim = sim_top / ( np.sqrt( sim_bottom_0 )) * ( np.sqrt( sim_bottom_1 ))
    return sim

max_d  = cos_sim( 0, 1,  termdoc )
max_j0 = 0
max_j1 = 1

for i in range(len(DATA_FILES)):
    for j in range(i+1,len(DATA_FILES)):
        dist = cos_sim(i,j,termdoc)
        print('cos similarity  from {} to {} = {}'.format( i, j, dist ))
        if(dist> max_d):
            max_d = dist
            max_j0 = i
            max_j1 = j
        
print('{},{}, cos similarity is {}'.format(max_j0,max_j1,max_d))


Euclidean distance from 0 to 1 = 74.15524256584965
Euclidean distance from 0 to 2 = 73.58668357794092
Euclidean distance from 0 to 3 = 72.94518489934754
Euclidean distance from 0 to 4 = 168.5734261382855
Euclidean distance from 1 to 2 = 24.413111231467404
Euclidean distance from 1 to 3 = 22.40535650240808
Euclidean distance from 1 to 4 = 153.6164053739053
Euclidean distance from 2 to 3 = 10.583005244258363
Euclidean distance from 2 to 4 = 152.3679756379273
Euclidean distance from 3 to 4 = 151.36710342739602
2,3, dist is 10.583005244258363
cos similarity  from 0 to 1 = 45.224652749107065
cos similarity  from 0 to 2 = 0.0
cos similarity  from 0 to 3 = 0.0
cos similarity  from 0 to 4 = 0.0
cos similarity  from 1 to 2 = 0.0
cos similarity  from 1 to 3 = 0.0
cos similarity  from 1 to 4 = 0.0
cos similarity  from 2 to 3 = 1.3942471924464683
cos similarity  from 2 to 4 = 0.0
cos similarity  from 3 to 4 = 4426.986516286741
3,4, cos similarity is 4426.986516286741


In [34]:
#--
# p5.py
# # finds two closest documents using frequency term-document matrix and cosine similarity
# @author: letsios, sklar
# @created: 28 Jan 2021
#--

import textblob
import requests
import numpy as np

DEBUGGING = False

TOP_MOST = 10

DATA_DIR = '../data/'
DATA_FILES = [ 'a-3kittens.txt', 'bp-tom-kitten.txt', 'el-owl-cat.txt', 'rc-cat-fiddle.txt', 'rk-cat.txt' ]


#--
# cos_sim()
# computes and returns cosine similarity between two vectors j0 and j1 in termdoc matrix
#--
def cos_sim( j0, j1, num_t, termdoc ):
    #print 'j0: ',termdoc[j0][:]
    #print 'j1: ',termdoc[j1][:]
    sim_top = 0
    sim_bottom_0 = 0
    sim_bottom_1 = 0
    for t in range( num_t ):
        sim_top += termdoc[j0][t] * termdoc[j1][t]
        sim_bottom_0 += np.square( termdoc[j0][t] )
        sim_bottom_1 += np.square( termdoc[j1][t] )
    sim = sim_top / ( np.sqrt( sim_bottom_0 )) * ( np.sqrt( sim_bottom_1 ))
    return( sim )


#-----
# MAIN
#-----

# initalise list of dictionaries of most frequent words in each verse
freq_words = [ dict() for j in range( len( DATA_FILES )) ]

# get list of "stopwords"
print('fetching list of stopwords...')
stopwords = requests.get( "https://raw.githubusercontent.com/fozziethebeat/S-Space/master/data/english-stop-words-large.txt" ).content.decode('utf-8').split( "\n" )
print('number of stopwords = ' + str( len( stopwords )))
if ( DEBUGGING ):
    print('stopwords=', stopwords)

#-loop to read in the verses
for ( j, myfile ) in zip( range( len( DATA_FILES )), DATA_FILES ):
    with open( DATA_DIR+myfile ) as f:
        raw_verse = f.read()
    f.close()
    if ( DEBUGGING ):
        print('raw_verse=', raw_verse)
    print('file=', myfile)

    # initialise a TextBlob object using the verse
    # (this will decode any UTF-8 characters in the file)
    verse = textblob.TextBlob( raw_verse) 
    if ( DEBUGGING ):
        print('verse=', verse)

    # create a dictionary of words for this verse, removing the stopwords
    words = {}
    for w in verse.word_counts:
        if ( w not in stopwords ):
            words[w] = verse.word_counts[w]
    if ( DEBUGGING ):
        print(words)

    # sort the words in order to find the TOP_MOST most frequent
    sorted_words = sorted( words, key=words.__getitem__, reverse=True )
    for ( i, w ) in zip( range( TOP_MOST ), sorted_words ):
        freq_words[j][w] = verse.word_counts[w]
        print(i, w, verse.word_counts[w])

# now, the freq_words list contains a dictionary of the TOP_MOST most
# frequent words in each data file, and the word frequencies
if ( DEBUGGING ):
    print(freq_words)

# let's use this to create a term-document matrix
# start by getting a unique list of terms
terms = []
for j in range( len( DATA_FILES )):
    for w in freq_words[j]:
        if ( w not in terms ):
            terms.append( w )
if ( DEBUGGING ):
    print('terms=', terms)

# now we can use this to create a binary term-document matrix
termdoc = [[0 for t in range( len( terms ))] for j in range( len( DATA_FILES ))]
for j in range( len( DATA_FILES )):
    for t in range( len( terms )):
        if ( terms[t] in freq_words[j] ):
            termdoc[j][t] = freq_words[j][ terms[t] ]

# print term-document matrix
if ( DEBUGGING ):
    for t in range( len( terms )):
        print( terms[t], end='' )
    print()
    for j in range( len( DATA_FILES )):
        print( DATA_FILES[j], end='' )
        for t in range( len( terms )):
            print( termdoc[j][t], end='' )
        print()

# compute pairwise cosine similarity between document vectors
max_d  = cos_sim( 0, 1, len( terms ), termdoc )
max_j0 = 0
max_j1 = 1
for j0 in range( len( DATA_FILES )):
    for j1 in range( j0+1, len( DATA_FILES )):
        d = cos_sim( j0, j1, len( terms ), termdoc )
        print('cos similarity from {} to {} = {}'.format( j0, j1, d ))
        if ( d > max_d ):
            max_d = d
            max_j0 = j0
            max_j1 = j1
print('closest two verses by Cosine similarity are: {} ({}) and {} ({})'.format( DATA_FILES[max_j0], max_j0, DATA_FILES[max_j1], max_j1 ))
print('vectors=')
print(termdoc[max_j0][:])
print(termdoc[max_j1][:])


fetching list of stopwords...
number of stopwords = 593
file= a-3kittens.txt
0 miew 62
1 purr 32
2 mittens 14
3 kittens 13
4 began 4
5 mammy 4
6 dear 4
7 lost 3
8 cry 3
9 pie 3
file= bp-tom-kitten.txt
0 tom 10
1 pat 10
2 moppet 9
3 mittens 6
4 kitten 6
5 puddle-duck 6
6 kittens 5
7 tabitha 5
8 wall 5
9 pit 5
file= el-owl-cat.txt
0 pussy 5
1 ring 4
2 nose 4
3 moon 4
4 owl 3
5 beautiful 3
6 married 2
7 day 2
8 end 2
9 hand 2
file= rc-cat-fiddle.txt
0 diddle 2
1 hey 1
2 cat 1
3 fiddle 1
4 cow 1
5 jumped 1
6 moon 1
7 dog 1
8 laughed 1
9 fun 1
file= rk-cat.txt
0 wild 104
1 cat 75
2 woman 46
3 cave 40
4 dog 30
5 enemy 25
6 woods 21
7 man 20
8 fire 19
9 horse 15
cos similarity from 0 to 1 = 45.224652749107065
cos similarity from 0 to 2 = 0.0
cos similarity from 0 to 3 = 0.0
cos similarity from 0 to 4 = 0.0
cos similarity from 1 to 2 = 0.0
cos similarity from 1 to 3 = 0.0
cos similarity from 1 to 4 = 0.0
cos similarity from 2 to 3 = 1.3942471924464683
cos similarity from 2 to 4 = 0.0
cos simil