In [57]:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.metrics
from nltk.tokenize import word_tokenize
from bisect import bisect
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/bart/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Metryki w przestrzeni napisów

## Bartosz Kucharz

In [2]:
def bag_of_words(data, stopwords=None):
    vectorizer = CountVectorizer(stop_words=stopwords)
    bag = vectorizer.fit_transform(data)
#     print(vectorizer.get_feature_names())
#     print(vectorizer.vocabulary_)
#     print(bag.toarray())
    return bag.toarray()

## Implementacja "metryk"

## Euklidesowa
$$d(\textbf{x}, \textbf{y})=\sqrt{(x_1-y_1)^2+...+(x_n-y_n)^2} $$

In [5]:
def euclidean_metric(x, y):
    return np.sqrt(np.sum((x - y) ** 2))

## Cosinusowa

$$
d(\textbf x, \textbf y) = 1 - \frac{ \sum_{i=1}^{n} x_iy_i}{\sqrt{\sum_{i=1}^{n}{x_i^2}} \sqrt{\sum_{i=i}^{n}y_i^2}}
$$

In [6]:
def cosine_metric(x, y):
    r =  1 - ((x@y.T) / (np.sqrt(x@x.T) * np.sqrt(y@y.T)))
    if np.abs(r) < 1e-10:
        r = 0.
    return r

# LCS

"Metryka" LCS między dwoma stringami jest rówa odległości edycyjnej między tymi stringami.

In [7]:
def lcs_metric(x, y):
    xx, yy = lines[int(x[0])].lower(), lines[int(y[0])].lower()
    
    def lcs(x,y):
        ranges = []
        ranges.append(len(y)) 
        y_letters = list(y)
        for i in range(len(x)):
            positions = [j for j, l in enumerate(y_letters) if l == x[i]]
            positions.reverse()
            for p in positions:
                k = bisect(ranges, p)
                if(k == bisect(ranges, p-1)):
                    if(k < len(ranges) - 1):
                        ranges[k] = p
                    else:
                        ranges[k:k] = [p]
        return len(ranges) - 1
    res =  lcs(xx, yy)
#     print(xx)
#     print(yy)
#     print(max(len(xx), len(yy)) - res, end='\n\n_____________\n')
    return 1 - res/max(len(xx), len(yy))

# Ocena jakości klasteryzacji

## Indeks Daviesa-Bouldina

In [8]:
def davies_bouldin_index(X, cluster):
    labels = cluster.labels_
    return sklearn.metrics.davies_bouldin_score(X, labels)

## Silhouette Coefficient

In [9]:
def silhouette(X, cluster, metric_function):
    labels = cluster.labels_
    return sklearn.metrics.silhouette_score(X, labels, metric=metric_function)

# Klasteryzacja

In [10]:
def cluster(X, metric_function, epsilon=1):
#     data = np.array(lines)
    db = DBSCAN(metric=metric_function, min_samples=1, eps=epsilon, n_jobs=-1).fit(X)
    return db

In [11]:
def print_clusters(lines, db, new_file_name='result.txt'):
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    unique_labels = set(labels)
    clusters = [[] for _ in unique_labels]
    for label in unique_labels:
        
        class_member_mask = labels == label

        clusters[label].append(lines[class_member_mask & core_samples_mask])
    
    
    with open(new_file_name, 'w') as file:
        file.write(f'Number of clusters: {len(unique_labels)}\n\n')
        print(f'Number of clusters: {len(unique_labels)}\n')
        for cluster in clusters:
            for name in cluster:
                file.write(str(name)+'\n')
                print(name)
            file.write("\n"+"#"*15+"\n\n")
            print("\n"+"#"*15+"\n")
    

Ze względu na długi czas obliczeń za dane bierzemy 500 losowych lini z pliku lines.txt.

In [19]:
def get_stop_list(lines, n=10):
    cv = CountVectorizer()
    cv.fit(lines)
    bag = cv.transform(lines)
    sum_words = bag.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in cv.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    return [word for word, count in words_freq[:n]]

In [25]:
with open('lines.txt', 'r') as file:
    text = file.readlines()

n = 500
lines = np.random.choice(np.array(text), n, replace=False)
with open(f'lines_{n}.txt', 'w') as file:
    file.writelines(lines)
X = bag_of_words(lines)

# Wyniki

## Bez użycia stop listy

### "Metryka" euklidesowa

In [26]:
c1 = cluster(X, euclidean_metric, epsilon=1)

In [27]:
print_clusters(lines, c1, 'result_euclidean.txt')

Number of clusters: 489

['HANARO T&S CO LTD (BUSAN)RM 502 TRADE CENTER87-7, CHUNGANG-DONG 4-GA CHUNG-KU 6BUSAN KOREA, SOUTH\n']

###############

['PANTOS LOGISTICS HCMC 111 PASTEUR ST, BEN NGHE WD, DIST ROOM 801 8TH FL, SAILING BLDG VIETNA VIETNAM HO CHI MINH CITY (SAIGON) 65 VN\n']

###############

['AGILITY LOGISTICS (SHANGHAI) LTD Building No. 9, IBP 280 Linhong Rd, Changning District Shanghai, 200335CHINA\n']

###############

['OPENLOG SP.Z.O.O. UL.HUTNICZA 3 81-212 GDYNIA KAROLINA JADESZKO TEL.:+48 58 699 12 43\n']

###############

['T.S.TRANSPED SP.ZO.O. UL. KWIATKOWSKIEGO 60 81-127 GDYNIA POLAND TEL:(+48 58)62 13671 CONTRACT PERSON: MARZENA PRZYBYSZ\n']

###############

['IKEA DISTRIBUTION SERVICES POLAND S.A. UL.M. GRABINSKIEGO 50, JAROSTYPL-97310 MOSZCZENICA POLAND\n']

###############

['JIANGXI INTL LOGISTICS CO LTD2 NANJING WEST RD9F330006 NANCHANG CHINA\n']

###############

["FMG SHIPPING&FORWARDING LTD. 190020,SAINT PETERSBURG, LIFLYANDSKAYASTR.6,LIERA A',BUILDING 

['SAUDI BASIC INDUSTRIES CORPORATIONc/o Nijhof-Wassink Sp. Z.o.o. ul. Holenderska 3 99-300 Kutno Poland Tel:  +48.24.251.0700  Fax: +48.24.251.0701\n']

###############

['GUANGAN NEW FURNITURE CO.,LTD  NO.1CHAOSHAN INDUSTRIAL PARK,XIQIAO TOWN,NANHAI DISTRICT,FOSHAN CITY OF P.R.C.\n']

###############

['THAI UNION MANUFACTURING COMPANY LIMITED 979/13-16,M.FLOOR,SM TOWER, PHAHOLYOTHIN ROAD, SAMSENNAI,PHAYATHAI, BANGKOK,10400,THAILAND\n']

###############

['SEJUNG SHIPPING CO LTDVDRWOOSUNG BLDG 9F35 BUKCHANG-DONG JUNG-GU 100-180SEOUL KOREA, SOUTH\n']

###############

['MITRANS LTD., 119607, RUSSIA, MOSCOW, MITCHURINSKIY PR.27, KORP.3  TEL:+7-495-642-3669, E-MAIL:MITRANS.LTD@GMAIL.COM\n']

###############

['TO ORDER OF DHL GLOBAL FORWARDING AS AGENT FOR DANMAR LINES BRANCH OFFICE OOO DHL LOGISTICS BUSINESS CENTER "PULKOVO SKY" . OFFICE B 602, KORPUS B, 2\n']

###############

['TO ORDER OF MARINO ENTERPRISES CORPADDRESS: 40,VILLA CACERES, PANAMACITY, P.O. BOX 0819-00626,EL DORADO, REP


###############

['WENZHOU TOP TRADING CO.,LTD RM 1102JINRIJIAYUAN TANGJIAQIAO ROAD,WENZHOU 325000 CHINA\n']

###############

['DHL GLOBAL FORWARDING SP. Z O.O. DOSTAWCZA 17 93-231 LODZ POLAND POLAND PL POLAND TEL:42 677 33 53 /3413. FAX:42 677 34 82\n']

###############

['1.OOO SILMAR SPB 198035, RUSSIA, STPETERSBURG UL. DVINSKAYA 16/2 CONTACT PERSON: YASHIN ALEXEY PHONE: +7812 495 8527---\n']

###############

['OOO "KLEYA" PR. ENGELSA D. 16 K.2 LIT. A 194156, ST.PETERSBURG, RUSSIATEL +7 (812) 329-42-62 PASHKEVICHANNA\n']

###############

['DSV AIR&SEA SP.Z O.O. UL.J.WISNIEWSKIEGO 31 81-183 GDYNIA POLAND PHONE:+48(0)58 621 39 26 ---\n']

###############

['EVERLAST ASTER LIMITED\n']

###############

['PACTRA INTERNATIONAL(POLAND) SP. Z.O.O. UL. SMOLENSKIEGO 4/18 01-698 WARSZAWA POLAD CONTACT PERSON : MS.PAULINA++\n']

###############

['TROPIC.P.H   KOCZARGI NOWE,  UL. BUGAJ 11,  05-082 STARE BABICE, POLAND\n']

###############

['HUSQVARNA ZENOAH CHANGZHOU MACHINERY CO., LTD NO

['WENZHOU UNION FASHION CO.,LTD ADD:2WENZHOU AVENUE,325011, WENZHOU ZHEJIANG,CHINA\n']

###############

['SHENGZHOU JINYU IMP&EXP.CO.,LTD NO.5F. NO.108 BEIZHI STREET SHENGZHOUCITY ZHEJIANG CHINA\n']

###############

['WEISS-ROHLIG CHINA CO., LTD. NINGBOBRANCH RM 8608, HOWARD JOHNSON OFFICE BLDG No.230 LIUTING STR. 315010NINGBO CHINA-\n']

###############

['FMG SHIPPING AND FORWARDING LTD 190020,SAINT PETERSBURG, LIFLYANDSKAYASTR.,6,LITERA"A",BUILDING 31H, OFFICE+++\n']

###############

['XIAMEN JAGUAR GLOBAL TRADING CO LTD7 HAITIAN RD 361006 XIAMEN CHINA XIAMEN 35 CN\n']

###############



In [28]:
d_i = davies_bouldin_index(bag_of_words(lines), c1)
s_c = silhouette(X, c1, euclidean_metric)
print(f'Indeks Daviesa-Bouldina: {d_i}')
print(f'Silhouette Coefficient: {s_c}')

Indeks Daviesa-Bouldina: 0.14137898109098318
Silhouette Coefficient: 0.03117305885474193


### "Metryka" cosinusowa

In [29]:
c2 = cluster(X, cosine_metric, epsilon=0.3)

In [30]:
print_clusters(lines, c2, 'result_cosine.txt')

Number of clusters: 441

['HANARO T&S CO LTD (BUSAN)RM 502 TRADE CENTER87-7, CHUNGANG-DONG 4-GA CHUNG-KU 6BUSAN KOREA, SOUTH\n']

###############

['PANTOS LOGISTICS HCMC 111 PASTEUR ST, BEN NGHE WD, DIST ROOM 801 8TH FL, SAILING BLDG VIETNA VIETNAM HO CHI MINH CITY (SAIGON) 65 VN\n']

###############

['AGILITY LOGISTICS (SHANGHAI) LTD Building No. 9, IBP 280 Linhong Rd, Changning District Shanghai, 200335CHINA\n']

###############

['OPENLOG SP.Z.O.O. UL.HUTNICZA 3 81-212 GDYNIA KAROLINA JADESZKO TEL.:+48 58 699 12 43\n']

###############

['T.S.TRANSPED SP.ZO.O. UL. KWIATKOWSKIEGO 60 81-127 GDYNIA POLAND TEL:(+48 58)62 13671 CONTRACT PERSON: MARZENA PRZYBYSZ\n']

###############

['IKEA DISTRIBUTION SERVICES POLAND S.A. UL.M. GRABINSKIEGO 50, JAROSTYPL-97310 MOSZCZENICA POLAND\n'
 'IKEA DISTRIBUTION SERVICES S.A. UL.M.GRABINSKIEGO 50, JAROSTY PL-97310MOSZCZENICA POLAND\n'
 'IKEA Distribution Services S.A. UL.M.Grabinskiego 50, Jarosty PL-97310Moszczenica POLAND\n']

###############


In [32]:
d_i = davies_bouldin_index(bag_of_words(lines), c2)
s_c = silhouette(X, c2, cosine_metric)
print(f'Indeks Daviesa-Bouldina: {d_i}')
print(f'Silhouette Coefficient: {s_c}')

Indeks Daviesa-Bouldina: 0.5835052493502155
Silhouette Coefficient: 0.11838535688541969


### "Metryka" LCS

Ze względu na bardzo długi czas wykonywania klastryzacji dla metryki LCS, zmniejszamy dane wejściowe do 200 lini.

In [33]:
n = 200
lcs_lines = np.random.choice(lines, n, replace=False)
X_LCS = np.arange(lcs_lines.shape[0]).reshape(-1, 1)


c3 = cluster(X_LCS, lcs_metric, epsilon=0.5)

In [34]:
print_clusters(lcs_lines, c3, 'result_lcs.txt')

Number of clusters: 133

['HANGZHOU TRUST APPAREL CO.,LTD ROOM401-402 SONGDU IND.MANSION, NO.229HANGHAI ROAD, HANGZHOU, CHINA\n']

###############

['OOO "Maguro" Veernaya str., 3, build. 2 119501, Moscow, Russia\n']

###############

['SCHENKER SP. Z O.O. 1235531UL. ORDONA 2A 01- 237 WARSZAWA PHONE. +48 58 621 37 6 2FAX. +48 58 621 37 37\n'
 'TO ORDER OF PANALPINA CIS HELSINKIOY AS AGENT FOR AND ON BEHALF OF PANTAINER LTD\n'
 'LPP S.A. 80-769 GDANSK,UL. LAKOWA 39/44\n'
 'TO ORDER OF MAPLE FOREST INC. 50, 30-TH FLOOR, PLAZA BANCO  GENERAL, AQUILINO DE LA GUARDIA STREET, PANAMA,  REPUBLIC OF PANAMA\n']

###############

['NINGBO YONGHUA RESIN CO.,LTD 555 FENGMING ROAD, NINGBO CHEMICAL INDUSTRIAL ZONE, CHINA\n'
 'POLIMEX FORWARDING INC27 SELBY RDBRAMPTONON L6W 1K5\n'
 'AMCO CARGO SYSTEM LTDVDR315-71 JEIL BLDG PO BOX RM408SUNGSOO-DONG 2KA, SUNGDONG-GU 133-1SEOUL KOREA, SOUTH\n'
 'XUANCHENG TANGBIAO SANITARY WARE CO.,LTD ANHUI PROVINCE  ADD:QILIN ROAD, XUANZHOU INDUSTRIAL PARK,XUANCHENG AN

In [35]:
d_i = davies_bouldin_index(bag_of_words(lcs_lines), c3)
s_c = silhouette(X_LCS, c3, lcs_metric)
print(f'Indeks Daviesa-Bouldina: {d_i}')
print(f'Silhouette Coefficient: {s_c}')

Indeks Daviesa-Bouldina: 1.2999548981416271
Silhouette Coefficient: 0.052186276937939445


## Z użyciem stop listy

In [66]:
stopwords = get_stop_list(lines, 10)
X = bag_of_words(lines, stopwords)

Na stop liście znajduje się 10 najczęściej występujących słów w tekście.

In [38]:
print(f'Stop lista: {stopwords}')

Stop lista: ['ltd', 'tel', 'co', 'fax', 'china', 'ul', 'poland', 'no', '48', 'russia']


### "Metryka" euklidesowa

In [39]:
c1 = cluster(X, euclidean_metric, epsilon=1)

In [40]:
print_clusters(lines, c1, 'result_euclidean_stoplist.txt')

Number of clusters: 487

['HANARO T&S CO LTD (BUSAN)RM 502 TRADE CENTER87-7, CHUNGANG-DONG 4-GA CHUNG-KU 6BUSAN KOREA, SOUTH\n']

###############

['PANTOS LOGISTICS HCMC 111 PASTEUR ST, BEN NGHE WD, DIST ROOM 801 8TH FL, SAILING BLDG VIETNA VIETNAM HO CHI MINH CITY (SAIGON) 65 VN\n']

###############

['AGILITY LOGISTICS (SHANGHAI) LTD Building No. 9, IBP 280 Linhong Rd, Changning District Shanghai, 200335CHINA\n']

###############

['OPENLOG SP.Z.O.O. UL.HUTNICZA 3 81-212 GDYNIA KAROLINA JADESZKO TEL.:+48 58 699 12 43\n']

###############

['T.S.TRANSPED SP.ZO.O. UL. KWIATKOWSKIEGO 60 81-127 GDYNIA POLAND TEL:(+48 58)62 13671 CONTRACT PERSON: MARZENA PRZYBYSZ\n']

###############

['IKEA DISTRIBUTION SERVICES POLAND S.A. UL.M. GRABINSKIEGO 50, JAROSTYPL-97310 MOSZCZENICA POLAND\n']

###############

['JIANGXI INTL LOGISTICS CO LTD2 NANJING WEST RD9F330006 NANCHANG CHINA\n']

###############

["FMG SHIPPING&FORWARDING LTD. 190020,SAINT PETERSBURG, LIFLYANDSKAYASTR.6,LIERA A',BUILDING 

In [41]:
d_i = davies_bouldin_index(X, c1)
s_c = silhouette(X, c1, euclidean_metric)
print(f'Indeks Daviesa-Bouldina: {d_i}')
print(f'Silhouette Coefficient: {s_c}')

Indeks Daviesa-Bouldina: 0.12413225149204989
Silhouette Coefficient: 0.03724507372337224


### "Metryka" cosinusowa

In [42]:
c2 = cluster(X, cosine_metric, epsilon=0.3)

In [43]:
print_clusters(lines, c2, 'result_cosine_stoplist.txt')

Number of clusters: 444

['HANARO T&S CO LTD (BUSAN)RM 502 TRADE CENTER87-7, CHUNGANG-DONG 4-GA CHUNG-KU 6BUSAN KOREA, SOUTH\n']

###############

['PANTOS LOGISTICS HCMC 111 PASTEUR ST, BEN NGHE WD, DIST ROOM 801 8TH FL, SAILING BLDG VIETNA VIETNAM HO CHI MINH CITY (SAIGON) 65 VN\n']

###############

['AGILITY LOGISTICS (SHANGHAI) LTD Building No. 9, IBP 280 Linhong Rd, Changning District Shanghai, 200335CHINA\n']

###############

['OPENLOG SP.Z.O.O. UL.HUTNICZA 3 81-212 GDYNIA KAROLINA JADESZKO TEL.:+48 58 699 12 43\n']

###############

['T.S.TRANSPED SP.ZO.O. UL. KWIATKOWSKIEGO 60 81-127 GDYNIA POLAND TEL:(+48 58)62 13671 CONTRACT PERSON: MARZENA PRZYBYSZ\n']

###############

['IKEA DISTRIBUTION SERVICES POLAND S.A. UL.M. GRABINSKIEGO 50, JAROSTYPL-97310 MOSZCZENICA POLAND\n']

###############

['JIANGXI INTL LOGISTICS CO LTD2 NANJING WEST RD9F330006 NANCHANG CHINA\n']

###############

["FMG SHIPPING&FORWARDING LTD. 190020,SAINT PETERSBURG, LIFLYANDSKAYASTR.6,LIERA A',BUILDING 

['ASIAN TIGERS TRANSPACK CO., LTD. 901, HOSEO UNIV. VENTURE TOWER 319, KASAN-DONG KEUMCHUN-KU, SEOUL 153-802 KOREA\n']

###############

['BOSSI&C. TRANSITI S.P.A. VIA D, FIASELIA ,1 16121 GENOVA TEL:010  57161 FAX: 010 582346\n']

###############

['IKEA DISTRIBUTION SERVICES S.A. UL.M.GRABINSKIEGO 50, JAROSTY PL-97310MOSZCZENICA POLAND\n'
 'IKEA Distribution Services S.A. UL.M.Grabinskiego 50, Jarosty PL-97310Moszczenica POLAND\n']

###############

['NORINCO SHANGHAI CO.,LTD 15F.,1228SOUTH ZHONG SHAN ROAD, SHANGHAI,200120 CHINA +86-21-63155558\n']

###############

['LEAD WAY EXPRESS CO LTDLEAD WAY EXPRESS CO LTD 60 FU SHINROOM 802 8F TAIWAN\n']

###############

['TO ORDER OF MAPLE FOREST INC. 50, 30-TH FLOOR, PLAZA BANCO  GENERAL, AQUILINO DE LA GUARDIA STREET, PANAMA,  REPUBLIC OF PANAMA\n']

###############

['ADE LINE SA  UL. POLNA 14, 55-110 PRUSICE TEL:+48717355893          FAX:+48713528414\n']

###############

['PANALPINA WORLD TRANSPORT PHIL (MNL3&4/F ZEALCOR BUSINESS CENT

In [44]:
d_i = davies_bouldin_index(X, c2)
s_c = silhouette(X, c2, cosine_metric)
print(f'Indeks Daviesa-Bouldina: {d_i}')
print(f'Silhouette Coefficient: {s_c}')

Indeks Daviesa-Bouldina: 0.5746799108365392
Silhouette Coefficient: 0.11416550331544327


### "Metryka" LCS

Ze względu na bardzo długi czas wykonywania klastryzacji dla metryki LCS, zmniejszamy dane wejściowe do 100 lini.

In [69]:
n = 100
lcs_lines = np.random.choice(lines, n, replace=False)

def rm_sw(line):
    text_tokens = word_tokenize(line.lower())
    tokens_without_sw = [word for word in text_tokens if not word in stopwords]
    filtered_sentence = (" ").join(tokens_without_sw)
    return filtered_sentence

lsc_lines = np.vectorize(rm_sw)(lcs_lines)
X_LCS = np.arange(lcs_lines.shape[0]).reshape(-1, 1)

c3 = cluster(X_LCS, lcs_metric, epsilon=0.5)

In [71]:
c3 = cluster(X_LCS, lcs_metric, epsilon=0.3)

In [77]:
print_clusters(lsc_lines, c3, 'result_lcs_stoplist.txt')

Number of clusters: 98

['ifl international freight lines limited']

###############

['biazet spolka akcyjna gen.wladyslawa andersa 38 bialystok 15-113 lukasz koziol , ewa nasuta telephone : 856784700']

###############

['ningbo topluck imp. & exp.co. , rm.602 bldg.11 east zone of new world385 xintian road ningbo']

###############

['itella logistics oy , huurrekuja 2 , fi-04360 tuusula , finland t : 358 ( 9 ) -2512-1711 , f : 358 ( 9 ) -8251885']

###############

['zhenhua logistics group ltd68 renming road19/f gold name building116001 dalian']

###############

["ooo `` shkval '' 390000 , ryazan , koltcova ul.2 , inn 6234067795"]

###############

['winteam industrial development company limited room 801-2 , 8/f. , easeycommercial building , 253-261 hennessy road , wanchai , h.k .']

###############

['global goodwill logistics corp. cell03.7floor , blocks , senling real estate 200080 no.469 wu song rd . shanghai'
 'ningbo sanbang thread industry co. , . no.62 , changfeng industr


["ooo `` kontex '' 199004. , saintpetersburg , 4 - liniya vasilevskogoostrova , d 17. . 8 812 323- 95-95"]

###############

['hercules logistics co. , ltd.o/b woojin global logistics co. , . o/byin tong ( dong guan city ) glass co. , . xie xi village north , shatian town,523982 dongguan']

###############

['shengzhou tree electrical appliances co. , . no.2 industry zone , sanjie , shengzhou , zhejiang , chin a']

###############



In [75]:
d_i = davies_bouldin_index(X_LCS, c3)
s_c = silhouette(X_LCS, c3, lcs_metric)
print(f'Indeks Daviesa-Bouldina: {d_i}')
print(f'Silhouette Coefficient: {s_c}')

Indeks Daviesa-Bouldina: 2.28422355349415
Silhouette Coefficient: 0.01763197507037525
