In [17]:
import pandas as pd
import regex
import string
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stop-word corpus (no-op when it is already up to date).
# No trailing comma: it would wrap the result in a 1-tuple and display `(True,)`.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aleksandra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(True,)

In [23]:
from many_stop_words import get_stop_words
 
# Stop words for language code 'pl'. prepare_perp_table reads this module-level
# name directly, so this cell has to run before that function is called.
stop_words = get_stop_words('pl')

In [4]:
# Read the ';'-separated CSV (path is relative to the notebook's working directory).
df = pd.read_csv("allegro-sports-shoes.csv", sep=';')

In [5]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df.sample(20, random_state=0)

Unnamed: 0,title
1051,BUTY MĘSKIE ADIDAS TUBULAR X S74929 SKÓRA! r. 44
4875,Nowe BUTY NIKE SB CHECK r.44 janoski portmore
4861,Buty męskie adidas vs Pace DB0151 45 1/3
4912,"Buty Adidas YEEZY BOOST 350 V2 39 1/3 24,5c SE..."
1953,"Buty New Balance Lifestyle [MRL247GW] r.44,5"
97,Buty męskie Reebok Ros Workout TR 2.0 AR2979
5481,"Buty NIKE T-LITE XI czarne 616544-007 - 47,5"
3891,NIKE AIR MAX MOTION 2 AO0266 001 BUTY MĘSKIE L...
136,BUTY SPORTOWE MĘSKIE ADIDASY AIR KS37 CZERWONE...
4976,ADIDAS ORIGINALS CLIMACOOL 1 BB0539 BUTY MĘSKIE


In [6]:
# De-duplicate the first column. dict.fromkeys keeps first-occurrence order,
# whereas set() iteration order for strings can change between kernel sessions.
data = list(dict.fromkeys(df.iloc[:, 0]))

In [7]:
# Preview only the first entries instead of dumping the whole list into the output.
data[:20]

['Buty Adidas Tubular Viral S75910 Originals',
 'LONSDALE buty Canons adidasy 43 obuwie sportowe h2',
 'Buty męskie sportowe Kappa Orbit 242523-1111',
 'BUTY NIKE MĘSKIE AIR VAPORMAX 2019 AR6631-002',
 'SPORTOWE ADIDASY BUTY DO BIEGANIA ENERGY NEW',
 'REEBOK ROYAL COMPLETE BS6288 BUTY MĘSKIE WYSOKIE',
 'BUTY NIKE COURT MAJESTIC LEATHER 574236-100 45',
 'Buty Adidas Tubular Invader Strap [BB8394] 44',
 'Halówki młodzieżowe buty sportowe AXIM 43',
 'Buty PUMA REBOUND STREET V2 (363715-09) 43|9',
 'buty NEW BALANCE męskie ML373BN R. 44',
 'Granatowe sportowe trampki zamsz buty 8104 43',
 'Buty męskie Adidas Tubular Nova S74819 r.42',
 'LACOSTE MISANO SPORT 317 LEATHER Buty tu 41 - 26cm',
 'Buty Męskie sportowe adidasy półbuty x925-3 R.43',
 'Converse Trampki Buty Męskie Skóra Czarne 43',
 'Buty męskie adidas RUNFALCON F36199',
 'Buty trampki CONVERSE CT II HI 150143C 39,5',
 'Buty PUMA NRGY COMET Męskie (190556-01) 43|9',
 'MOSKAŁA skóra BUTY męskie GRANAT rozmiar 41',
 '44 BUTY MĘSKIE NI

In [9]:
def clean_data(data, filter=None):
    """Lower-case every title and blank out punctuation and digit-led tokens.

    Each title goes through the same ordered sequence of substitutions, and
    every match is replaced by a single space (so runs of spaces and
    leading/trailing spaces can remain in the result):

    1. any ASCII punctuation character,
    2. runs of space-delimited tokens that start with a digit,
    3. a digit-led token at the very start of the title,
    4. a digit-led token at the very end of the title.

    Parameters
    ----------
    data : iterable of str
        Raw titles.
    filter : str, optional
        When truthy, only cleaned titles containing this substring are kept.
        The comparison is made against the already lower-cased text.

    Returns
    -------
    list of str
        Cleaned titles, in input order.
    """
    # Alternation of all punctuation characters; every one but the first is
    # backslash-escaped by the join separator.
    punctuation_pattern = '|\\'.join(string.punctuation)
    ordered_patterns = (
        punctuation_pattern,
        ' ([0-9]+[^ ]{0,} )+',
        '^[0-9]+[^ ]{0,} ',
        ' [0-9]+[^ ]{0,}$',
    )

    cleaned = []
    for title in data:
        text = title.lower()
        for pattern in ordered_patterns:
            text = regex.sub(pattern, ' ', text)
        cleaned.append(text)

    if filter:
        cleaned = [text for text in cleaned if text.find(filter) > -1]

    return cleaned

In [11]:
def prepare_freq_table(data):
    """Build a document-by-token count table.

    Parameters
    ----------
    data : iterable of str
        Texts to tokenize (one row per text).

    Returns
    -------
    pandas.DataFrame
        One row per input text and one column per distinct token; each cell
        holds the number of times the token occurs in that text, with 0 where
        the token does not occur.
    """
    per_text_counts = []
    for text in data:
        tokens = word_tokenize(text)
        per_text_counts.append(dict(FreqDist(tokens)))

    # Tokens missing from a text come out as NaN; turn them into explicit zeros.
    return pd.DataFrame.from_dict(per_text_counts).fillna(0)

In [12]:
def prepare_perp_summary(v1, v2, term1, term2, perp_threshold=5):
    """Summarise how two term-indicator vectors relate to each other.

    Parameters
    ----------
    v1, v2 : array-like supporting element-wise ``*``
        Per-document values for ``term1`` and ``term2``.
    term1, term2 : hashable
        Labels stored verbatim in the result.
    perp_threshold : number, default 5
        The pair is flagged ``is_perp`` when the sum of the element-wise
        product of the two vectors is strictly below this value.

    Returns
    -------
    dict
        Keys, in order: ``term1``, ``term2``, ``distance`` (sum of the
        element-wise product), ``is_perp``, ``size`` (sum of both vector
        totals), ``size_term1``, ``size_term2``, ``size_prop12``
        (total1 / total2) and ``size_prop21`` (total2 / total1).
    """
    overlap = sum(v1*v2)
    total1 = sum(v1)
    total2 = sum(v2)

    summary = {}
    summary['term1'] = term1
    summary['term2'] = term2
    summary['distance'] = overlap
    summary['is_perp'] = overlap < perp_threshold
    summary['size'] = total1 + total2
    summary['size_term1'] = total1
    summary['size_term2'] = total2
    summary['size_prop12'] = total1/total2
    summary['size_prop21'] = total2/total1
    return summary
,,,

In [26]:
def prepare_perp_table(freq_table, sw='polish', rm_low_freq=5, rm_high_freq=5, perp_threshold=5):
    """Build the table of term pairs flagged as ``is_perp``.

    Columns (terms) of ``freq_table`` are filtered, the remaining counts are
    reduced to 0/1 indicators, and every unordered pair of remaining terms is
    summarised with ``prepare_perp_summary``. Only pairs flagged ``is_perp``
    are returned.

    Parameters
    ----------
    freq_table : pandas.DataFrame
        One row per document, one column per term (string labels), cells are
        occurrence counts.
    sw : str, default 'polish'
        Accepted for interface compatibility but not used: the stop-word list
        is always the module-level ``stop_words``.
    rm_low_freq : number, default 5
        Terms whose total count is not strictly above this value are dropped.
    rm_high_freq : number, default 5
        Terms whose total count is not strictly below
        ``n_documents - rm_high_freq`` are dropped.
    perp_threshold : number, default 5
        Forwarded to ``prepare_perp_summary``.

    Returns
    -------
    pandas.DataFrame
        One row per ``is_perp`` pair with the columns produced by
        ``prepare_perp_summary``. When fewer than two terms survive the
        filters an empty frame with those columns is returned.
    """
    # remove short words
    long_enough = [len(term) > 2 for term in freq_table.columns]
    data = freq_table.loc[:, long_enough]
    # remove polish stopwords
    data = data.loc[:, ~data.columns.isin(stop_words)]
    # remove words with low and high frequency (totals are raw counts, taken
    # before the 0/1 conversion below)
    totals = data.sum()
    keep = (totals > rm_low_freq) & (totals < data.shape[0] - rm_high_freq)
    data = data.loc[:, keep.to_numpy()]
    # convert positive numbers to 1; this builds a new frame instead of
    # assigning column by column into a slice (which raises SettingWithCopyWarning)
    data = (data > 0).astype('int64')

    terms = list(data.columns)
    # Pull each column out once rather than on every pair.
    indicators = [data[term] for term in terms]

    perp_rows = []
    for i in range(len(terms) - 1):
        for j in range(i + 1, len(terms)):
            perp_rows.append(prepare_perp_summary(indicators[i],
                                                  indicators[j],
                                                  terms[i],
                                                  terms[j],
                                                  perp_threshold=perp_threshold))

    if not perp_rows:
        # No pairs at all: a frame built from [] would have no 'is_perp'
        # column, so return an explicitly shaped empty result.
        return pd.DataFrame(columns=['term1', 'term2', 'distance', 'is_perp', 'size',
                                     'size_term1', 'size_term2',
                                     'size_prop12', 'size_prop21'])

    perp_table = pd.DataFrame.from_dict(perp_rows)
    return perp_table[perp_table['is_perp']]

In [18]:
def find_perp_set(perp_table, input_data_len, max_proportion=3, bp=0.90):
    """Greedily grow a set of terms in which every pair appears in ``perp_table``.

    The set is seeded with the largest (by ``size``) pair whose two size
    ratios are both below ``max_proportion``. On every iteration the terms
    paired with *all* current members are collected, and the one taking part
    in the largest admissible pair with a current member is added. Progress is
    printed after every added term.

    Parameters
    ----------
    perp_table : pandas.DataFrame
        Needs the columns ``term1``, ``term2``, ``size``, ``size_term1``,
        ``size_prop12`` and ``size_prop21``.
    input_data_len : int
        Number of input documents; used as the denominator of the printed and
        tracked proportion.
    max_proportion : number, default 3
        Upper bound (exclusive) on both size ratios of an admissible pair.
    bp : float, default 0.90
        The loop stops once the tracked proportion reaches this value.

    Returns
    -------
    list
        The selected terms in the order they were added. Empty when
        ``perp_table`` holds no admissible seed pair.
    """
    # find two perp words with the highest size
    seeds = perp_table[(perp_table.size_prop12 < max_proportion) &
                       (perp_table.size_prop21 < max_proportion)].sort_values(by=['size'])
    if seeds.shape[0] == 0:
        # Nothing to seed the set with; previously this failed on .iloc[-1].
        print('early stoping - no more words')
        return []
    seed = seeds.iloc[-1, :]
    curr_set = [seed.term1, seed.term2]

    inside = perp_table[perp_table.term1.isin(curr_set) & perp_table.term2.isin(curr_set)]
    print('+{}: {}% size of set'.format(curr_set[0], round(inside['size_term1'].sum()/(input_data_len) * 100, 2)))
    p = inside['size'].sum() / (input_data_len)
    print('+{}: {}% size of set'.format(curr_set[1], round(p * 100, 2)))

    while p < bp:
        # Pairs that are not entirely inside the current set.
        outside = perp_table[~(perp_table.term1.isin(curr_set) & perp_table.term2.isin(curr_set))]

        # find perp elements to all words in curr_set
        perp_terms = None
        for c in curr_set:
            rows = outside[(outside.term1 == c) | (outside.term2 == c)]
            partners = set(rows.term1) | set(rows.term2)
            # discard, not remove: a member may have no remaining pairs at all
            partners.discard(c)
            perp_terms = partners if perp_terms is None else perp_terms & partners
        perp_terms = list(perp_terms)

        candidates = outside[((outside.term1.isin(perp_terms) & outside.term2.isin(curr_set)) |
                              (outside.term1.isin(curr_set) & outside.term2.isin(perp_terms))) &
                             ((outside.size_prop12 < max_proportion) &
                              (outside.size_prop21 < max_proportion))].sort_values(by=['size'])
        if candidates.shape[0] == 0:
            print('early stoping - no more words')
            break

        best = candidates.iloc[-1, :]
        # The end of the best pair that is a candidate and not yet a member.
        new_term = [t for t in (best.term1, best.term2)
                    if t in perp_terms and t not in curr_set]

        # add this element to curr_set (appending keeps the result order deterministic)
        curr_set = curr_set + new_term

        p = perp_table[perp_table.term1.isin(curr_set) & perp_table.term2.isin(curr_set)]['size'].sum()/((len(curr_set)-1)*input_data_len)
        print('+{}: {}% size of set'.format(new_term[0], round(p*100, 2)))

    return curr_set

In [20]:
# NOTE(review): this rebinds `data` to its cleaned form, so re-running the cell
# feeds already-cleaned text back into clean_data; the raw titles are then only
# recoverable by re-running the cell that built `data` from `df`.
data = clean_data(data)


In [21]:
# Preview only the first cleaned titles instead of dumping the whole list.
data[:20]

['buty adidas tubular viral s75910 originals',
 'lonsdale buty canons adidasy obuwie sportowe h2',
 'buty męskie sportowe kappa orbit ',
 'buty nike męskie air vapormax ar6631 ',
 'sportowe adidasy buty do biegania energy new',
 'reebok royal complete bs6288 buty męskie wysokie',
 'buty nike court majestic leather ',
 'buty adidas tubular invader strap  bb8394  ',
 'halówki młodzieżowe buty sportowe axim ',
 'buty puma rebound street v2   ',
 'buty new balance męskie ml373bn r  ',
 'granatowe sportowe trampki zamsz buty ',
 'buty męskie adidas tubular nova s74819 r ',
 'lacoste misano sport leather buty tu   ',
 'buty męskie sportowe adidasy półbuty x925 r ',
 'converse trampki buty męskie skóra czarne ',
 'buty męskie adidas runfalcon f36199',
 'buty trampki converse ct ii hi ',
 'buty puma nrgy comet męskie   ',
 'moskała skóra buty męskie granat rozmiar ',
 ' buty męskie nike ebernon low aq1775 biały',
 'r  buty adidas terrex brushwood ac7851',
 'buty męskie adidas climacool bb0539 

In [28]:
# Full pipeline: token-count table -> table of `is_perp` term pairs -> greedy
# selection of a term set. find_perp_set prints one progress line per added term.
freq_table = prepare_freq_table(data)
perp_table = prepare_perp_table(freq_table)
perp_set_1 = find_perp_set(perp_table, input_data_len=len(data))

+adidas: 28.74% size of set
+nike: 49.28% size of set
+puma: 58.43% size of set
+balance: 67.34% size of set
+reebok: 73.73% size of set
+adidasy: 77.55% size of set
+lacoste: 80.77% size of set
+asics: 83.33% size of set
+kappa: 85.02% size of set
+under: 85.98% size of set
+salomon: 86.72% size of set
+timberland: 87.45% size of set
+caterpillar: 88.13% size of set
+converse: 88.75% size of set
+vans: 89.23% size of set
+big: 89.71% size of set
+tenisówki: 90.17% size of set
