# Preprocessing

This notebook is needed for preparing the input datasets: upper case and etc.

### The input datasets should be in the following format:
- Company name (string)
- Marketplace (string)
- Country (string)
- State (string)
- City (string)
- Zip Code (string)
- Street (string)
- URL (string)
- Industry (SIC)

In [27]:
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import sys
import numpy as np
import string
import collections
from textblob import TextBlob
import Levenshtein
import binascii

import matplotlib.pyplot as plt
import random


In [3]:
def df_import():
    # import FR data (adjustments for delimeters and encoding - latin)
    df = pd.read_csv("~/Dropbox/Botva/TUM/Master_Thesis/object-identification/datasets/processed_files/france_rna_processed.csv", encoding='latin-1', sep = ';', error_bad_lines=False) 
#    df = pd.read_csv("~/Dropbox/Botva/TUM/Master_Thesis/object-identification/datasets/raw_files/rna_waldec_20201201_dpt_01.csv", error_bad_lines=False)
#    df = df.astype(str)
    print(df)
    print(df.dtypes)
    return df

In [4]:
def df_prepare(df):
    df = df.apply(lambda x: x.astype(str).str.upper())
    df['name'] = df['name'].apply(lambda x: x.replace('.',''))
    df['name'] = df['name'].str.replace('[^0-9a-zA-Z]+', ' ')
    df['name'] = df['name'].str.replace(' +', ' ')
    df['name_split'] = df['name'].str.split(' ')
    print(df)
    print(df.dtypes)

    return df

In [None]:
def frequent_words(df_processed):
    b = TextBlob("bonjour")
    b.detect_language()
    print(df_processed['name'])
    all_words = df_processed['name']    
    all_words_cleaned = []

    for text in all_words:
        text = [x.strip(string.punctuation) for x in text]
        all_words_cleaned.append(text)

    all_words_cleaned[0]

    text_words = [" ".join(text) for text in all_words_cleaned]
    final_text_words = " ".join(text_words)
    #final_text_words[:1000]

    print(all_words)
    stopwords = set(STOPWORDS)
    stopwords.update(["LE","DE","LA","ET","DES","DU","LES","EN","ET","A","POUR","SUR","SOU","S","D","L"])

    wordcloud_names = WordCloud(stopwords=stopwords, background_color="white", max_font_size=50, max_words=100).generate(final_text_words)

    # Lines 4 to 7
    plt.figure(figsize = (15,15))
    plt.imshow(wordcloud_names, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    filtered_words = [word for word in final_text_words.split() if word not in stopwords]
    counted_words = collections.Counter(filtered_words)

    word_count = {}

    for letter, count in counted_words.most_common(100):
        word_count[letter] = count

    for i,j in word_count.items():
        print('Word: {0}, count: {1}'.format(i,j))
    
    return word_count

Levenstein Distance

In [143]:
texts_1 = df_processed['name']
texts_2 = df_processed['name']
lvn_array = np.zeros((len(texts_1),len(texts_2)))
print(lvn_array)

text_1_num = 0
text_2_num = 0

for text_1 in texts_1:
    for text_2 in texts_2:
        lvn_array[text_1_num,text_2_num] = Levenshtein.ratio(text_1,text_2)
        text_2_num += 1 
    text_2_num = 0    
    text_1_num += 1 
lvn_array

array([[1.        , 0.48275862, 0.3       , ..., 0.29787234, 0.17391304,
        0.38095238],
       [0.48275862, 1.        , 0.27586207, ..., 0.39285714, 0.1875    ,
        0.33333333],
       [0.3       , 0.27586207, 1.        , ..., 0.29787234, 0.43478261,
        0.19047619],
       ...,
       [0.29787234, 0.39285714, 0.29787234, ..., 1.        , 0.2       ,
        0.29166667],
       [0.17391304, 0.1875    , 0.43478261, ..., 0.2       , 1.        ,
        0.16666667],
       [0.38095238, 0.33333333, 0.19047619, ..., 0.29166667, 0.16666667,
        1.        ]])

In [87]:
n = 2
shingles_list = []

texts = df_processed['name']
hashes_list = np.array([0] * len(df_processed['name']))
hashes_array = hashes_list

text_num = 0
shingle_num = 0

for text in texts:
#    print(text)
#    text = text.encode()
    shingles = [text[i:i + n] for i in range(len(text) - n + 1)]
#    print(shingles)
    for shingle in shingles:
        if shingle not in shingles_list:
            hashes_array = np.vstack((hashes_array, hashes_list)) #needs to be removed after last iteration
            shingles_list.append(shingle)
            hashes_array[shingle_num,text_num] = 1
            shingle_num += 1
        if shingle in shingles_list:
            hashes_array[shingles_list.index(shingle),text_num] = 1
    text_num += 1
    
hashes_array




KeyboardInterrupt: 

In [89]:
df = df_import()

         local_id                                               name country  \
0      W011000001                                         VENT D'EST  FRANCE   
1      W011000002                                UNE NOTE DE PARTAGE  FRANCE   
2      W011000003                                         CHANDELAIN  FRANCE   
3      W011000004                                   LEYMENT-MATERIEL  FRANCE   
4      W011000005                          ASSOCIATION 'TOILES EMOI'  FRANCE   
...           ...                                                ...     ...   
18119  W014004922                                  AU BONHEUR D'ELDA  FRANCE   
18120  W014004923                                 JE S'AIME ET VOUS?  FRANCE   
18121  W014004924  'ASSOCIATION COMMUNALE DE CHASSE AGREE DE MART...  FRANCE   
18122  W014004925  'ASSOCIATION COMMUNALE DE CHASSE AGREEE DE GRO...  FRANCE   
18123  W014004926                                    L'ATTITUDE NORD  FRANCE   

       state               city    zip 

In [90]:
df_processed = df_prepare(df)

         local_id                                               name country  \
0      W011000001                                         VENT D EST  FRANCE   
1      W011000002                                UNE NOTE DE PARTAGE  FRANCE   
2      W011000003                                         CHANDELAIN  FRANCE   
3      W011000004                                   LEYMENT MATERIEL  FRANCE   
4      W011000005                           ASSOCIATION TOILES EMOI   FRANCE   
...           ...                                                ...     ...   
18119  W014004922                                  AU BONHEUR D ELDA  FRANCE   
18120  W014004923                                 JE S AIME ET VOUS   FRANCE   
18121  W014004924   ASSOCIATION COMMUNALE DE CHASSE AGREE DE MART...  FRANCE   
18122  W014004925   ASSOCIATION COMMUNALE DE CHASSE AGREEE DE GRO...  FRANCE   
18123  W014004926                                    L ATTITUDE NORD  FRANCE   

      state               city    zip  

In [91]:
texts = df_processed['name']

texts

0                                               VENT D EST
1                                      UNE NOTE DE PARTAGE
2                                               CHANDELAIN
3                                         LEYMENT MATERIEL
4                                 ASSOCIATION TOILES EMOI 
                               ...                        
18119                                    AU BONHEUR D ELDA
18120                                   JE S AIME ET VOUS 
18121     ASSOCIATION COMMUNALE DE CHASSE AGREE DE MART...
18122     ASSOCIATION COMMUNALE DE CHASSE AGREEE DE GRO...
18123                                      L ATTITUDE NORD
Name: name, Length: 18124, dtype: object

In [None]:
shingles_list

first, set of shingles
then, array (check from set)




In [92]:
n = 2
shingles_list = set()

for text in texts:
#    print(text)
#    text = text.encode()
    shingles = [text[i:i + n] for i in range(len(text) - n + 1)]
#    print(shingles)
    for shingle in shingles:
        if shingle not in shingles_list: #check, maybe if is not needed bc its a set
            shingles_list.add(shingle)
    
shingles_list = sorted(shingles_list)
shingles_dict = dict(zip(shingles_list,range(len(shingles_list))))
shingles_dict

{' 0': 0,
 ' 1': 1,
 ' 2': 2,
 ' 3': 3,
 ' 4': 4,
 ' 5': 5,
 ' 6': 6,
 ' 7': 7,
 ' 8': 8,
 ' 9': 9,
 ' A': 10,
 ' B': 11,
 ' C': 12,
 ' D': 13,
 ' E': 14,
 ' F': 15,
 ' G': 16,
 ' H': 17,
 ' I': 18,
 ' J': 19,
 ' K': 20,
 ' L': 21,
 ' M': 22,
 ' N': 23,
 ' O': 24,
 ' P': 25,
 ' Q': 26,
 ' R': 27,
 ' S': 28,
 ' T': 29,
 ' U': 30,
 ' V': 31,
 ' W': 32,
 ' X': 33,
 ' Y': 34,
 ' Z': 35,
 '0 ': 36,
 '00': 37,
 '01': 38,
 '02': 39,
 '03': 40,
 '04': 41,
 '05': 42,
 '06': 43,
 '07': 44,
 '08': 45,
 '09': 46,
 '0E': 47,
 '0S': 48,
 '0V': 49,
 '1 ': 50,
 '10': 51,
 '11': 52,
 '12': 53,
 '13': 54,
 '14': 55,
 '15': 56,
 '16': 57,
 '17': 58,
 '18': 59,
 '19': 60,
 '1E': 61,
 '1N': 62,
 '1P': 63,
 '1S': 64,
 '2 ': 65,
 '20': 66,
 '21': 67,
 '22': 68,
 '23': 69,
 '24': 70,
 '25': 71,
 '26': 72,
 '27': 73,
 '28': 74,
 '29': 75,
 '2A': 76,
 '2B': 77,
 '2C': 78,
 '2D': 79,
 '2E': 80,
 '2F': 81,
 '2G': 82,
 '2I': 83,
 '2J': 84,
 '2M': 85,
 '2O': 86,
 '2P': 87,
 '2R': 88,
 '2S': 89,
 '2V': 90,
 '2W': 91

In [93]:
docs = [[] for i in range(len(df_processed['name']))]

#hashes_array = np.zeros((len(shingles_list), len(df_processed['name'])))

for doc, text in zip(docs, texts):
    shingles = [text[i:i + n] for i in range(len(text) - n + 1)]    
    for shingle in shingles:
        doc.append(shingles_dict[shingle])
docs

[[788, 338, 593, 727, 13, 290, 14, 343, 722],
 [769,
  578,
  320,
  23,
  588,
  621,
  735,
  320,
  13,
  299,
  320,
  25,
  631,
  220,
  689,
  731,
  209,
  387],
 [273, 408, 216, 577, 299, 336, 514, 211, 448],
 [518, 349, 856, 549, 338, 593, 727, 22, 545, 222, 735, 342, 678, 439, 336],
 [221,
  721,
  717,
  604,
  274,
  435,
  222,
  739,
  449,
  615,
  570,
  29,
  745,
  610,
  446,
  518,
  343,
  696,
  14,
  337,
  559,
  610,
  432],
 [215,
  553,
  437,
  266,
  214,
  518,
  320,
  13,
  299,
  343,
  696,
  22,
  545,
  220,
  670,
  211,
  453],
 [280,
  614,
  553,
  454,
  735,
  320,
  13,
  290,
  10,
  216,
  582,
  447,
  545,
  222,
  739,
  449,
  615,
  570,
  13,
  299,
  320,
  21,
  518,
  349,
  856,
  549,
  338,
  593],
 [205,
  266,
  206,
  299,
  337,
  553,
  439,
  320,
  13,
  299,
  320,
  29,
  731,
  207,
  320,
  20,
  507,
  819,
  615,
  570,
  13,
  309,
  599,
  22,
  545,
  220,
  689,
  739,
  435,
  214,
  509,
  14,
  344,
  727,
  

In [94]:
min(len(d) for d in texts)

2

In [95]:
signature_size = 50 #signature size
#signatures = [[] for i in range(len(signature_size))]

signatures = np.zeros((signature_size, len(docs)))

#signatures = [[] for i in range(len(signature_size))]

shingles_shuffled = [i for i in range(len(shingles_list))]

for signature in signatures:
    random.shuffle(shingles_shuffled)    
    for doc_index, doc in enumerate(docs):
        doc_a = [shingles_shuffled[i] for i in doc]
        signature[doc_index] = min(doc_a)

signatures

    

array([[140.,  35.,  11., ...,  17.,  17.,   5.],
       [ 69.,  18., 153., ...,   7.,   7.,  36.],
       [131.,  33., 142., ...,   5.,   5., 123.],
       ...,
       [  5.,   5., 125., ...,   4.,   4.,  90.],
       [ 62.,  23.,   7., ...,  37.,  34.,  85.],
       [ 42.,  40.,  65., ...,  14.,  21.,  80.]])

In [None]:
    for num, name in enumerate(presidents, start=1):
    print("President {}: {}".format(num, name))
    
    for shingle in shingles_shuffled:
        while all columns of signature are filled:
            signature[doc] = shingle[doc] 
    
    
    
    
    for i, doc in zip(signature,docs):
        shingles_dict_sh
        i = doc[shingle]
            print(signatures)
            break
#             = 
        break
    break
x =+ 1


In [None]:
Levenshtein.ratio(df_processed['name'],df_processed['name'])


In [None]:
from textdistance import levenshtein

df_1 = df_processed['name']
df_2 = df_processed['name']
df_1.apply(lambda x: levenshtein.distance(df_1['name'], df_2['name']), axis=1)

In [69]:
docs = [{} for i in range(len(df_processed['name']))]

#hashes_array = np.zeros((len(shingles_list), len(df_processed['name'])))

for doc, text in zip(docs, texts):
    shingles = [text[i:i + n] for i in range(len(text) - n + 1)]    
    for shingle in shingles:
        doc[shingle] = shingles.index(shingle)
docs

ValueError: ' AI' is not in list

In [None]:
!{sys.executable} -m pip install textdistance