# Preprocessing

This notebook prepares the input datasets: it converts all fields to upper case, removes punctuation from the names and splits the names into words.

### The input datasets should be in the following format:
- Company name (string)
- Marketplace (string)
- Country (string)
- State (string)
- City (string)
- Zip Code (string)
- Street (string)
- URL (string)
- Industry (SIC)

In [None]:
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import sys
import numpy as np
import string
import collections
from textblob import TextBlob
import Levenshtein
import binascii

import matplotlib.pyplot as plt
import random
import time

In [None]:
def df_import(path="~/Dropbox/Botva/TUM/Master_Thesis/datasets/processed_files/france_rna_processed.csv"):
    """Load the processed French RNA dataset into a DataFrame.

    The file is read as a semicolon-delimited, latin-1 encoded CSV. Rows that
    cannot be parsed (for example because they have too many fields) are
    skipped instead of aborting the whole load.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the path that was originally
        hard-coded in this notebook, so ``df_import()`` keeps working.

    Returns
    -------
    pandas.DataFrame
        The parsed data. The frame and its dtypes are also printed.
    """
    read_options = {"encoding": "latin-1", "sep": ";"}
    try:
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
        # `on_bad_lines="skip"` is its replacement.
        df = pd.read_csv(path, on_bad_lines="skip", **read_options)
    except TypeError:
        # Older pandas releases do not know `on_bad_lines`.
        df = pd.read_csv(path, error_bad_lines=False, **read_options)
    print(df)
    print(df.dtypes)
    return df

In [None]:
def df_prepare(df):
    """Upper-case every column as text and normalise the ``name`` column.

    Steps applied to ``name`` after upper-casing:
      1. dots are removed outright, so "S.A.R.L." becomes "SARL";
      2. every run of characters other than ASCII letters/digits is replaced
         by a single space (this also collapses repeated spaces);
      3. leading/trailing spaces are stripped so that ``name_split`` does not
         contain empty tokens.

    Gotchas:
      * every column is cast to ``str``, so missing values become the
        literal text "NAN";
      * only ASCII letters and digits are kept, so accented letters are
        replaced by a space.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``name`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with all columns upper-cased, ``name`` cleaned and an
        extra ``name_split`` column holding the list of words of each name.
        The frame and its dtypes are also printed.
    """
    df = df.apply(lambda column: column.astype(str).str.upper())
    df['name'] = (
        df['name']
        .str.replace('.', '', regex=False)
        # regex=True must be explicit: pandas 2.x treats the pattern as a
        # literal string by default, which would silently skip the cleaning.
        .str.replace('[^0-9a-zA-Z]+', ' ', regex=True)
        .str.strip()
    )
    df['name_split'] = df['name'].str.split(' ')
    print(df)
    print(df.dtypes)

    return df

In [None]:
def frequent_words(df_processed):
    """Plot a word cloud of the names and return the most frequent words.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Frame with a ``name`` column of strings.

    Returns
    -------
    dict
        Maps each of the (up to) 100 most frequent non-stopword words to its
        number of occurrences, most frequent first. Empty when the names
        contain no countable word; no figure is drawn in that case.
    """
    # Split every name into words first. Iterating over the raw string would
    # yield single characters, and the statistics would be about letters.
    tokenised_names = [
        [token.strip(string.punctuation) for token in str(name).split()]
        for name in df_processed['name']
    ]
    final_text_words = " ".join(" ".join(tokens) for tokens in tokenised_names)

    stopwords = set(STOPWORDS)
    stopwords.update(["LE","DE","LA","ET","DES","DU","LES","EN","ET","A","POUR","SUR","SOU","S","D","L"])
    # The names are expected in upper case while the stopword set mixes
    # cases, so the membership test is done on case-folded words.
    stopwords_folded = {word.lower() for word in stopwords}

    filtered_words = [
        word for word in final_text_words.split()
        if word.lower() not in stopwords_folded
    ]
    if not filtered_words:
        # Nothing to draw or count (empty frame, or only stopwords).
        return {}

    wordcloud_names = WordCloud(stopwords=stopwords, background_color="white", max_font_size=50, max_words=100).generate(final_text_words)

    plt.figure(figsize = (15,15))
    plt.imshow(wordcloud_names, interpolation='bilinear')
    plt.axis("off")
    plt.show()

    # most_common() returns (word, count) pairs in descending order of count.
    word_count = dict(collections.Counter(filtered_words).most_common(100))

    for word, count in word_count.items():
        print('Word: {0}, count: {1}'.format(word, count))

    return word_count

### Levenshtein Distance

In [None]:
# Pairwise Levenshtein similarity ratio between all names.
# NOTE: this cell reads `df_processed`, which is only assigned in a later
# cell of the notebook, so it cannot run first on a fresh kernel.
texts_1 = df_processed['name']
texts_2 = df_processed['name']
lvn_array = np.zeros((len(texts_1),len(texts_2)))
print(lvn_array)

# Entry [row, col] holds the ratio between the row-th and the col-th name.
for row, name_a in enumerate(texts_1):
    for col, name_b in enumerate(texts_2):
        lvn_array[row, col] = Levenshtein.ratio(name_a, name_b)
lvn_array

In [None]:
# Load the dataset from disk (see df_import for the file location and parsing options).
df = df_import()

In [None]:
# Upper-case all columns and clean/tokenise the names (see df_prepare).
df_processed = df_prepare(df)

In [None]:
# The cleaned names are the input texts for the shingling and similarity cells below.
texts = df_processed['name']

texts

In [None]:
# Build the shingle vocabulary: every distinct run of n consecutive
# characters found in any text, mapped to an integer id.
n = 2
shingle_vocab = set()

for text in texts:
    # A set ignores repeated additions, so no membership test is needed.
    shingle_vocab.update(text[start:start + n] for start in range(len(text) - n + 1))

# Sorting makes the shingle -> id mapping independent of set iteration order.
shingles_list = sorted(shingle_vocab)
shingles_dict = {shingle: position for position, shingle in enumerate(shingles_list)}
shingles_dict

In [None]:
# Display the sorted shingle vocabulary.
shingles_list

In [None]:
# Represent every text as the list of ids of its character shingles, in
# order of appearance. Texts shorter than n characters give an empty list.
docs = [
    [shingles_dict[text[start:start + n]] for start in range(len(text) - n + 1)]
    for text in texts
]
docs

In [None]:
# Build the MinHash signature matrix: one row per random permutation of the
# shingle ids, one column per document. Each entry is the smallest permuted
# id among the document's shingles.
signature_size = 50 #signature size
signature_seed = 0 #fixed seed so that re-running the notebook gives the same signatures
permutation_rng = random.Random(signature_seed)

# Entries start as NaN and stay NaN for documents without any shingle (texts
# shorter than n characters), for which min() would raise ValueError.
# NaN never compares equal to anything, so two such documents are not
# reported as identical by equality- or set-based comparisons.
signatures = np.full((signature_size, len(docs)), np.nan)

shingles_shuffled = list(range(len(shingles_list)))

for signature in signatures:
    # `signature` is a view on one row, so assignments write into `signatures`.
    permutation_rng.shuffle(shingles_shuffled)
    for doc_index, doc in enumerate(docs):
        if not doc:
            continue
        signature[doc_index] = min(shingles_shuffled[i] for i in doc)

signatures

In [None]:
# Split the signature rows into equally sized bands.
bands = 5 #number of bands
r = len(signatures) // bands #rows per band

def jaccard(list1, list2):
    """Return the Jaccard similarity of the distinct elements of two iterables.

    The similarity is ``|A & B| / |A | B|`` where A and B are the sets of
    elements of ``list1`` and ``list2``; duplicates and order are ignored.

    Parameters
    ----------
    list1, list2 : iterable of hashable
        The two collections to compare (lists, strings, 1-D arrays, ...).

    Returns
    -------
    float
        A value between 0.0 and 1.0. When both inputs are empty there is
        nothing to compare and 0.0 is returned instead of dividing by zero.
    """
    set1, set2 = set(list1), set(list2)
    union = set1 | set2
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)

# Sanity checks: show the signature column of the first document, then compare
# the first document with itself (expected to print 1.0 when it is not empty).
print(signatures[:,0])
print(jaccard(docs[0], docs[0]))

In [None]:
# Candidate-pair search: within every band, compare the signature values of
# each pair of documents and record the pairs whose Jaccard similarity
# exceeds the threshold.
threshold = 0.5
n_docs = signatures.shape[1]
bucket = {}            # document position -> later documents it matched
matched_pairs = set()  # (i, j) pairs already recorded

for band in range(bands):
    # Rows belonging to this band; sliced once per band, not once per pair.
    band_rows = signatures[band*r:band*r+r]
    for i in range(n_docs):
        for j in range(i + 1, n_docs):
            if (i, j) in matched_pairs:
                # Already matched in an earlier band: do not list it twice.
                continue
            if jaccard(band_rows[:, i], band_rows[:, j]) > threshold:
                # i and j are positions, hence .iloc rather than label lookup.
                print(texts.iloc[i])
                print(texts.iloc[j])
                matched_pairs.add((i, j))
                bucket.setdefault(i,[]).append(j)

# Print the result once instead of the whole dict after every match.
print(bucket)
 


#compare row by row

In [None]:
# Length of the shortest text; texts shorter than n characters produce no shingles.
min(map(len, texts))

In [None]:
# Levenshtein.ratio compares two strings, so it is applied pair by pair
# (row k of the first column against row k of the second) instead of being
# handed the two columns as a whole.
pd.Series(
    [
        Levenshtein.ratio(left, right)
        for left, right in zip(df_processed['name'], df_processed['name'])
    ],
    index=df_processed.index,
)


In [None]:
from textdistance import levenshtein

# Row-wise edit distance between two name columns.
# df_1 and df_2 are Series (one column each), so they are zipped directly:
# indexing them with ['name'] would look up a row labelled 'name', and
# Series.apply has no `axis` argument.
df_1 = df_processed['name']
df_2 = df_processed['name']
pd.Series(
    [levenshtein.distance(name_1, name_2) for name_1, name_2 in zip(df_1, df_2)],
    index=df_1.index,
)

In [None]:
# Install textdistance with the interpreter that runs this kernel.
# NOTE(review): this has to be executed before the cell that imports textdistance.
!{sys.executable} -m pip install textdistance