# Preprocessing

This notebook prepares the input datasets for matching: it upper-cases all fields and cleans the company names (removing punctuation and repeated spaces).

### The input datasets should be in the following format:
- Company name (string)
- Marketplace (string)
- Country (string)
- State (string)
- City (string)
- Zip Code (string)
- Street (string)
- URL (string)
- Industry (SIC)

In [None]:
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import sys
import numpy as np
import string
import collections
from textblob import TextBlob
import Levenshtein
import binascii

import matplotlib.pyplot as plt

In [None]:
def df_import(path="~/Dropbox/Botva/TUM/Master_Thesis/object-identification/datasets/processed_files/france_rna_processed.csv",
              sep=';', encoding='latin-1'):
    """Load an input dataset from a delimited text file into a DataFrame.

    The defaults match the processed French RNA file: ';' as delimiter and
    latin-1 encoding. Another file (for example one of the raw RNA exports)
    can be loaded by passing its location as ``path``.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.
    sep : str
        Field delimiter.
    encoding : str
        Text encoding of the file.

    Returns
    -------
    pandas.DataFrame
        The parsed file. Malformed lines are skipped (with a warning) rather
        than aborting the import.
    """
    try:
        # `on_bad_lines` replaces `error_bad_lines`, which newer pandas
        # releases no longer accept.
        df = pd.read_csv(path, encoding=encoding, sep=sep, on_bad_lines='warn')
    except TypeError:
        # Older pandas releases do not know `on_bad_lines`; fall back to the
        # legacy keyword with the same skip-and-warn behaviour.
        df = pd.read_csv(path, encoding=encoding, sep=sep, error_bad_lines=False)
    print(df)
    print(df.dtypes)
    return df

In [None]:
def df_prepare(df):
    """Normalise a raw dataset for matching.

    Every column is converted to upper-case strings (missing values are
    stringified too, e.g. NaN typically becomes 'NAN'). The 'name' column is
    then cleaned: dots are removed, every run of characters outside
    [0-9a-zA-Z] is replaced by a single space, repeated spaces are collapsed
    and surrounding whitespace is stripped. The cleaned name is also split
    into tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain a 'name' column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the normalised columns
        plus 'name_split', the list of space-separated tokens of 'name'.

    Raises
    ------
    KeyError
        If ``df`` has no 'name' column.
    """
    df = df.apply(lambda col: col.astype(str).str.upper())

    # Dots are dropped (not replaced by a space) so "S.A.R.L" becomes "SARL".
    name = df['name'].str.replace('.', '', regex=False)
    # regex=True must be explicit: newer pandas treats the pattern as a
    # literal string by default, which would silently skip the clean-up.
    # NOTE: accented letters are outside [0-9a-zA-Z] and are replaced too.
    name = name.str.replace('[^0-9a-zA-Z]+', ' ', regex=True)
    name = name.str.replace(' +', ' ', regex=True)
    # Strip the spaces left where punctuation started or ended the name, so
    # the token list below contains no empty strings for non-empty names.
    name = name.str.strip()

    df['name'] = name
    df['name_split'] = df['name'].str.split(' ')
    print(df)
    print(df.dtypes)

    return df

In [None]:
def frequent_words(df_processed, top_n=100):
    """Show a word cloud of the names and return the most frequent words.

    Each value of the 'name' column is split on whitespace into words,
    surrounding punctuation is stripped from every word, and stopwords
    (the wordcloud defaults plus a list of French function words) are
    excluded from the counts.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Frame with a 'name' column of strings.
    top_n : int
        Number of most frequent words to return.

    Returns
    -------
    dict
        Mapping word -> count for the ``top_n`` most frequent non-stopword
        words, most frequent first. Empty if there is nothing to count (in
        that case no figure is drawn).
    """
    # Tokenise whole words. Iterating over the name string itself would
    # yield single characters, so the counts would be letter frequencies.
    words = []
    for name in df_processed['name']:
        for token in str(name).split():
            token = token.strip(string.punctuation)
            if token:
                words.append(token)

    stopwords = set(STOPWORDS)
    # NOTE(review): "SOU" may have been meant as "SOUS" - confirm.
    stopwords.update(["LE","DE","LA","ET","DES","DU","LES","EN","ET","A","POUR","SUR","SOU","S","D","L"])
    # Compare case-insensitively so the lowercase default stopwords also
    # apply to upper-cased names.
    stopwords_upper = {word.upper() for word in stopwords}

    filtered_words = [word for word in words if word.upper() not in stopwords_upper]
    if not filtered_words:
        # Nothing to plot or count; avoid asking WordCloud for an empty cloud.
        return {}

    final_text_words = " ".join(words)
    wordcloud_names = WordCloud(stopwords=stopwords, background_color="white", max_font_size=50, max_words=100).generate(final_text_words)

    # Render the cloud without axes.
    plt.figure(figsize = (15,15))
    plt.imshow(wordcloud_names, interpolation='bilinear')
    plt.axis("off")
    plt.show()

    counted_words = collections.Counter(filtered_words)
    word_count = dict(counted_words.most_common(top_n))

    for word, count in word_count.items():
        print('Word: {0}, count: {1}'.format(word, count))

    return word_count

In [None]:
# Load the input dataset with the loader defined above.
df = df_import()

In [None]:
# Preprocess only the first 10000 rows of the loaded dataset.
df_processed = df_prepare(df.head(10000))

In [None]:
# Build the shingle/document incidence matrix for the cleaned names.
#
# Result:
#   shingles_list - unique character n-grams, in order of first appearance
#   hashes_array  - array of shape (len(shingles_list), len(texts)) where
#                   entry [s, t] is 1 if shingle s occurs in text t, else 0
n = 3  # shingle length in characters

texts = df_processed['name']

# First pass: collect the unique shingles and give each one a row index.
# A dict gives constant-time lookups; searching a list and re-stacking the
# array for every new shingle is quadratic in the number of shingles.
shingles_list = []
shingle_index = {}
for text in texts:
    # Texts shorter than n yield no shingles (the range is empty).
    for start in range(len(text) - n + 1):
        shingle = text[start:start + n]
        if shingle not in shingle_index:
            shingle_index[shingle] = len(shingles_list)
            shingles_list.append(shingle)

# Second pass: allocate the matrix once at its final size and fill it.
# Sizing it up front also avoids a spurious all-zero row at the end.
hashes_array = np.zeros((len(shingles_list), len(texts)), dtype=int)
for text_num, text in enumerate(texts):
    for start in range(len(text) - n + 1):
        hashes_array[shingle_index[text[start:start + n]], text_num] = 1

hashes_array




In [None]:
# Inspect the unique shingles collected by the shingling cell.
shingles_list

Plan: first, collect the set of unique shingles;
then, build the array, looking each shingle up in that set.




In [143]:
# Pairwise Levenshtein similarity between every name in texts_1 (rows) and
# every name in texts_2 (columns).
texts_1 = df_processed['name']
texts_2 = df_processed['name']
lvn_array = np.zeros((len(texts_1),len(texts_2)))
print(lvn_array)

# Fill the matrix one row at a time: row i holds the ratios of the i-th
# name of texts_1 against all names of texts_2.
for row_idx, text_1 in enumerate(texts_1):
    lvn_array[row_idx, :] = [Levenshtein.ratio(text_1, text_2) for text_2 in texts_2]
lvn_array

array([[1.        , 0.48275862, 0.3       , ..., 0.29787234, 0.17391304,
        0.38095238],
       [0.48275862, 1.        , 0.27586207, ..., 0.39285714, 0.1875    ,
        0.33333333],
       [0.3       , 0.27586207, 1.        , ..., 0.29787234, 0.43478261,
        0.19047619],
       ...,
       [0.29787234, 0.39285714, 0.29787234, ..., 1.        , 0.2       ,
        0.29166667],
       [0.17391304, 0.1875    , 0.43478261, ..., 0.2       , 1.        ,
        0.16666667],
       [0.38095238, 0.33333333, 0.19047619, ..., 0.29166667, 0.16666667,
        1.        ]])

In [None]:
# NOTE(review): Levenshtein.ratio is called here with two whole pandas Series,
# whereas the loop-based cell above passes one pair of strings at a time.
# This call likely raises a TypeError - confirm, then remove or replace it.
Levenshtein.ratio(df_processed['name'],df_processed['name'])


In [None]:
# NOTE: textdistance has to be installed before this import works (see the
# pip install cell at the end of the notebook).
from textdistance import levenshtein

df_1 = df_processed['name']
df_2 = df_processed['name']

# Element-wise Levenshtein distance between the i-th name of df_1 and the
# i-th name of df_2. df_1 and df_2 are Series of names, so they are paired
# up directly: Series.apply has no `axis` argument and the Series have no
# 'name' key to index by.
pd.Series(
    [levenshtein.distance(name_1, name_2) for name_1, name_2 in zip(df_1, df_2)],
    index=df_1.index,
)

In [None]:
# Install textdistance with the Python interpreter that runs this kernel.
# NOTE(review): an earlier cell already imports textdistance, so on a fresh
# kernel this cell has to be run before that import.
!{sys.executable} -m pip install textdistance