## **Imports required**

In [1]:
import recordlinkage
import pandas as pd
import numpy as np 
import zipfile
import os

from zipfile import ZipFile
from recordlinkage.preprocessing import clean

In [2]:
def unzipe_file_in_folder(dir_name, folder, extension, path_to_unzipe_files):
    """Extract every archive of ``dir_name`` whose name ends with ``extension``.

    Side effect: the process working directory is changed to ``dir_name`` and
    is left there. A relative ``path_to_unzipe_files`` is therefore resolved
    against ``dir_name``, not against the directory the caller started in.

    Parameters
    ----------
    dir_name : str
        Directory that holds the archives.
    folder : str
        Not used by this function; kept so existing calls keep working.
    extension : str
        File-name suffix that selects the archives (e.g. ".zip").
    path_to_unzipe_files : str
        Destination directory handed to ``ZipFile.extractall``.
    """
    zip_files = os.listdir(dir_name)
    os.chdir(dir_name)  # callers rely on this change of working directory

    for item in zip_files:
        if not item.endswith(extension):
            continue
        file_name = os.path.abspath(item)  # absolute path, resolved from dir_name
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(file_name) as zip_ref:
            zip_ref.extractall(path_to_unzipe_files)
        # The archive itself is left on disk.

In [3]:
def concate_file_valueds_frm_different_folder(dir_name, folder, extension, path_to_unzipe_files, file_to_find):
    """Unzip the archives of ``dir_name`` and stack the matching dump files.

    After extraction, every sub-directory of ``path_to_unzipe_files`` is
    scanned (one level deep) for files whose name contains ``file_to_find``.
    Each match is read as a header-less CSV and all of them are concatenated
    into a single DataFrame.

    Side effect: the process working directory ends up at
    ``path_to_unzipe_files``.

    Parameters
    ----------
    dir_name, folder, extension, path_to_unzipe_files
        Forwarded unchanged to ``unzipe_file_in_folder``.
    file_to_find : str
        Substring a file name must contain to be loaded.

    Returns
    -------
    pandas.DataFrame
        Columns ``variable_id, label, unit, min_value, max_value, alias``;
        an empty frame with those columns when no file matches.
    """
    column_names = ['variable_id', 'label', 'unit', 'min_value', 'max_value', 'alias']

    unzipe_file_in_folder(dir_name, folder, extension, path_to_unzipe_files)
    # Resolved from dir_name: the helper above already changed directory.
    os.chdir(path_to_unzipe_files)

    # Collect the frames and concatenate once instead of growing a frame
    # inside the loop.
    frames = []
    for entry in os.listdir('.'):
        if not os.path.isdir(entry):
            continue
        sub_dir = os.path.abspath(entry)
        for file_name in os.listdir(sub_dir):
            if file_to_find in file_name:
                frames.append(pd.read_csv(os.path.join(sub_dir, file_name), header=None))

    if not frames:
        # Nothing matched: assigning six names to a column-less frame would raise.
        return pd.DataFrame(columns=column_names)

    variables = pd.concat(frames, ignore_index=True)
    variables.columns = column_names
    return variables

In [4]:
def replace_by_nan(data, char):
    """Return a copy of ``data`` where every cell equal to ``char`` is NaN.

    The input frame is left untouched; columns are processed one by one.
    """
    def _blank_out(cell):
        # Only cells equal to the marker are replaced.
        if cell == char:
            return np.nan
        return cell

    result = data.copy()
    for name in result.columns:
        result[name] = result[name].apply(_blank_out)
    return result

In [31]:
def preprocessing_recordlinkage(data):
    """Return a copy of ``data`` with its text columns cleaned.

    A column is treated as text when its first value is a ``str``; such
    columns go through ``recordlinkage.preprocessing.clean`` (lowercasing,
    bracketed parts removed). Other columns are copied unchanged. The
    first-value check is kept on purpose: a column that starts with a
    non-string value (e.g. NaN) is not cleaned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean; it is not modified.

    Returns
    -------
    pandas.DataFrame
    """
    data_copy = data.copy()
    if data_copy.empty:
        # No first value to inspect.
        return data_copy

    for column in data_copy.columns:
        # Inspect the argument itself (not a notebook-level variable) and use
        # a positional lookup so a non-default index does not matter.
        first_value = data_copy[column].iloc[0]
        if isinstance(first_value, str):
            data_copy[column] = recordlinkage.preprocessing.clean(
                data_copy[column],
                lowercase=True,
                strip_accents=None,
                remove_brackets=True,
                encoding='utf-8',
                decode_error='strict',
            )
    return data_copy

## **Reading files in zip folders**

In [5]:
# Name of the dump folder to process.
folder = '2.MariaDB_PCB_Schema_DUMP_ZIP_Files_Fab=CROLFA_Eqt=END10_Day=2020-11-02_Hrs=19h17-20h22'
# Directory holding the archives; resolved against the working directory at call time.
dir_name = '../datas/usine_datas/' + folder
# File-name suffix that selects the archives to extract.
extension = ".zip"
# NOTE: this path is resolved relative to `dir_name`, because
# unzipe_file_in_folder calls os.chdir(dir_name) before extracting.
path_to_unzipe_files = '../../usine_datas_unziped/'+ folder
# Substring a file name must contain to be loaded after extraction.
file_to_find = 'PCB.Variables.dump'

In [6]:
# Extract the archives and load every matching dump file into one DataFrame.
# Side effect: the helpers call os.chdir, so the working directory is different
# after this cell; the relative paths above will not resolve the same way on a re-run.
variables = concate_file_valueds_frm_different_folder(dir_name, folder, extension, path_to_unzipe_files, file_to_find)

## **PREPROCESSING**

In [15]:
# '\\N' is the two-character string backslash + N.
# NOTE(review): presumably the NULL marker used by the dump files — confirm.
variables = replace_by_nan(variables, '\\N')

In [32]:
# Clean the columns whose first value is a string (lowercase, bracketed parts removed).
# `variables` is rebound here, so the uncleaned values are no longer reachable afterwards.
variables = preprocessing_recordlinkage(variables)

  s = s.str.replace(r'(\[.*?\]|\(.*?\)|\{.*?\})', '')
  s = s.str.replace(replace_by_none, '')
  s = s.str.replace(replace_by_whitespace, ' ')
  s = s.str.replace(r'\s\s+', ' ')


Unnamed: 0,variable_id,label,unit,min_value,max_value,alias
0,55,ch step number,,,,stepid
1,59,ch step number,,,,stepid
2,61,ch step number,,,,stepid
3,62,ch step number,,,,stepid
4,63,ch step number,,,,stepid
...,...,...,...,...,...,...
2044,338,cooldown chamber pressure 01,,,,cooldown chamber pressure 01
2045,390,pvd dc current 08,,,,current pvd
2046,395,pvd target voltage sense 08,,,,voltage sense pvd target
2047,396,pvd target voltage sense 07,,,,voltage sense pvd target


**DEDUPLICATION**

In [33]:
# Build the candidate pairs for deduplication: blocking on 'alias' pairs only
# records of `variables` that share the same alias value.
indexer = recordlinkage.Index()
indexer.block('alias')
candidate_links = indexer.index(variables)

In [34]:
# Compare the 'label' values of each candidate pair with the Levenshtein
# string method; the resulting feature column is named 'label'.
compare_cl = recordlinkage.Compare()

compare_cl.string('label', 'label', label='label', method='levenshtein')
features = compare_cl.compute(candidate_links, variables)

In [39]:
def selected_pairs_values(list_tuple, datas):
    """Return the rows of ``datas`` referenced by each pair, in pair order.

    For every pair ``(a, b)`` the row(s) labelled ``a`` come first, then the
    row(s) labelled ``b``; a record appears once per pair it belongs to.
    Labels that are absent from ``datas.index`` are skipped silently.

    Parameters
    ----------
    list_tuple : pandas.MultiIndex or iterable of pairs
        Pairs of index labels (anything exposing ``tolist()``, or a plain
        iterable of 2-item sequences).
    datas : pandas.DataFrame
        Frame the labels refer to.

    Returns
    -------
    pandas.DataFrame
        Selected rows with the columns (and dtypes) of ``datas``.
    """
    pairs = list_tuple.tolist() if hasattr(list_tuple, 'tolist') else list(list_tuple)

    known_labels = set(datas.index)
    ordered_labels = [
        label
        for pair in pairs
        for label in (pair[0], pair[1])
        if label in known_labels
    ]
    # One label-based lookup replaces a full index scan plus a concat per pair.
    return datas.loc[ordered_labels]

In [40]:
# Display the records behind every compared pair (features.index holds the pairs).
selected_pairs_values(features.index, variables)

Unnamed: 0,variable_id,label,unit,min_value,max_value,alias
1,59,CH STEP NUMBER (4),,,,StepID
0,55,CH STEP NUMBER (0),,,,StepID
2,61,CH STEP NUMBER (6),,,,StepID
0,55,CH STEP NUMBER (0),,,,StepID
2,61,CH STEP NUMBER (6),,,,StepID
...,...,...,...,...,...,...
1785,402,chamber Bakeout Current 07,,,,Current_Bakeout_Chamber
1995,400,chamber Bakeout Current 09,,,,Current_Bakeout_Chamber
1820,402,chamber Bakeout Current 07,,,,Current_Bakeout_Chamber
1995,400,chamber Bakeout Current 09,,,,Current_Bakeout_Chamber


### **CLASSIFICATION**

In [43]:
# K-means classifier from recordlinkage; no labelled pairs are supplied to it in this notebook.
classifier = recordlinkage.KMeansClassifier()

In [44]:
# Fit on the comparison features and predict in one step.
# The returned pairs are only displayed, not stored in a variable.
classifier.fit_predict(features)

MultiIndex([(   1,    0),
            (   2,    0),
            (   2,    1),
            (   3,    0),
            (   3,    1),
            (   3,    2),
            (   4,    0),
            (   4,    1),
            (   4,    2),
            (   4,    3),
            ...
            (1995, 1417),
            (1995, 1477),
            (1995, 1537),
            (1995, 1598),
            (1995, 1644),
            (1995, 1704),
            (1995, 1750),
            (1995, 1785),
            (1995, 1820),
            (1995, 1881)],
           length=115633)

**DROP DUPLICATES**

In [None]:
#features_multiindex_list = list(map(list, zip(*features.index.to_list())))
#features_first_index = set(features_multiindex_list[0])
#features_second_index = set(features_multiindex_list[1])

#original_index = set(features_first_index) - set(features_second_index)

In [None]:
#variables.iloc[variables.index.isin(original_index)]