In [97]:
import itertools
import re
import time
from pathlib import Path
from typing import Tuple

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

In [98]:
# Root directory of all datasets, relative to the notebook's working directory
data_folder = Path('../data/')
# Sub-folder joined onto data_folder when the isot files are loaded
isot_folder = Path('isot/')
# Sub-folder joined onto data_folder when the kaggle file is loaded
kaggle_folder = Path('kaggle/')

In [99]:
def load_isot_dataset(folder: Path, random_state=None):
    """Load the isot dataset from ``True.csv`` and ``Fake.csv``.

    Each file is read from ``folder`` and tagged with an integer ``label``
    (0 for the rows of ``True.csv``, 1 for the rows of ``Fake.csv``). The two
    frames are concatenated, tagged with ``dataset == 'isot'`` and shuffled.

    :param folder: folder to look for isot files
    :type folder: pathlib.Path
    :param random_state: seed forwarded to ``DataFrame.sample`` so the shuffle
        can be reproduced; ``None`` (default) keeps the shuffle unseeded
    :type random_state: int, optional

    :rtype: pd.DataFrame
    :return: shuffled isot dataset with a fresh 0..n-1 index"""
    folder = Path(folder)

    # The position of each file in the list doubles as its label
    frames = [
        pd.read_csv(folder / csv_file, header=0).assign(label=label)
        for label, csv_file in enumerate(['True.csv', 'Fake.csv'])
    ]

    isot_df = pd.concat(frames).assign(dataset='isot')

    # frac=1 keeps every row; only the order changes
    shuffled = isot_df.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

In [100]:
def load_kaggle_dataset(folder: Path):
    """Read the kaggle training file and tag where it came from.

    :param folder: folder that contains ``train.csv``
    :type folder: pathlib.Path

    :rtype: pd.DataFrame
    :return: content of ``train.csv`` indexed by its ``id`` column, with a
        ``dataset`` column set to ``'kaggle'``"""
    train_csv = folder / Path('train.csv')
    frame = pd.read_csv(train_csv, header=0, index_col='id')
    return frame.assign(dataset='kaggle')

In [101]:
def preprocess_dataset(df: pd.DataFrame, min_length: int = 10):
    """Apply preprocessing operations to dataset.

    Line breaks in the ``text`` column are replaced by spaces, then rows whose
    text is not longer than ``min_length`` characters are dropped. The input
    frame is left untouched.

    :param df: dataset with a string ``text`` column
    :type df: pd.DataFrame
    :param min_length: rows are kept only when their text is strictly longer
        than this many characters
    :type min_length: int, optional

    :rtype: pd.DataFrame
    :return: preprocessed copy of the dataset"""
    # Series.str.replace returns a new Series; the result has to be stored,
    # otherwise the line breaks are silently kept.
    flattened = df['text'].str.replace('\n', ' ', regex=False)
    cleaned = df.assign(text=flattened)

    # Filter articles too short. .str.len() gives NaN for missing text, which
    # compares False and is therefore dropped instead of raising.
    return cleaned[cleaned['text'].str.len() > min_length]

In [102]:
def load_dataset(dataset: int = 0):
    """Load the specified dataset.

    The ``text`` and ``title`` columns are cast to ``str`` and the result is
    passed through ``preprocess_dataset`` before being returned.

    :param dataset: 0 loads isot + kaggle; 1 loads isot; 2 loads kaggle
    :type dataset: int, optional

    :raises ValueError: if ``dataset`` is not 0, 1 or 2

    :rtype: pd.DataFrame
    :return: dataset
    """
    # Reject unknown selectors up front; otherwise `df` would never be bound
    # and the function would fail later with an unrelated UnboundLocalError.
    if dataset not in (0, 1, 2):
        raise ValueError(
            'Unknown dataset selector {!r}: expected 0 (isot + kaggle), 1 (isot) or 2 (kaggle)'.format(dataset)
        )

    if dataset == 0:
        df = pd.concat([
            load_isot_dataset(data_folder / isot_folder),
            load_kaggle_dataset(data_folder / kaggle_folder),
        ])
        # Interleave the two sources and discard their original indices
        df = df.sample(frac=1).reset_index(drop=True)
    elif dataset == 1:
        df = load_isot_dataset(data_folder / isot_folder)
    else:
        df = load_kaggle_dataset(data_folder / kaggle_folder)

    # Missing values become the string 'nan' so later string operations work
    df['text'] = df['text'].astype(str)
    df['title'] = df['title'].astype(str)

    return preprocess_dataset(df)

In [167]:
# load_dataset() defaults to dataset=0 (isot + kaggle); keep only the first 300 rows
df = load_dataset().head(300)

In [177]:
import numpy as np
# Toy fixture for remove_duplicates: groups of (near-)identical 'text' values
# stored under a non-sequential index.
# NOTE(review): np.array over rows mixing str and int presumably coerces the
# values of column 'b' to strings - confirm if a numeric 'b' is expected.
df_test = pd.DataFrame(np.array([
    ['this is a test hh', 1],
    ['this is a test h', 2],
    ['this is a test h', 3],
    ['this is a test h', 4],

    ['this is something different', 5],
    ['this is something different', 6],
    ['this is something different', 7],
    ['this is something different', 8],
    ['afa', 9]

]),

                   columns=['text', 'b'],
                   index=[1,7,2,56,0,65,9,11,3])

df_test

Unnamed: 0,text,b
1,this is a test hh,1
7,this is a test h,2
2,this is a test h,3
56,this is a test h,4
0,this is something different,5
65,this is something different,6
9,this is something different,7
11,this is something different,8
3,afa,9


In [170]:
# Scratch check of index alignment: column 'a' is assigned from a Series built
# out of the last two rows only, then inspected for missing values.
# NOTE(review): this adds column 'a' to df_test in place, and the output shown
# below uses a 0..8 index, so it was presumably produced against an earlier
# df_test - confirm whether this cell is still needed.
df_test['a'] = pd.Series(df_test.tail(2)['b'])
df_test['a'].isnull()

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7    False
8    False
Name: a, dtype: bool

In [178]:
def remove_duplicates(df: pd.DataFrame, field: str, threshold: int = 90, scorer=None):
    """Remove near-duplicate rows from a dataframe.

    Rows are visited in order. A row is dropped when its ``field`` value
    scores ``threshold`` or more against the ``field`` value of any earlier
    row, whether or not that earlier row was itself kept. The first row of
    every group of similar values therefore survives.

    :param df: dataframe
    :type df: pd.DataFrame
    :param field: name of the column whose values are compared
    :type field: str
    :param threshold: similarity score from which two values count as duplicates
    :type threshold: int
    :param scorer: callable ``scorer(value, earlier_value)`` returning a
        similarity score; defaults to ``fuzz.ratio``
    :type scorer: callable, optional

    :rtype: pd.DataFrame
    :return: rows of ``df`` that are not duplicates of an earlier row"""
    if scorer is None:
        scorer = fuzz.ratio

    # Work on plain values and positions: this is independent of the index
    # labels (which may repeat) and of whether `field` is a valid identifier.
    values = df[field].tolist()

    kept_positions = []
    for position, value in enumerate(values):
        is_duplicate = any(
            scorer(value, values[earlier]) >= threshold
            for earlier in range(position)
        )
        if not is_duplicate:
            kept_positions.append(position)

    # iloc with an empty list still returns a frame with every column
    return df.iloc[kept_positions]
        
# Apply the de-duplication to the toy fixture using the default threshold;
# per the output shown below, one row of each group of similar texts remains.
clean_df = remove_duplicates(df_test, 'text')
clean_df

  from ipykernel import kernelapp as app


Unnamed: 0,text,b
1,this is a test hh,1
0,this is something different,5
3,afa,9


In [None]:
def build_vectorizer(
    clean: pd.Series,
    analyzer: str = 'char', 
    ngram_range: Tuple[int, int] = (1, 4), 
    n_neighbors: int = 1, 
    **kwargs
    ) -> Tuple:
    """Fit a TF-IDF vectorizer and a cosine nearest-neighbour index on ``clean``.

    Note: a later cell of this notebook defines ``build_vectorizer`` again.

    :param clean: reference values; cast to unicode strings before fitting
    :param analyzer: forwarded to ``TfidfVectorizer``
    :param ngram_range: forwarded to ``TfidfVectorizer``
    :param n_neighbors: forwarded to ``NearestNeighbors``
    :param kwargs: extra keyword arguments for ``TfidfVectorizer``
    :return: ``(vectorizer, nbrs)``, both already fitted
    """
    corpus = clean.values.astype('U')

    tfidf = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, **kwargs)
    tfidf_matrix = tfidf.fit_transform(corpus)

    # The neighbour index is built on the TF-IDF rows of the reference values
    neighbour_index = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    neighbour_index.fit(tfidf_matrix)
    return tfidf, neighbour_index

#clean -> 'text' column

In [None]:
def find_matches_fuzzy(
    row,
    match_candidates,
    limit = 5
    ):
    """Score ``row`` against ``match_candidates`` with ``fuzz.token_sort_ratio``.

    Note: a later cell of this notebook defines ``find_matches_fuzzy`` again.

    :param row: value to look up
    :param match_candidates: iterable of candidate values
    :param limit: forwarded to ``process.extract`` as ``limit``
    :return: list of ``(row, match[0], match[1])`` tuples, one per match
        returned by ``process.extract``
    """
    # Candidates are handed over as a {position: value} mapping
    indexed_candidates = dict(enumerate(match_candidates))
    matches = process.extract(
        row,
        indexed_candidates,
        scorer=fuzz.token_sort_ratio,
        limit=limit,
    )

    triples = []
    for first, second, *_rest in matches:
        triples.append((row, first, second))
    return triples

In [None]:
def fuzzy_tf_idf(
        df: pd.DataFrame,
        column: str,
        clean: pd.Series,
        mapping_df: pd.DataFrame,
        col: str,
        analyzer: str = 'char',
        ngram_range: Tuple[int, int] = (1, 3)
    ) -> pd.DataFrame:
    """Match the values of ``df[column]`` against the reference values in ``clean``.

    Note: a later cell of this notebook defines ``fuzzy_tf_idf`` again.

    :param df: frame holding the messy values
    :param column: name of the messy column in ``df``; also the name of the
        first column of the result
    :param clean: reference (master) values
    :param mapping_df: accepted for interface compatibility; not used
    :param col: name given to the matched-value column of the result
    :param analyzer: TF-IDF analyzer, forwarded to ``fuzzy_nn_match``
    :param ngram_range: TF-IDF n-gram range, forwarded to ``fuzzy_nn_match``
    :return: whatever ``fuzzy_nn_match`` returns for one neighbour per value
    """
    # Missing master values must not become match candidates
    clean = clean.dropna().drop_duplicates().reset_index(drop = True)

    messy = (
        df[column]
        .drop_duplicates()
        .dropna()
        .reset_index(drop = True)
        .astype(str)
        .apply(preprocess_string)
    )

    # analyzer / ngram_range used to be accepted but silently ignored
    return fuzzy_nn_match(
        messy = messy,
        clean = clean,
        column = column,
        col = col,
        n_neighbors = 1,
        analyzer = analyzer,
        ngram_range = ngram_range,
    )

In [20]:
# Import module for data manipulation
import pandas as pd
# Import module for linear algebra
import numpy as np
# Import module for Fuzzy string matching
from fuzzywuzzy import fuzz, process
# Import module for regex
import re
# Import module for iteration
import itertools
# Import module for function development
from typing import Union, List, Tuple
# Import module for TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# Import module for cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
# Import module for KNN
from sklearn.neighbors import NearestNeighbors

# String pre-processing
def preprocess_string(s):
    """Join adjacent single-character tokens.

    A separator made of whitespace and/or one ``&`` standing between two
    one-character words is removed (``'H & M'`` becomes ``'HM'``); longer
    words are left as they are.
    """
    single_char_gap = re.compile(r'(?<=\b\w)\s*[ &]\s*(?=\w\b)')
    return single_char_gap.sub('', s)

# String matching - TF-IDF
def build_vectorizer(
    clean: pd.Series,
    analyzer: str = 'char', 
    ngram_range: Tuple[int, int] = (1, 4), 
    n_neighbors: int = 1, 
    **kwargs
    ) -> Tuple:
    """Fit a TF-IDF vectorizer and a cosine nearest-neighbour index on ``clean``.

    :param clean: reference values; cast to unicode strings before fitting
    :param analyzer: forwarded to ``TfidfVectorizer``
    :param ngram_range: forwarded to ``TfidfVectorizer``
    :param n_neighbors: forwarded to ``NearestNeighbors``
    :param kwargs: extra keyword arguments for ``TfidfVectorizer``
    :return: ``(vectorizer, nbrs)``, both already fitted
    """
    corpus = clean.values.astype('U')

    tfidf = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, **kwargs)
    tfidf_matrix = tfidf.fit_transform(corpus)

    # The neighbour index is built on the TF-IDF rows of the reference values
    neighbour_index = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    neighbour_index.fit(tfidf_matrix)
    return tfidf, neighbour_index

# String matching - KNN
def tfidf_nn(
    messy, 
    clean, 
    n_neighbors = 1, 
    **kwargs
    ):
    """Find, for every messy value, its nearest reference values in TF-IDF space.

    :param messy: values to look up
    :param clean: reference values the vectorizer and neighbour index are fitted on
    :param n_neighbors: number of neighbours requested per messy value; capped
        at ``len(clean)``
    :param kwargs: forwarded to ``build_vectorizer``
    :return: ``(nearest_values, distances)`` where ``nearest_values`` holds the
        entries of ``clean`` selected by the neighbour indices and
        ``distances`` is the matching array returned by ``kneighbors``
    """
    # kneighbors() raises when asked for more neighbours than fitted samples,
    # which happens for small reference lists with a large n_neighbors.
    n_neighbors = min(n_neighbors, len(clean))

    # Fit clean data and transform messy data
    vectorizer, nbrs = build_vectorizer(clean, n_neighbors = n_neighbors, **kwargs)
    input_vec = vectorizer.transform(messy)

    # Determine best possible matches
    distances, indices = nbrs.kneighbors(input_vec, n_neighbors = n_neighbors)
    nearest_values = np.array(clean)[indices]
    return nearest_values, distances

# String matching - match fuzzy
def find_matches_fuzzy(
    row, 
    match_candidates, 
    limit = 5
    ):
    """Score ``row`` against ``match_candidates`` with ``fuzz.token_sort_ratio``.

    :param row: value to look up
    :param match_candidates: iterable of candidate values
    :param limit: forwarded to ``process.extract`` as ``limit``
    :return: list of ``(row, match[0], match[1])`` tuples, one per match
        returned by ``process.extract``
    """
    # Candidates are handed over as a {position: value} mapping
    indexed_candidates = dict(enumerate(match_candidates))
    matches = process.extract(
        row,
        indexed_candidates,
        scorer=fuzz.token_sort_ratio,
        limit=limit,
    )

    triples = []
    for first, second, *_rest in matches:
        triples.append((row, first, second))
    return triples

# String matching - TF-IDF nearest neighbours re-scored with fuzzy matching
def fuzzy_nn_match(
    messy,
    clean,
    column,
    col,
    n_neighbors = 100,
    limit = 5, **kwargs):
    """Shortlist candidates with ``tfidf_nn``, then score them with ``find_matches_fuzzy``.

    :param messy: values to look up
    :param clean: reference values
    :param column: name of the first result column (the looked-up value)
    :param col: name of the second result column (the matched value)
    :param n_neighbors: shortlist size passed to ``tfidf_nn``
    :param limit: passed to ``find_matches_fuzzy``
    :param kwargs: forwarded to ``tfidf_nn``
    :return: DataFrame with columns ``[column, col, 'Ratio']``
    """
    # The distances returned alongside the shortlist are not needed here
    candidates_per_row, _distances = tfidf_nn(messy, clean, n_neighbors, **kwargs)

    scored = []
    for position, query in enumerate(messy):
        scored.extend(find_matches_fuzzy(query, candidates_per_row[position], limit))

    return pd.DataFrame(scored, columns = [column, col, 'Ratio'])

# String matching - entry point combining TF-IDF shortlisting and fuzzy scoring
def fuzzy_tf_idf(
    df: pd.DataFrame,
    column: str,
    clean: pd.Series,
    mapping_df: pd.DataFrame,
    col: str,
    analyzer: str = 'char',
    ngram_range: Tuple[int, int] = (1, 3)
    ) -> pd.DataFrame:
    """Match the values of ``df[column]`` against the reference values in ``clean``.

    :param df: frame holding the messy values
    :param column: name of the messy column in ``df``; also the name of the
        first column of the result
    :param clean: reference (master) values
    :param mapping_df: accepted for interface compatibility; not used
    :param col: name given to the matched-value column of the result
    :param analyzer: TF-IDF analyzer, forwarded to ``fuzzy_nn_match``
    :param ngram_range: TF-IDF n-gram range, forwarded to ``fuzzy_nn_match``
    :return: DataFrame with columns ``[column, col, 'Ratio']`` holding one
        nearest reference value per distinct messy value
    """
    # Missing master values must not become match candidates
    clean = clean.dropna().drop_duplicates().reset_index(drop = True)

    messy = (
        df[column]
        .drop_duplicates()
        .dropna()
        .reset_index(drop = True)
        .astype(str)
        .apply(preprocess_string)
    )

    # analyzer / ngram_range used to be accepted but silently ignored
    return fuzzy_nn_match(
        messy = messy,
        clean = clean,
        column = column,
        col = col,
        n_neighbors = 1,
        analyzer = analyzer,
        ngram_range = ngram_range,
    )

In [None]:
# Time the fuzzy matching run. perf_counter() is a monotonic clock meant for
# measuring elapsed time, unlike the wall clock returned by time.time().
# NOTE(review): no visible cell creates the 'Expedia' / 'Booking.com' columns
# on df - confirm which frame this cell is meant to run against.
start = time.perf_counter()
df_result = df.pipe(
    fuzzy_tf_idf,                # function applied to the messy data
    column = 'Expedia',          # messy column in data
    clean = df['Booking.com'],   # master data (list)
    mapping_df = df,             # master data
    col = 'Result',              # can be customized
)
end = time.perf_counter()

# Print the computation time
print('Fuzzy string matching in {} seconds'.format(end - start))

# View the result of fuzzy string matching
df_result.head()