In [63]:
from pathlib import Path

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.model_selection import train_test_split

In [64]:
# Root directory holding the raw datasets, relative to the working directory.
data_folder = Path('..') / 'data'
# Per-dataset sub-directories (relative paths, meant to be joined onto a root).
isot_folder = Path('isot')
kaggle_folder = Path('kaggle')

In [65]:
def load_isot_dataset(folder: Path, random_state=None):
    """Load isot dataset.

    Reads ``True.csv`` and ``Fake.csv`` from ``folder``, tags every row with a
    ``label`` column (0 for rows from ``True.csv``, 1 for rows from
    ``Fake.csv``) and a ``dataset`` column set to ``'isot'``, then shuffles
    the rows.

    :param folder: folder to look for isot files
    :type folder: pathlib.Path
    :param random_state: seed forwarded to ``DataFrame.sample`` so the shuffle
        can be reproduced; ``None`` (default) keeps the shuffle unseeded
    :type random_state: int, optional

    :rtype: pd.DataFrame
    :return: shuffled isot dataset with a fresh 0..n-1 index"""
    frames = []

    # The position in the list is the label: True.csv -> 0, Fake.csv -> 1.
    for label, csv_file in enumerate(['True.csv', 'Fake.csv']):
        frame = pd.read_csv(folder / csv_file, header=0)
        frame['label'] = label
        frames.append(frame)

    isot_df_concat = pd.concat(frames, ignore_index=True)
    isot_df_concat['dataset'] = 'isot'

    # Shuffle so that true and fake articles are interleaved.
    shuffled = isot_df_concat.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

In [66]:
def load_kaggle_dataset(folder: Path):
    """Load kaggle dataset.

    Reads ``train.csv`` from ``folder`` using its ``id`` column as the index
    and appends a ``dataset`` column set to ``'kaggle'``.

    :param folder: folder to look for kaggle files
    :type folder: pathlib.Path

    :rtype: pd.DataFrame
    :return: kaggle dataset"""
    train_csv = folder.joinpath('train.csv')
    raw = pd.read_csv(train_csv, header=0, index_col='id')
    # Tag the origin of every row.
    return raw.assign(dataset='kaggle')

In [67]:
def preprocess_dataset(df: pd.DataFrame):
    """Apply preprocessing operations to dataset.

    Replaces line breaks in the ``text`` column with spaces and drops the
    articles whose text is 10 characters long or shorter. The input frame is
    not modified; the original index labels of the surviving rows are kept.

    :param df: dataset with a string ``text`` column
    :type df: pd.DataFrame

    :rtype: pd.DataFrame
    :return: preprocessed copy of the dataset"""
    # Remove break lines. ``str.replace`` returns a new Series, so the result
    # has to be stored; ``regex=False`` makes the literal replacement explicit.
    cleaned = df.assign(text=df['text'].str.replace('\n', ' ', regex=False))

    # Filter articles too short
    text_length = cleaned['text'].str.len()
    return cleaned[text_length > 10]

In [68]:
def load_dataset(dataset: int = 0):
    """Load the specified dataset.

    The ``text`` and ``title`` columns are cast to ``str`` and the frame is
    passed through ``preprocess_dataset`` before being returned.

    :param dataset: 0 loads isot + kaggle; 1 loads isot; 2 loads kaggle
    :type dataset: int, optional

    :raises ValueError: if ``dataset`` is not 0, 1 or 2

    :rtype: pd.DataFrame
    :return: dataset
    """
    # Fail early with a clear message; previously an unknown selector fell
    # through every branch and crashed later on the unbound ``df``.
    if dataset not in (0, 1, 2):
        raise ValueError(f"dataset must be 0, 1 or 2, got {dataset!r}")

    if dataset == 0:
        df = pd.concat([
            load_isot_dataset(data_folder / isot_folder),
            load_kaggle_dataset(data_folder / kaggle_folder),
        ])
        # Mix the two sources and replace the clashing indexes with 0..n-1.
        df = df.sample(frac=1).reset_index(drop=True)
    elif dataset == 1:
        df = load_isot_dataset(data_folder / isot_folder)
    else:
        df = load_kaggle_dataset(data_folder / kaggle_folder)

    # Missing values become the literal string 'nan' after this cast.
    df['text'] = df['text'].astype(str)
    df['title'] = df['title'].astype(str)

    df = preprocess_dataset(df)

    return df

In [74]:
# Selector 0 loads the combined isot + kaggle corpus.
df = load_dataset(dataset=0)

In [70]:
import time

#g_start_time = time.time()

def remove_duplicates(df: pd.DataFrame, field: str, threshold: int = 90) -> pd.DataFrame:
    """Remove near-duplicate rows from dataframe.

    Every row is compared, via ``fuzz.ratio`` on the ``field`` column, with
    all the rows that come after it. For each row the highest score obtained
    against any earlier row is retained; rows whose highest score is below
    ``threshold``, or that were never scored (NaN, e.g. the first row), are
    returned.

    As a side effect the loop counter and the seconds spent on each row are
    printed.

    :param df: dataframe
    :type df: pd.DataFrame
    :param field: name of the column whose values are compared
    :type field: str
    :param threshold: similarity threshold; rows scoring at or above it
        against an earlier row are dropped
    :type threshold: int

    :rtype: pd.DataFrame
    :return: rows of ``df`` not considered duplicates of an earlier row"""
    # Best score seen so far for each row; starts as all-NaN.
    similarity = pd.Series(index=df.index, dtype=float)
    for i, row in enumerate(df.itertuples()):
        start_time = time.time()
        print(i)
        # Score the current row against every row positioned after it.
        # NOTE(review): getattr(row, field) presumably requires `field` to be a
        # valid attribute name on the itertuples namedtuple - confirm.
        current_similarity =  df.tail(df.shape[0]-(i+1))[field].apply(lambda val: fuzz.ratio(val, getattr(row, field)))
        # Keep, per row, the maximum of the previous best and the new scores.
        # NOTE(review): the axis=1 concat aligns on index labels, so this
        # presumably expects df.index to be unique - confirm.
        similarity = pd.concat([similarity, current_similarity], axis=1).max(axis=1)
        print(time.time() - start_time)

    # NaN means the row was never scored against an earlier one, so keep it.
    return df[(similarity < threshold) | (similarity.isnull())]
        

#clean_df
#clean_df = remove_duplicates(df, 'text')
#print(f"total time: {time.time() - g_start_time}")



In [71]:
from typing import Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

def build_vectorizer(
    clean: pd.Series,
    analyzer: str = 'char', 
    ngram_range: Tuple[int, int] = (1, 4), 
    n_neighbors: int = 1, 
    **kwargs
    ) -> Tuple:
    """Fit a TF-IDF vectorizer and a cosine nearest-neighbour index on ``clean``.

    :param clean: reference values; cast to unicode strings before fitting
    :type clean: pd.Series
    :param analyzer: forwarded to ``TfidfVectorizer``
    :type analyzer: str, optional
    :param ngram_range: forwarded to ``TfidfVectorizer``
    :type ngram_range: Tuple[int, int], optional
    :param n_neighbors: forwarded to ``NearestNeighbors``
    :type n_neighbors: int, optional
    :param kwargs: extra keyword arguments forwarded to ``TfidfVectorizer``

    :rtype: Tuple
    :return: the fitted vectorizer and the fitted nearest-neighbour model"""
    # Vectorize the reference corpus
    tfidf = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, **kwargs)
    corpus = clean.values.astype('U')
    tfidf_matrix = tfidf.fit_transform(corpus)

    # Index the TF-IDF vectors for cosine-distance lookups
    neighbours = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    neighbours.fit(tfidf_matrix)
    return tfidf, neighbours

In [72]:
def tfidf_nn(
    messy, 
    clean, 
    n_neighbors = 1, 
    **kwargs
    ):
    """Get most similar values.

    Fits a TF-IDF vectorizer and a cosine nearest-neighbour index on ``clean``
    (through ``build_vectorizer``) and looks up, for every value in ``messy``,
    its ``n_neighbors`` closest values in ``clean``.

    Relies on ``numpy`` being imported as ``np`` at notebook level.

    :param messy: values to look up
    :param clean: reference values the index is fitted on
    :type clean: pd.Series
    :param n_neighbors: number of neighbours returned per value
    :type n_neighbors: int, optional
    :param kwargs: extra keyword arguments forwarded to ``build_vectorizer``

    :rtype: tuple
    :return: ``(nearest_values, distances, indices)`` where ``indices`` are
        positions into ``clean``, ``nearest_values`` the values of ``clean``
        at those positions and ``distances`` the matching cosine distances"""
    # Fit clean data and transform messy data
    vectorizer, nbrs = build_vectorizer(clean, n_neighbors=n_neighbors, **kwargs)
    input_vec = vectorizer.transform(messy)

    # Determine best possible matches
    distances, indices = nbrs.kneighbors(input_vec, n_neighbors=n_neighbors)
    # ``indices`` are positional, so index a plain array rather than the
    # (label-indexed) Series.
    nearest_values = np.asarray(clean)[indices]
    return nearest_values, distances, indices

In [73]:
import time

start_time = time.time()
# Query the corpus against itself asking for two neighbours per row: one of
# them is normally the row itself, the other its closest distinct article.
nearest_values, d, i = tfidf_nn(df.text, df.text, 2)
print(time.time() - start_time)

# When an exact duplicate ties with the row itself, the row is not guaranteed
# to come back as the first neighbour. Pick whichever neighbour is not the
# row itself so a row is never reported as its own best match.
row_positions = np.arange(len(i))
self_is_second = i[:, 1] == row_positions
d_ = {
    'text': nearest_values[:, 0],
    'distance': np.where(self_is_second, d[:, 0], d[:, 1]),
    'best_match_index': np.where(self_is_second, i[:, 0], i[:, 1]),
}
similar_df = pd.DataFrame(data=d_)

# Keep only the rows whose best match is a near-duplicate.
similar_df = similar_df[similar_df['distance'] < 0.01]

# Cluster the near-duplicate pairs. ``seen`` maps a row position to the
# position of its group inside ``groups``.
groups = []
seen = {}
for row in similar_df.itertuples():
    left, right = row.Index, row.best_match_index
    left_group, right_group = seen.get(left), seen.get(right)
    if left_group is None and right_group is None:
        groups.append({left, right})
        seen[left] = seen[right] = len(groups) - 1
    elif left_group is None:
        groups[right_group].add(left)
        seen[left] = right_group
    elif right_group is None:
        groups[left_group].add(right)
        seen[right] = left_group
    elif left_group != right_group:
        # The pair bridges two existing groups: merge them so that no row
        # ends up belonging to two groups.
        for member in groups[right_group]:
            seen[member] = left_group
        groups[left_group] |= groups[right_group]
        groups[right_group] = set()

# Drop the groups emptied by merges and renumber ``seen`` accordingly.
groups = [group for group in groups if group]
seen = {member: group_id for group_id, group in enumerate(groups) for member in group}

print(groups)
print(seen)

# Keep the first row (smallest position) of each group, drop the others.
kept_indexes = {min(group) for group in groups}
indexes_to_remove = [index for index in similar_df.index if index not in kept_indexes]
# ``similar_df`` is indexed by row position, hence the reset_index on ``df``.
preprocessed_df = df.reset_index().drop(index=indexes_to_remove)
preprocessed_df

1.081284523010254
[]
{}


Unnamed: 0,index,title,text,subject,date,label,dataset,author
0,0,Iran calls on Muslim nations to step up effort...,ISTANBUL (Reuters) - Iranian President Hassan ...,worldnews,"December 13, 2017",0,isot,
1,1,Laura Ingraham Wants To Know If The Military ...,"Right-wing radio s high priestess of hate, L...",News,"December 2, 2016",1,isot,
2,2,America’s Most Popular ‘Legal’ Drug is Respons...,Home / Health / America’s Most Popular ‘Legal’...,,,1,kaggle,John Vibes
3,3,Candidate Trump retweets 'White Genocide' acco...,NEW YORK (Reuters) - Republican presidential ...,politicsNews,"January 22, 2016",0,isot,
4,4,Who Will Weed Out the Warmongers?,"Who Will Weed Out the Warmongers? October 30, ...",,,1,kaggle,Consortiumnews.com
...,...,...,...,...,...,...,...,...
95,95,2016 Pulitzer Prizes: A.P. Wins Public Service...,The Associated Press won the Pulitzer Prize fo...,,,0,kaggle,Michael M. Grynbaum
96,96,"‘Swiss Army Man,’ the Strangest Movie Shown at...","LOS ANGELES — In “Swiss Army Man,” Hank (Pa...",,,0,kaggle,Robert Ito
97,97,Dan Rather Condemns Trump’s Reality Show Tact...,Unless you ve been living under a rock for the...,News,"April 8, 2017",1,isot,
98,98,Caitlyn Jenner joins celebs mulling run for U....,LOS ANGELES (Reuters) - Caitlyn Jenner has joi...,politicsNews,"July 17, 2017",0,isot,


In [None]:
import numpy as np

# Small reference frame for trying out the matching helpers.
# Built from plain Python rows: wrapping the mixed str/int rows in np.array
# would coerce every value, including the integers in 'b', to a string.
df_test_c = pd.DataFrame(
    [
        ['this is a test h', 1],
        ['this is something different', 5],
    ],
    columns=['text', 'b'],
)
df_test_c

In [None]:
# Small frame with exact and near duplicates and a non-contiguous index.
# Built from plain Python rows: wrapping the mixed str/int rows in np.array
# would coerce every value, including the integers in 'b', to a string.
df_test = pd.DataFrame(
    [
        ['this is a test hh', 1],
        ['this is a test h', 2],
        ['this is a test h', 3],
        ['this is a test h', 4],
        ['afa', 9],
        ['this is something different', 5],
        ['this is something different', 6],
        ['this is something different', 7],
        ['this is something different', 8],
    ],
    columns=['text', 'b'],
    index=[1, 7, 2, 56, 80, 65, 9, 11, 3],
)

df_test