# EDA

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

## Reading

In [None]:
# Load the raw dataset; fields in the file are separated by semicolons.
df_raw = pd.read_csv(
    filepath_or_buffer=Path('../data/raw/dataset.csv'),
    sep=';',
)

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
df_raw.head()

## Errors

In [None]:
# Print column dtypes, non-null counts and memory usage of the raw frame.
df_raw.info()

In [None]:
# Number of missing values in each column.
df_raw.isnull().sum()

In [None]:
# How many rows share their description with at least one other row;
# keep=False flags every member of a duplicate group, not only the repeats.
(
    df_raw
    .duplicated(subset='description', keep=False)
    .value_counts()
)

In [None]:
# How many rows are exact copies of another row across all columns;
# keep=False flags every member of a duplicate group, not only the repeats.
(
    df_raw
    .duplicated(keep=False)
    .value_counts()
)

In [None]:
# Show the rows where the two duplicate flags disagree: the description is
# repeated but the full row is not, so another column differs between copies.
df_raw.duplicated(keep=False).compare(
    df_raw.duplicated(subset='description', keep=False)
)

In [None]:
# Inspect the row at position 9138.
# NOTE(review): presumably one of the rows surfaced by the duplicate-flag
# comparison above — confirm where this hard-coded position comes from.
df_raw.iloc[9138]

In [None]:
# Index labels of every row whose description equals the value stored in the
# first column of the row at position 9138; these rows are dropped during
# cleaning.
# NOTE(review): column position 0 is assumed to be 'description' — confirm.
wrong_label = df_raw.index[df_raw['description'] == df_raw.iloc[9138, 0]]

In [None]:
# List the distinct values present in the 'fraudulent' column.
df_raw['fraudulent'].unique()

## Cleaning

In [None]:
# Build the cleaned frame: remove the rows collected in wrong_label, rows
# without a description, and exact duplicate rows (first occurrence kept).
# The final reset keeps the previous index as a regular column.
df_interim = (
    df_raw
    .drop(index=wrong_label)
    .dropna(subset=['description'])
    .drop_duplicates(keep='first')
    .reset_index(drop=False)
)

In [None]:
# Print column dtypes, non-null counts and memory usage of the cleaned frame.
df_interim.info()

In [None]:
# Count how many cleaned rows fall under each 'fraudulent' value.
df_interim['fraudulent'].value_counts()

In [None]:
# Write the cleaned frame to a Feather file.
df_interim.to_feather(Path('../data/interim/dataset_interim.feather'))

## Exploring

In [None]:
# Learn a token vocabulary from the descriptions and build the sparse
# document-term count matrix X (one row per description, one column per token).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_interim['description'])

In [None]:
# Show the token counts of the first ten descriptions as a dense array.
# Slice the sparse matrix before densifying: converting all of X first would
# allocate a full (documents x vocabulary) dense array just to show ten rows.
X[:10].toarray()

In [None]:
# Vocabulary size: the number of distinct tokens learned by the vectorizer.
len(vectorizer.vocabulary_)