In [1]:
import numpy as np
import pandas as pd
import category_encoders as ce
import sys, os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [2]:
# Make the project's helper modules in ../scripts importable from this notebook.
# (os.path.join with a single argument was a no-op, so the path is resolved directly.)
scripts_dir = os.path.abspath('../scripts')
if scripts_dir not in sys.path:  # re-running this cell must not pile up duplicate entries
    sys.path.append(scripts_dir)

from file_handler import FileHandler
# NOTE(review): the wildcard imports below hide which module supplies the helpers this
# notebook calls (e.g. percent_missing_values, convert_to_string, drop_duplicates);
# prefer explicit imports once their home modules are confirmed.
from df_selector import *
from df_cleaner import *
from df_visualizer import *

In [3]:
# Silence all warnings for the rest of the session (no category filter is given, so every warning is ignored).
warnings.filterwarnings('ignore')
# Let pandas display up to 50 columns before eliding the middle of wide frames.
pd.set_option('display.max_columns', 50)

# Reading Data

In [4]:
# Project helper used below for reading the CSV inputs and, in the last cell, writing them back.
file_handler = FileHandler()

In [34]:
# Load the per-transaction fraud dataset and preview its first 10 rows.
fraud_df = file_handler.read_csv("../data/Fraud_Data.csv")
fraud_df.head(10)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0
5,159135,2015-05-21 06:03:03,2015-07-09 08:05:14,42,ALEYXFXINSXLZ,Ads,Chrome,M,18,2809315000.0,0
6,50116,2015-08-01 22:40:52,2015-08-27 03:37:57,11,IWKVZHJOCLPUR,Ads,Chrome,F,19,3987484000.0,0
7,360585,2015-04-06 07:35:45,2015-05-25 17:21:14,27,HPUCUYLMJBYFW,Ads,Opera,M,34,1692459000.0,0
8,159045,2015-04-21 23:38:34,2015-06-02 14:01:54,30,ILXYDOZIHOOHT,SEO,IE,F,43,3719094000.0,0
9,182338,2015-01-25 17:49:49,2015-03-23 23:05:42,62,NRFFPPHZYFUVC,Ads,IE,M,31,341674700.0,0


In [35]:
# Load the IP-range-to-country lookup table and preview its first 10 rows.
ip_df = file_handler.read_csv("../data/IpAddress_to_Country.csv")
ip_df.head(10)

Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
0,16777216.0,16777471,Australia
1,16777472.0,16777727,China
2,16777728.0,16778239,China
3,16778240.0,16779263,Australia
4,16779264.0,16781311,China
5,16781312.0,16785407,Japan
6,16785408.0,16793599,China
7,16793600.0,16809983,Japan
8,16809984.0,16842751,Thailand
9,16842752.0,16843007,China


# General Statistics

## fraud_df

In [30]:
# Total number of cells in fraud_df (rows x columns).
fraud_df.size

1662232

In [31]:
# (rows, columns) of fraud_df.
fraud_df.shape

(151112, 11)

In [32]:
# Per-column non-null counts and dtypes, plus the frame's memory footprint.
fraud_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         151112 non-null  int64  
 1   signup_time     151112 non-null  object 
 2   purchase_time   151112 non-null  object 
 3   purchase_value  151112 non-null  int64  
 4   device_id       151112 non-null  object 
 5   source          151112 non-null  object 
 6   browser         151112 non-null  object 
 7   sex             151112 non-null  object 
 8   age             151112 non-null  int64  
 9   ip_address      151112 non-null  float64
 10  class           151112 non-null  int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 12.7+ MB


## ip_df

In [36]:
# Total number of cells in ip_df (rows x columns).
ip_df.size

416538

In [37]:
# (rows, columns) of ip_df.
ip_df.shape

(138846, 3)

In [38]:
# Per-column non-null counts and dtypes, plus the frame's memory footprint.
ip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138846 entries, 0 to 138845
Data columns (total 3 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   lower_bound_ip_address  138846 non-null  float64
 1   upper_bound_ip_address  138846 non-null  int64  
 2   country                 138846 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 3.2+ MB


# Missing Values

In [33]:
# Report the overall percentage of missing values in fraud_df (the helper prints a one-line summary).
percent_missing_values(fraud_df)

The dataset contains 0.0 % missing values.


In [39]:
# Report the overall percentage of missing values in ip_df (the helper prints a one-line summary).
percent_missing_values(ip_df)

The dataset contains 0.0 % missing values.


# Data Types

## fraud_df

In [40]:
# Check whether any column of fraud_df holds values of more than one type.
show_cols_mixed_dtypes(fraud_df)

None of the columns contain mixed types.


In [42]:
# Column dtypes as loaded, before any conversion.
fraud_df.dtypes

user_id             int64
signup_time        object
purchase_time      object
purchase_value      int64
device_id          object
source             object
browser            object
sex                object
age                 int64
ip_address        float64
class               int64
dtype: object

In [43]:
# Names of every column currently stored with the generic `object` dtype.
string_columns = list(fraud_df.select_dtypes(include=['object']).columns)
string_columns

['signup_time', 'purchase_time', 'device_id', 'source', 'browser', 'sex']

In [44]:
# Convert the object-typed columns to pandas' string dtype.
# The result is not assigned, so the helper is relied on to update fraud_df in place.
convert_to_string(fraud_df, string_columns)

In [45]:
# Confirm the former object columns are now string dtype.
fraud_df.dtypes

user_id                    int64
signup_time       string[python]
purchase_time     string[python]
purchase_value             int64
device_id         string[python]
source            string[python]
browser           string[python]
sex               string[python]
age                        int64
ip_address               float64
class                      int64
dtype: object

In [46]:
# Names of every column currently stored as float64.
float_columns = list(fraud_df.select_dtypes(include=['float64']).columns)
float_columns

['ip_address']

In [47]:
# Cast the float column(s) found above (ip_address) to integer; fraud_df is updated in place.
# NOTE(review): how the helper treats fractional parts (truncate vs. round) is not visible here - confirm.
convert_to_int(fraud_df, float_columns)

In [48]:
# Confirm ip_address is now an integer column.
fraud_df.dtypes

user_id                    int64
signup_time       string[python]
purchase_time     string[python]
purchase_value             int64
device_id         string[python]
source            string[python]
browser           string[python]
sex               string[python]
age                        int64
ip_address                 int64
class                      int64
dtype: object

In [49]:
# Parse signup_time into a datetime column; fraud_df is updated in place.
convert_to_datetime(fraud_df, ['signup_time'])

In [50]:
# Parse purchase_time into a datetime column; fraud_df is updated in place.
convert_to_datetime(fraud_df, ['purchase_time'])

In [51]:
# Confirm both timestamp columns are now datetime64.
fraud_df.dtypes

user_id                    int64
signup_time       datetime64[ns]
purchase_time     datetime64[ns]
purchase_value             int64
device_id         string[python]
source            string[python]
browser           string[python]
sex               string[python]
age                        int64
ip_address                 int64
class                      int64
dtype: object

## ip_df

In [53]:
# Check whether any column of ip_df holds values of more than one type.
show_cols_mixed_dtypes(ip_df)

None of the columns contain mixed types.


In [54]:
# Column dtypes as loaded, before any conversion.
ip_df.dtypes

lower_bound_ip_address    float64
upper_bound_ip_address      int64
country                    object
dtype: object

In [56]:
# Convert the country column to pandas' string dtype; ip_df is updated in place.
convert_to_string(ip_df, ['country'])

In [57]:
# Cast the lower bound to integer so both range bounds share the same dtype; ip_df is updated in place.
convert_to_int(ip_df, ['lower_bound_ip_address'])

In [58]:
# Confirm the conversions: both bounds integer, country string.
ip_df.dtypes

lower_bound_ip_address             int64
upper_bound_ip_address             int64
country                   string[python]
dtype: object

# Duplicates

In [60]:
# Search fraud_df for duplicate rows and drop any that are found (the helper prints the outcome).
drop_duplicates(fraud_df)

No duplicate rows were found.


In [61]:
# Search ip_df for duplicate rows and drop any that are found (the helper prints the outcome).
drop_duplicates(ip_df)

No duplicate rows were found.


In [62]:
# Is any user_id present more than once? `.all()` could never be True on a non-empty
# frame (the first occurrence of a value is never flagged as a duplicate), so `.any()`
# is the meaningful check: False means every user_id is unique.
fraud_df.duplicated(subset=['user_id']).any()

False

# Merging dataframes and Saving Data

In [None]:
# Save the cleaned dataframes to CSV. The previous names (train_df, store_df) are never
# defined in this notebook, so the cell raised NameError; the frames cleaned above are
# fraud_df and ip_df.
# NOTE: these are the same paths the notebook reads from, so the input files are overwritten.
file_handler.to_csv(fraud_df, '../data/Fraud_Data.csv')
file_handler.to_csv(ip_df, '../data/IpAddress_to_Country.csv')