## BAF Veri Seti İnceleme

In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.ensemble import RandomForestClassifier

In [82]:
df = pd.read_csv("sample10.csv")

In [83]:
df.head()

Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,velocity_6h,...,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month,fraud_bool
0,0.1,0.423184,-1,174,70,0.022402,3.354768,AA,1140,3659.730906,...,200.0,0,INTERNET,14.859346,windows,1,1,0,0,0
1,0.7,0.246312,-1,60,50,0.009289,-0.305261,AB,1118,4950.849565,...,500.0,0,INTERNET,2.320669,linux,0,1,0,5,0
2,0.7,0.921065,29,148,40,0.009191,-1.054443,AD,399,574.335478,...,1500.0,0,INTERNET,7.387905,windows,1,1,0,6,0
3,0.7,0.547707,56,8,20,0.004642,-1.124991,AB,4062,3484.847833,...,200.0,0,INTERNET,27.903481,macintosh,1,1,0,6,0
4,0.9,0.860882,-1,46,20,0.005121,-1.067835,AD,1052,14073.211653,...,200.0,0,INTERNET,5.674654,windows,1,1,0,3,0


In [84]:
def col_names(dataframe, cat_th = 200):
    
    
    cat_cols = [col for col in dataframe.columns if str(dataframe[col].dtypes) in ["category", "bool", "object"]]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]

    cat_cols = cat_cols + num_but_cat
    
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]
    
    return cat_cols, num_cols

cat_cols, num_cols = col_names(df)

In [85]:
cat_cols

['payment_type',
 'employment_status',
 'housing_status',
 'source',
 'device_os',
 'income',
 'customer_age',
 'date_of_birth_distinct_emails_4w',
 'email_is_free',
 'phone_home_valid',
 'phone_mobile_valid',
 'bank_months_count',
 'has_other_cards',
 'proposed_credit_limit',
 'foreign_request',
 'keep_alive_session',
 'device_distinct_emails_8w',
 'device_fraud_count',
 'month',
 'fraud_bool']

In [86]:
lelist = []

def display_value_counts(dataframe):
    for col in dataframe.columns:
        unique_values_count = dataframe[col].nunique()
        if unique_values_count>2 and unique_values_count <= 40:
            lelist.append(col)
            
display_value_counts(df)

In [87]:
lelist

['income',
 'customer_age',
 'payment_type',
 'date_of_birth_distinct_emails_4w',
 'employment_status',
 'housing_status',
 'bank_months_count',
 'proposed_credit_limit',
 'device_os',
 'device_distinct_emails_8w',
 'month']

In [88]:
categorical_columns = ['income', 'customer_age', 'payment_type', 'employment_status', 'housing_status', 'source', 'device_os', 'device_distinct_emails_8w', 'device_fraud_count', 'month']

In [89]:
one_hot_encoded_df = pd.get_dummies(df, columns=['income', 'customer_age', 'payment_type', 'employment_status', 'housing_status', 'source', 'device_os', 'device_distinct_emails_8w', 'device_fraud_count', 'month'])
target_column = 'fraud_bool'

one_hot_encoded_data = one_hot_encoded_df.drop(columns=[target_column])

boolean_columns = one_hot_encoded_df.select_dtypes(include=['bool']).columns
for column in boolean_columns:
    one_hot_encoded_df[column] = one_hot_encoded_df[column].astype(int)
    
target_data = one_hot_encoded_df[target_column]



def reverse_one_hot_encoding(df, categorical_columns):
    df_reversed = df.copy()
    for col in categorical_columns:
        col_prefix = col + '_'
        category_cols = [c for c in df.columns if c.startswith(col_prefix)]
        
        df_reversed[col] = np.nan
        for category_col in category_cols:
            category_value = category_col.replace(col_prefix, '')
            df_reversed.loc[df_reversed[category_col] == 1, col] = category_value
            
        df_reversed.drop(columns=category_cols, inplace=True)
        
    return df_reversed

original_df = reverse_one_hot_encoding(one_hot_encoded_data, categorical_columns)

original_df[target_column] = target_data.values

generated_df = pd.DataFrame() 

generated_df[target_column] = 1 

final_df = pd.concat([original_df, generated_df], ignore_index=True)

  df_reversed.loc[df_reversed[category_col] == 1, col] = category_value
  df_reversed.loc[df_reversed[category_col] == 1, col] = category_value
  df_reversed.loc[df_reversed[category_col] == 1, col] = category_value
  df_reversed.loc[df_reversed[category_col] == 1, col] = category_value
  df_reversed.loc[df_reversed[category_col] == 1, col] = category_value
  df_reversed.loc[df_reversed[category_col] == 1, col] = category_value
  df_reversed.loc[df_reversed[category_col] == 1, col] = category_value
  df_reversed.loc[df_reversed[category_col] == 1, col] = category_value
  df_reversed.loc[df_reversed[category_col] == 1, col] = category_value
  df_reversed.loc[df_reversed[category_col] == 1, col] = category_value


In [90]:
for col in df.columns:
    if col in final_df.columns:
        final_df[col] = final_df[col].astype(df[col].dtype)


In [93]:
final_df.head()

Unnamed: 0,name_email_similarity,prev_address_months_count,current_address_months_count,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,bank_branch_count_8w,...,customer_age,payment_type,employment_status,housing_status,source,device_os,device_distinct_emails_8w,device_fraud_count,month,fraud_bool
0,0.423184,-1,174,0.022402,3.354768,1140,3659.730906,6041.862379,6791.873595,278,...,70,AA,CB,BB,INTERNET,windows,1,0,0,0
1,0.246312,-1,60,0.009289,-0.305261,1118,4950.849565,4039.940368,4141.882552,14,...,50,AB,CA,BB,INTERNET,linux,1,0,5,0
2,0.921065,29,148,0.009191,-1.054443,399,574.335478,2493.948454,3726.858015,5,...,40,AD,CF,BA,INTERNET,windows,1,0,6,0
3,0.547707,56,8,0.004642,-1.124991,4062,3484.847833,4210.51295,4288.197328,172,...,20,AB,CA,BC,INTERNET,macintosh,1,0,6,0
4,0.860882,-1,46,0.005121,-1.067835,1052,14073.211653,5469.019495,4997.056511,11,...,20,AD,CA,BC,INTERNET,windows,1,0,3,0
