<a href="https://colab.research.google.com/github/byshadowoz/CCFDM/blob/main/nose.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Raw data

In [3]:
# Load the transactions; the first CSV column becomes the row index.
data = pd.read_csv('fraudtest.csv', index_col=0)
# Print column names, non-null counts and dtypes as a quick schema check.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 555719 entries, 0 to 555718
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   trans_date_trans_time  555719 non-null  object 
 1   cc_num                 555719 non-null  float64
 2   merchant               555719 non-null  object 
 3   category               555719 non-null  object 
 4   amt                    555719 non-null  float64
 5   first                  555719 non-null  object 
 6   last                   555719 non-null  object 
 7   gender                 555719 non-null  object 
 8   street                 555719 non-null  object 
 9   city                   555719 non-null  object 
 10  state                  555719 non-null  object 
 11  zip                    555719 non-null  int64  
 12  lat                    555719 non-null  float64
 13  long                   555719 non-null  float64
 14  city_pop               555719 non-null  i

In [4]:
def processingDT(df, columns):
  """Turn the raw transaction frame into a numeric, model-ready frame.

  Steps:
    1. If ``trans_date_trans_time`` is present, add a ``Time`` column holding
       the minutes elapsed since the timestamp of the first row (so the first
       row is 0). Missing timestamps get ``Time`` 0.
    2. Drop the columns that are not used as model inputs.
    3. One-hot encode the remaining string columns.

  The frame passed in is not modified; a new frame is returned.

  Args:
    df: Raw transactions. Must contain every column named in the drop list
      and in the categorical list below.
    columns: Not read by this function. Kept so existing calls keep working.

  Returns:
    A new DataFrame with the untouched columns first (in their original
    order, followed by ``Time``) and the one-hot columns appended.

  Raises:
    KeyError: if a column to drop or to encode is missing from ``df``.
  """
  # Columns that are not fed to the ML model.
  unused_columns = ['first','last','gender','street','lat','long','zip','city_pop',
                    'dob','trans_num','unix_time','trans_date_trans_time','merch_lat','merch_long']
  # String columns that are replaced by indicator columns.
  categorical_columns = ['merchant', 'category', 'city', 'state', 'job']

  if 'trans_date_trans_time' in df.columns:
    # Parse the timestamps of the frame that was passed in, not of a notebook
    # global, so the function works on any frame.
    timestamps = pd.to_datetime(df['trans_date_trans_time'])
    # Time between each row and the first row, in minutes.
    minutes = (timestamps - timestamps.iloc[0]).dt.total_seconds() / 60
    # assign() builds a new frame, so the caller's frame is left untouched and
    # no chained in-place fillna is needed.
    df = df.assign(Time=minutes.fillna(0))

  df = df.drop(columns=unused_columns)

  # Replace string columns by their indicator (dummy) columns.
  encoded = pd.get_dummies(df[categorical_columns])
  return pd.concat([df.drop(columns=categorical_columns), encoded], axis=1)

In [5]:
# Replace the raw frame with its model-ready version. The list is passed as
# `columns`, which processingDT does not read.
# NOTE(review): this rebinds `data`, so re-running the cell feeds the already
# processed frame back in and the column drop inside processingDT raises KeyError.
data = processingDT(data,['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'])

  df.trans_date_trans_time = pd.to_datetime(data['trans_date_trans_time'])


In [6]:
# Feature columns to scale and project: everything except the label and the
# two numeric columns that are carried through separately.
x_cols = data.columns.tolist()
for excluded in ('is_fraud', 'Time', 'amt'):
    x_cols.remove(excluded)

In [7]:
# Number of leading rows that are kept in full (fraud and non-fraud alike).
HEAD_ROWS = 138929
# Split at a single boundary so the two parts always partition `data`, whatever
# its length. With the 555719-row file this yields exactly the same frames as
# head(138929) / tail(416790).
data_part1_2 = data.iloc[:HEAD_ROWS]
data_part1_3 = data.iloc[HEAD_ROWS:]

In [8]:
# From the tail part keep only the rows labelled as fraud.
data_part1_3 = data_part1_3.loc[data_part1_3['is_fraud'] == 1]

In [9]:
# Discard the original row labels and renumber the fraud-only rows from 0.
data_part1_3 = data_part1_3.reset_index(drop=True)

In [10]:
# Stack the fully kept head rows on top of the fraud-only tail rows.
# Row labels are not renumbered here, so labels from the two parts can repeat.
data_part = pd.concat([data_part1_2,data_part1_3])

In [11]:
# Number of columns that contain at least one missing value.
data_part.isna().any().sum()

0

# Modeling

In [12]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [13]:
# Standardize every feature column in place (StandardScaler defaults).
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so held-out rows influence the scaling statistics — confirm
# this is acceptable.
ss = StandardScaler()
data_part[x_cols] = ss.fit_transform(data_part[x_cols])

In [14]:
# Keep the fitted estimator under its own name instead of overwriting it with
# its output, so its components stay available and new rows can be projected
# with pca_model.transform(). `pca2` is still the array of projected features
# (one column per component) that the following cells consume.
pca_model = PCA(n_components=28, random_state=42)
pca2 = pca_model.fit_transform(data_part[x_cols])

In [15]:
# Names for the 28 principal-component columns: V1 ... V28.
columns = [f'V{i}' for i in range(1, 29)]

In [16]:
# Renumber the rows 0..n-1 and discard the old labels.
# Presumably needed so the label/Time/amt columns line up with the freshly
# indexed PCA frame built in the next cell — confirm.
data_part = data_part.reset_index(drop=True)

In [17]:
# Wrap the projected features in a frame, then attach the label and the two
# numeric columns that were kept out of the projection.
components = pd.DataFrame(pca2, columns=columns)
data2 = components.assign(
    is_fraud=data_part['is_fraud'],
    Time=data_part['Time'],
    Amount=data_part['amt'],
)

In [18]:
# Show the rows labelled as fraud.
data2.loc[data2['is_fraud'] == 1]

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,is_fraud,Time,Amount
1685,-0.598172,1.144613,-0.094510,0.859783,-1.011711,0.273199,0.727060,-1.906297,-0.736238,-0.001437,...,-0.443760,-0.490428,-0.030474,1.545240,0.434241,-0.478522,0.463882,1,592.0,24.84
1767,-1.480529,0.222242,1.295261,1.365133,2.788950,-2.822044,2.669732,0.585923,-1.853767,0.483757,...,-3.983478,-1.416715,0.104810,2.107172,-2.274064,-0.167767,1.985643,1,618.0,780.52
1781,-1.271606,0.120356,1.465947,1.133249,2.766102,-2.587641,2.578613,0.505140,-1.745468,0.100368,...,-3.687705,-1.451466,0.204187,2.378236,-2.394648,-0.018352,2.039806,1,623.0,620.33
1784,1.743642,-1.296773,0.433705,-3.781448,-2.790281,-0.258579,1.645773,-3.878941,-2.384804,0.102737,...,-0.120302,0.148986,-2.109904,-0.837889,1.047046,-2.220008,-1.334460,1,624.0,1077.69
1857,-0.815687,1.383508,-0.072413,0.736195,-0.783435,0.417599,0.575419,-2.198089,-0.785825,-0.297522,...,-0.337583,-0.478347,0.136176,1.477905,0.481226,-0.645979,0.451778,1,648.0,842.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140451,-1.885197,-1.901633,-0.328577,0.413878,-1.585098,0.579325,0.020023,0.070661,1.485562,0.856125,...,-1.021674,0.118913,-2.673635,-2.147107,-1.147776,0.033870,-2.094554,1,265551.0,1041.51
140452,-1.884568,-1.865341,-0.385483,0.385217,-1.621335,0.595049,-0.018489,0.083034,1.475932,0.945927,...,-1.062314,0.022884,-2.721337,-2.219702,-1.177209,0.053716,-2.214453,1,265564.0,868.09
140453,-1.791368,-1.933004,-0.455697,0.551157,-1.971012,0.426638,-0.012275,0.414370,1.585215,1.286257,...,-1.057889,0.028093,-2.760386,-2.211881,-1.059834,0.244320,-1.994913,1,265577.0,1039.42
140454,-1.917875,-1.994005,-0.424867,0.592209,-1.757509,0.562001,0.002287,-0.145944,1.640667,0.913690,...,-0.889942,-0.056516,-2.833079,-2.060295,-0.937546,0.249133,-1.997296,1,265612.0,289.27


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [20]:
# Features: every column except the label. Target: the fraud label.
# NOTE(review): `Time` stays in the feature matrix. Only fraud rows were kept
# from the tail part of `data`, so if the source rows are in chronological
# order, a large `Time` implies is_fraud == 1 by construction — confirm
# whether `Time` should be a feature.
x = data2.drop(columns=['is_fraud'])
y = data2.is_fraud

In [21]:
# Logistic regression with the library's default hyperparameters.
# NOTE(review): Time and Amount were excluded from x_cols and therefore reach
# the model unscaled — confirm the solver converges.
model = LogisticRegression()

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on the label.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=42)

In [23]:
# Fit the classifier on the training portion.
model.fit(x_train, y_train)

In [24]:
# Predicted class labels for the held-out rows.
prediction = model.predict(x_test)

In [25]:
# Predicted and true labels side by side, one row per held-out transaction.
prediction_prof = pd.DataFrame({"prediction": prediction, "real": y_test})

In [26]:
# Column totals; assuming labels are 0/1, these are the numbers of predicted
# and actual fraud rows in the held-out set.
prediction_prof.sum()

prediction    409
real          642
dtype: int64

In [27]:
from sklearn.metrics import confusion_matrix

# Confusion matrix with true labels passed first: rows are true classes,
# columns are predicted classes (scikit-learn convention).
cm = confusion_matrix(y_test, prediction)

print(cm)

[[41474    21]
 [  254   388]]


In [29]:
import pickle

# Serialize the fitted classifier to CCFD.pkl in the working directory.
# NOTE(review): only the classifier is written; the scaling and PCA steps that
# produced its input features are not part of this file, so the pickle alone
# cannot score raw transactions.
with open('CCFD.pkl', 'wb') as f:
    pickle.dump(model, f)

print("Modelo guardado como CCFD.pkl")

Modelo guardado como CCFD.pkl


In [31]:
from google.colab import files

# Hand the pickle to the browser as a download. Relies on the google.colab
# import in this cell, so it presumably only runs inside Colab.
files.download('CCFD.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>