In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [2]:
# Load the dataset from 'Final_data_set.csv' (path is relative to the working
# directory) into `final_data`, the DataFrame the following cells operate on.
final_data = pd.read_csv('Final_data_set.csv')

In [3]:
# Count fraud and valid transactions in the dataset.
# Rows are split by label: TARGET == 1 is reported as fraud, TARGET == 0 as valid.
is_fraud = final_data['TARGET'] == 1
is_valid = final_data['TARGET'] == 0

Fraud = final_data[is_fraud]
Valid = final_data[is_valid]

n_fraud = len(Fraud)
n_valid = len(Valid)

# Ratio of fraud rows to valid rows (relative to the valid count, not to all rows).
outlier_fraction = n_fraud / float(n_valid)

print('outlier_fraction for the whole dataset:', outlier_fraction, sep='\n')
print("Fraud Cases : {}".format(n_fraud))
print("Valid Cases : {}".format(n_valid))

outlier_fraction for the whole dataset:
0.17523364485981308
Fraud Cases : 24825
Valid Cases : 141668


In [4]:
# Create independent and dependent features.
# (train_test_split is already imported in the first cell, so it is not re-imported here.)

# Store the variable we are predicting
target = "TARGET"
# Every column except the label is used as a feature
columns = [c for c in final_data.columns if c != target]

# Seeded random state; it must be handed to train_test_split, otherwise the
# split is drawn from the global RNG and changes on every run.
state = np.random.RandomState(42)

X = final_data[columns]
y = final_data[target]

# Hold out 20% of the rows for testing, using the seeded state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=state
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(246008, 20) (246008,)
(61502, 20) (61502,)


In [5]:
# Show the frame as the cell's last expression so the notebook renders the
# truncated head/tail preview as a table instead of plain print() text.
final_data

        TARGET  AMT_CREDIT  AMT_INCOME_TOTAL  AMT_ANNUITY  AMT_GOODS_PRICE  \
0          1.0    406597.5          202500.0      24700.5         351000.0   
1          0.0   1293502.5          270000.0      35698.5        1129500.0   
2          0.0    135000.0           67500.0       6750.0         135000.0   
3          0.0    312682.5          135000.0      29686.5         297000.0   
4          0.0    513000.0          121500.0      21865.5         513000.0   
...        ...         ...               ...          ...              ...   
307505     NaN         NaN               NaN          NaN              NaN   
307506     NaN         NaN               NaN          NaN              NaN   
307507     NaN         NaN               NaN          NaN              NaN   
307508     NaN         NaN               NaN          NaN              NaN   
307509     1.0    370107.0          171000.0      20205.0         319500.0   

        DAYS_BIRTH  DAYS_EMPLOYED  DAYS_REGISTRATION  CNT_CHILD

In [6]:
# (number of rows, number of columns) of the loaded frame
final_data.shape


(307510, 21)

In [7]:
# Column labels as a plain Python list
list(final_data.columns)

['TARGET',
 'AMT_CREDIT',
 'AMT_INCOME_TOTAL',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'CNT_CHILDREN',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'YEARS_BUILD_AVG',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'CODE_GENDER',
 'NAME_HOUSING_TYPE',
 'FLAG_OWN_REALTY',
 'FLAG_OWN_CAR',
 'EXT_SOURCE_1',
 'APARTMENTS_AVG',
 'YEARS_BEGINEXPLUATATION_AVG']

In [8]:
# Partition the column labels by dtype so that numeric and text columns can be
# imputed with different strategies. TARGET holds 0.0/1.0 values, so it is
# selected as a numeric column as well.
categorical_cols = final_data.select_dtypes(include='object').columns
numeric_cols = final_data.select_dtypes(include='number').columns

In [9]:
# Imputers for the two column groups: the most frequent value fills gaps in
# categorical columns, the column mean fills gaps in numeric columns.
categorical_imputer = SimpleImputer(strategy='most_frequent')
numeric_imputer = SimpleImputer(strategy='mean')

In [10]:

# Rows without a label cannot be used for supervised training. TARGET is itself
# a numeric column, so mean-imputing every numeric column would invent a
# fractional label for each of those rows. Drop them and leave TARGET untouched.
final_data = final_data.dropna(subset=['TARGET']).reset_index(drop=True)
feature_numeric_cols = [col for col in numeric_cols if col != 'TARGET']

# NOTE(review): the imputers are fitted on the full frame before the train/test
# split, so test-set statistics influence the fill values.
# SimpleImputer rejects input with zero columns, so skip an empty selection.
if len(feature_numeric_cols) > 0:
    final_data[feature_numeric_cols] = numeric_imputer.fit_transform(final_data[feature_numeric_cols])
if len(categorical_cols) > 0:
    final_data[categorical_cols] = categorical_imputer.fit_transform(final_data[categorical_cols])

In [11]:
# Separate the label from the features: y is the TARGET column, X is everything else
y = final_data['TARGET']
X = final_data.drop(columns=['TARGET'])

In [12]:
# Replace each text column of X with the integer codes of its categories
text_columns = X.select_dtypes(include='object').columns
for name in text_columns:
    as_category = X[name].astype('category')
    X[name] = as_category.cat.codes

In [13]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Fit a linear regression on the training split, then predict the held-out rows
lin_reg = LinearRegression().fit(X_train, y_train)
y_pred_lr = lin_reg.predict(X_test)

In [15]:
# Test-set scores for the linear model: coefficient of determination and mean squared error
r2_lr = r2_score(y_true=y_test, y_pred=y_pred_lr)
mse_lr = mean_squared_error(y_true=y_test, y_pred=y_pred_lr)

In [16]:
# Fit a seeded random forest regressor on the training split, then predict the held-out rows
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [17]:
# Test-set scores for the random forest: coefficient of determination and mean squared error
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf)
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)

In [18]:
# Keep whichever model scored the higher test R^2; on a tie the linear model is kept
best_model, best_model_name = (
    (rf, 'Random Forest') if r2_rf > r2_lr else (lin_reg, 'Linear Regression')
)

In [19]:
# Serialize the selected estimator to 'best_model.pkl' in the working directory
with open('best_model.pkl', mode='wb') as model_file:
    pickle.dump(best_model, model_file)