<a href="https://colab.research.google.com/github/alphaking255/fraud-detection/blob/main/Fraud%20Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# !pip install six

In [11]:
import pickle
import warnings
from math import sqrt

import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import files
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score,f1_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.utils import resample

warnings.filterwarnings('ignore')

In [None]:
# Open the Colab upload widget; the result is indexed by file name when the CSV is read below.
uploaded = files.upload()

Saving training.csv to training (1).csv


In [None]:
import io

In [None]:
# Colab may store a re-uploaded file under a different name (the recorded
# output shows "training (1).csv"), so a fixed 'training.csv' key is fragile.
# Prefer the expected name when present, otherwise fall back to whatever
# single file was uploaded.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell first.")
uploaded_name = 'training.csv' if 'training.csv' in uploaded else next(iter(uploaded))
data1 = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))

## Feature Engineering

#### Convert Time to Datetime format

In [None]:
# Parse the raw timestamp column in place; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
data1['TransactionStartTime'] = pd.to_datetime(data1['TransactionStartTime'], errors='coerce')

#### Extracting features from Datetime

In [None]:
# Derive one column per calendar/clock component of the parsed timestamp.
# The pairs are (new column name, pandas `.dt` attribute) and are added in
# this order: Month, Weekday, Day, Hour, Minute, Seconds.
timestamp_parts = data1['TransactionStartTime'].dt
for column_name, dt_attribute in (
    ('Month', 'month'),
    ('Weekday', 'weekday'),
    ('Day', 'day'),
    ('Hour', 'hour'),
    ('Minute', 'minute'),
    ('Seconds', 'second'),
):
    data1[column_name] = getattr(timestamp_parts, dt_attribute)

In [None]:
# NOTE(review): data1 already comes from pd.read_csv, so this re-wrap looks
# redundant — confirm nothing relies on it before removing.
data1=pd.DataFrame(data1)

In [None]:
# Show a single row to check that the date-part columns were added.
data1.head(1)

#### Drop Columns

In [None]:
# Identifier-like columns, the raw timestamp (already expanded into date
# parts) and 'Value' are removed; the result is bound to a new name so the
# raw frame `data1` stays untouched.
columns_to_drop = [
    'Value',
    'TransactionId',
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'CurrencyCode',
    'CountryCode',
    'TransactionStartTime',
]
data = data1.drop(columns=columns_to_drop)

In [None]:
# Show a single row to confirm which columns remain after the drop.
data.head(1)

#### Get dummies for categorical

In [None]:
# One-hot encode the categorical columns listed below; `data` is rebound to
# the encoded frame.
# NOTE(review): because `data` is overwritten, re-running this cell on its own
# will presumably fail (the source columns no longer exist) — re-run from the
# drop-columns cell instead.
columns= ["ProviderId", "ProductCategory", "ProductId", "ChannelId"]
data = pd.get_dummies(data, columns=columns)

#### Encoding the Amount column as credit (0) or debit (1)

In [None]:
# Binary direction flag: negative amounts become 0, everything else
# (zero, positive, missing) becomes 1.
is_negative_amount = data['Amount'].lt(0)
data['Amount'] = (~is_negative_amount).astype('int64')

In [None]:
# Show a single row to check the dummy columns and the recoded Amount flag.
data.head(1)

### Balancing the Dataset ### 

Here we'll use **synthetic oversampling**, which helps to avoid the overfitting that simply duplicating minority samples can cause. In this method, samples of the minority class are chosen and synthetic examples are generated from them to balance the overall dataset. This adds new information to the dataset and increases the overall number of observations.
We use ***SMOTE*** (Synthetic Minority Oversampling Technique) to balance the dataset.

In [None]:
# Separate predictors from the target; y1 is kept as a one-column DataFrame.
target_column = 'FraudResult'
X1 = data.drop(columns=[target_column])
y1 = data.loc[:, [target_column]]

In [None]:
# Hold out 20% of the real (un-resampled) transactions for testing.
# The split is stratified on the target so that the rare fraud class is
# present in both partitions in the same proportion as in the full data;
# an unstratified split of a heavily imbalanced target can leave the test set
# with very few (or no) positive cases.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=30,
    stratify=y1,
)

In [None]:
# Oversample the minority class of the training partition only.
# `fit_resample` is the current imbalanced-learn API (`fit_sample` was
# deprecated and later removed). The target is passed as a 1-D column so the
# sampler does not have to reshape a column-vector DataFrame.
smote_algo = SMOTE(random_state=0)
smote_data_X, smote_data_Y = smote_algo.fit_resample(X_train, y_train['FraudResult'])

# Normalise both outputs to DataFrames regardless of whether the installed
# imbalanced-learn version returns arrays or pandas objects.
smote_data_X = pd.DataFrame(data=smote_data_X, columns=X_train.columns)
smote_data_Y = pd.DataFrame({'FraudResult': np.ravel(smote_data_Y)})

# Join X and Y smote data into one frame. A copy is taken so that adding the
# label column does not also add it to `smote_data_X`.
smote_data = smote_data_X.copy()
smote_data['FraudResult'] = smote_data_Y['FraudResult']

# Count of fraud and non-fraud cases after resampling
print("Fraud: {}".format((smote_data["FraudResult"]==1).sum()))
print("Non-Fraud: {}".format((smote_data["FraudResult"]==0).sum()))

In [None]:
# Visualizing Fraud and Non-Fraud Transactions after resampling.
# The column is passed by keyword (x=..., data=...) because newer seaborn
# releases no longer treat a positional Series as the x variable, and the
# plot is drawn on the axes created here rather than on an implicit one.
fig, ax = plt.subplots()
g = sns.countplot(x='FraudResult', data=smote_data, palette='viridis', ax=ax)
g.set_xticks([0, 1])  # fix tick positions before relabelling them
g.set_xticklabels(['Not Fraud', 'Fraud'])
g.set_yticklabels([])  # counts are printed by the previous cell
g.set_title('Class balance after SMOTE');

### Splitting Dataset

In [None]:
# Predictors and target taken from the SMOTE-balanced frame; y stays a
# one-column DataFrame.
label_column = 'FraudResult'
X = smote_data.drop(columns=[label_column])
y = smote_data.loc[:, [label_column]]

##  Feature selection techniques.

1. Univariate Selection

2. Feature Importance



## Univariate Selection

Statistical tests can be used to select those features that have the strongest relationship with the output variable.

The scikit-learn library provides the SelectKBest class that can be used with a suite of different statistical tests to select a specific number of features.

The example below uses the chi-squared (chi²) statistical test for non-negative features to select 10 of the best features from the SMOTE-balanced transaction dataset.



In [None]:
# Score every feature against the target with the chi-squared test
# (the selector itself is configured to keep the 10 best).
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)

# Intermediate single-column frames, kept for inspection.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)

# Two-column table pairing each feature name with its chi-squared score.
featureScores = pd.DataFrame({
    'Columns': X.columns,
    'Score': fit.scores_,
})

In [None]:
print(featureScores.nlargest(20,'Score'))  #print the 20 highest-scoring features

## Feature Importance
You can get the feature importance of each feature of your dataset by using the feature importance property of the model.

Feature importance gives you a score for each feature of your data: the higher the score, the more important or relevant the feature is to your output variable.

Feature importance is a built-in attribute (`feature_importances_`) of tree-based classifiers; we will be using an Extra Trees Classifier to extract the top features of the dataset.

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt

# Extra-trees ensembles are randomised, so the seed is fixed to make the
# importance ranking reproducible across runs. The target is flattened to
# 1-D because `y` is a one-column DataFrame.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, np.ravel(y))

In [None]:
#print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers

In [None]:
# Horizontal bar chart of the 20 features the fitted ensemble rates highest.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
top_importances = feat_importances.nlargest(20)
top_importances.plot.barh()
plt.show()

### Model Development

In [None]:
# Hand-picked subset of predictors used for modelling, taken from the
# SMOTE-balanced frame.
selected_feature_names = [
    'ProductCategory_airtime',
    'ProviderId_ProviderId_4',
    'ChannelId_ChannelId_2',
    'ProductId_ProductId_15',
    'ProviderId_ProviderId_6',
    'ProductId_ProductId_6',
    'ProductId_ProductId_3',
    'ProductId_ProductId_10',
    'PricingStrategy',
]
best_features = smote_data[selected_feature_names]
X = best_features
y = smote_data[['FraudResult']]

In [None]:
# Train on the SMOTE-balanced rows, but keep evaluating on the real hold-out
# set produced by the earlier split of the un-resampled data.
# Re-splitting the SMOTE frame (as was done before) replaces that hold-out with
# synthetic, artificially balanced rows, so every reported metric would be
# measured on data that resembles the training set rather than on genuine
# transactions.
X_train, y_train = X, y
# Restrict the hold-out features to the selected columns; `y_test` is left
# exactly as the earlier split produced it.
X_test = X_test.loc[:, X.columns]

###  GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_class = RandomForestClassifier(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training.
# Candidates are ranked by F1, a classification metric that matches the
# f1_score reported in the evaluation section; the previous
# 'neg_mean_squared_error' is a regression metric.
RFgrid_search = GridSearchCV(forest_class, param_grid, cv=5,
                           scoring='f1', return_train_score=True)

# The target is a one-column DataFrame; flatten it to the 1-D shape
# scikit-learn expects.
RFgrid_search.fit(X_train, np.ravel(y_train))

# #DTC
# hyper = {
#     'criterion':['gini','entropy'],
#     'splitter':['best','random'],
#     'max_depth':range(1,50),
#     'min_samples_split':range(1,11),
#     'min_samples_leaf':range(1,11),
#     'max_features':['log2'],
# }

# DTCgrid_search = GridSearchCV(estimator=DecisionTreeClassifier(),param_grid=hyper,verbose=True)
# DTCgrid_search.fit(X_train,y_train)

# #SVC
# hyper = {'C':[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1],
#          'gamma':['scale','auto'],
#          'degree':[1,2,3,4,5,6,7,8,9,10],
#          'class_weight':['balanced',None],
#          'kernel':['rbf','linear','poly','sigmoid']
#         }

# SVCgrid_search=GridSearchCV(estimator=SVC(),param_grid=hyper,verbose=True)
# SVCgrid_search.fit(X_train,y_train)


#KNN
# Grid search over a single neighbourhood size and four `algorithm` settings,
# scored with 10-fold cross-validation (4 candidates x 10 folds).
# NOTE(review): the `algorithm` options presumably select how neighbours are
# looked up rather than change the model — confirm whether searching over them
# (instead of over n_neighbors) is intended.
hyper_knn = {
    'n_neighbors':[28],
    'algorithm':['auto','ball_tree','kd_tree','brute']
}

KNNgrid_search = GridSearchCV(estimator=KNeighborsClassifier(),param_grid=hyper_knn,verbose=True,cv=10)
KNNgrid_search.fit(X_train,y_train)


#LR
# The search space keeps the original tol / max_iter / fit_intercept values
# but is split into sub-grids so that only solver/penalty pairs accepted by
# LogisticRegression are tried. The previous single grid was a full cross
# product (about 12,000 candidates), most of which were invalid and failed.
#
# Settings that are not really hyperparameters were moved out of the grid:
#   * random_state is fixed on the estimator;
#   * parallelism is requested from GridSearchCV via n_jobs;
#   * multi_class is dropped (it is deprecated in recent scikit-learn, and
#     the fraud/non-fraud target appears to be binary);
#   * the unpenalised option is dropped because its spelling differs between
#     scikit-learn releases ('none' vs None).
lr_common = {
    'fit_intercept': [True, False],
    'max_iter': [120, 130],
    'tol': [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1],
}

hyper_lr = [
    # l2 is supported by every solver
    dict(lr_common, penalty=['l2'],
         solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']),
    # l1 is only supported by liblinear and saga
    dict(lr_common, penalty=['l1'], solver=['liblinear', 'saga']),
    # elasticnet is only supported by saga and needs a numeric l1_ratio
    dict(lr_common, penalty=['elasticnet'], solver=['saga'],
         l1_ratio=[0, 0.1, 0.2, 0.3]),
]

LRgrid_search = GridSearchCV(estimator=LogisticRegression(random_state=42),
                             param_grid=hyper_lr, verbose=True, n_jobs=-1)
# Flatten the one-column target to 1-D before fitting.
LRgrid_search.fit(X_train, np.ravel(y_train))

In [None]:
# Tuned estimators keyed by a short label. Entries for SVC, GaussianNB and
# DecisionTree are currently disabled:
#   "SVC" : SVCgrid_search.best_estimator_
#   "GNB" : GaussianNB()
#   "DTC" : DTCgrid_search.best_estimator_
model_dic = {
    "KNN": KNNgrid_search.best_estimator_,
    "LR": LRgrid_search.best_estimator_,
    "RFC": RFgrid_search.best_estimator_,
}

# Test-set predictions, one array per model, in the same order as model_dic.
# The grid searches already refit each best estimator, so no further fitting
# happens here.
save_predict = []
for key, estimator in model_dic.items():
    predictions = estimator.predict(X_test)
    save_predict.append(predictions)
  

## Visualizing Predictions

In [None]:
# `final_predictions` was never assigned anywhere, so this cell raised a
# NameError. The tuned random forest is used as the "final" model here.
# NOTE(review): confirm that the random forest is the model meant to be shown.
final_predictions = RFgrid_search.best_estimator_.predict(X_test)

# Compare actual and predicted labels for test rows 10..19.
sample = slice(10, 20)
actual_labels = np.ravel(y_test.iloc[sample])
predicted_labels = final_predictions[sample]
positions = np.arange(1, len(actual_labels) + 1)

fig, ax = plt.subplots()
ax.scatter(positions, actual_labels, color='b', label="Actual", s=20)
ax.scatter(positions, predicted_labels, color='g', label="prediction", s=20)
ax.legend(loc='best')
ax.set_title('Model Visualization')
ax.set_xlabel('Indexes')
ax.set_ylabel('Fraud Results');

## Model Evaluation

In [None]:
# Score every tuned model on the test set.
# Fixes relative to the previous version of this cell:
#   * each iteration now uses its own estimator (the loop used a stale `key`
#     left over from an earlier cell, so one model was scored repeatedly);
#   * F1 scores go into `model_f1_score` (the loop appended to an undefined
#     `model_score_f1`);
#   * the summary table is built once, after the loop;
#   * models are not refit — the grid searches already refit the best
#     estimators.
model_acc = []
model_f1_score = []
model_name = []

y_true = np.ravel(y_test)
for name, estimator in model_dic.items():
    predictions = estimator.predict(X_test)
    model_acc.append(accuracy_score(y_true, predictions))
    model_f1_score.append(f1_score(y_true, predictions))
    model_name.append(name)

mod_dict = {
    'Model': model_name,
    'Accuracy': model_acc,
    'f1_score': model_f1_score,
}
models = pd.DataFrame(mod_dict)
models.set_index('Model')

### Save Model

In [None]:
# Persist the tuned random forest selected by the grid search, i.e. the same
# estimator whose metrics are reported in the evaluation section. Previously a
# fresh forest with default hyperparameters was trained and saved instead, so
# the stored artifact did not correspond to the evaluated model.
RFC = RFgrid_search.best_estimator_
fit = RFC  # kept for compatibility: `fit` has always held the fitted forest
# Saving file in the directory
model_name = "my_model.pkl"
with open(model_name, 'wb') as file:
    pickle.dump(fit, file)

In [None]:
#Load from file
# Reads back the file written by the save cell (`model_name` holds its path).
# pickle.load can execute arbitrary code, so only load pickle files that this
# notebook itself produced.
with open(model_name, 'rb') as file:
    pickle_model= pickle.load(file)

In [None]:
# Accuracy of the reloaded model on the test set, shown as a percentage.
score = pickle_model.score(X_test, y_test)
# '.2f' rounds to two decimals; the previous '2f' was a minimum field width
# and printed six decimal places.
print("Test score: {0:.2f} %".format(100*score))