In [20]:
import os
import pickle

import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import missingno as msno
import xgboost as xg
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    auc,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Directory that holds the project's CSV inputs.
# Set the ANOMALY_DATA_DIR environment variable to point at your own copy of
# the data; the fallback is the original hard-coded local path.
# os.path.join(..., '') guarantees a trailing separator, because the loading
# cell builds file paths by plain string concatenation (DATA_DIR + file name).
DATA_DIR = os.path.join(
    os.environ.get(
        'ANOMALY_DATA_DIR',
        '/Users/andishetavakkoli/Documents/notebook/github_project/machine-learning-projects-data/anomaly_detection/',
    ),
    '',
)

In [3]:
# Read creditcard.csv from DATA_DIR into the notebook's working frame.
df = pd.read_csv(f'{DATA_DIR}creditcard.csv')

In [4]:
# Display the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0


## Missingno function

In [5]:
def missing_plot(df, fontsize=8, figsize=(15, 7)):
    """Draw missingno's nullity matrix for ``df``.

    Args:
        df (pandas.DataFrame): Frame whose missing-value pattern is plotted.
        fontsize (int): Font size forwarded to ``msno.matrix``. Defaults to 8.
        figsize (tuple): Figure size forwarded to ``msno.matrix``.
            Defaults to (15, 7).

    Returns:
        The result of ``msno.matrix``, passed through unchanged.
    """
    return msno.matrix(df, fontsize=fontsize, figsize=figsize)

# Helper Function

In [12]:
def summary_statistic(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build a one-row-per-column summary of a pandas DataFrame.

    Args:
        df (pandas.DataFrame): Frame to summarise.

    Returns:
        pandas.DataFrame: One row per column of ``df`` with
            - ``feature``: the column label,
            - ``type``, ``count_unique``, ``count_nan``, ``count``: dtype,
              number of distinct values, number of missing values and number
              of non-missing values,
            - the statistics reported by ``df.describe()`` (except its
              ``count``); columns that ``describe()`` does not cover get NaN
              there because the two tables are left-joined.

    """
    # rename_axis() names the index explicitly before reset_index(). Relying on
    # reset_index() producing a column called 'index' breaks when the frame's
    # column axis already has a name (e.g. after a pivot), and the merge below
    # then fails with a KeyError on 'feature'.
    feature_info = pd.concat(
        [df.dtypes, df.nunique(), df.isna().sum(), df.count()],
        axis=1,
        keys=['type', 'count_unique', 'count_nan', 'count'],
    ).rename_axis('feature').reset_index()

    # describe() raises on a frame without columns; there is nothing to add to
    # the (empty) per-column table in that case.
    if df.shape[1] == 0:
        return feature_info

    feature_describe = (
        df.describe().T
        .drop(columns='count')  # 'count' is already part of feature_info
        .rename_axis('feature')
        .reset_index()
    )

    return feature_info.merge(feature_describe, how='left', on='feature')


In [15]:
# Example call of summary_statistic.
# NOTE(review): `df_iris` is not defined in any visible cell — confirm it is
# loaded earlier, otherwise this cell fails on a fresh kernel.
summary_statistic(df_iris)

Unnamed: 0,feature,type,count_unique,count_nan,count,mean,std,min,25%,50%,75%,max
0,sepal_length,float64,35,0,150,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
1,sepal_width,float64,23,0,150,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
2,petal_length,float64,43,0,150,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
3,petal_width,float64,22,0,150,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5
4,species,object,3,0,150,,,,,,,


# Calculate missing values

In [None]:
def calculate_missing_values(dataframe):
    """Tabulate how many values are missing in each column.

    Args:
        dataframe (pandas.DataFrame): Frame to inspect.

    Returns:
        pandas.DataFrame: One row per column of ``dataframe`` with the columns
        'Missing Count' and 'Missing Percentage' (the count divided by the
        number of rows, times 100), ordered by 'Missing Count' from highest
        to lowest.
    """
    n_missing = dataframe.isnull().sum()
    pct_missing = (n_missing / len(dataframe)) * 100
    report = pd.DataFrame({
        'Missing Count': n_missing,
        'Missing Percentage': pct_missing,
    })
    return report.sort_values(by='Missing Count', ascending=False)


In [None]:
def eval(model, X_train, X_test, y_train=None, y_test=None):
    """Print train/test classification metrics and plot the test confusion matrix.

    NOTE: the name shadows Python's built-in ``eval``; it is kept so existing
    calls keep working.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Training features, passed to ``model.predict``.
        X_test: Test features, passed to ``model.predict``.
        y_train: True training labels. When omitted, the notebook-level
            ``y_train`` is used (the original behaviour).
        y_test: True test labels. When omitted, the notebook-level ``y_test``
            is used (the original behaviour).

    Returns:
        None. Results are printed and plotted.
    """
    # Prefer explicitly passed labels; fall back to the notebook globals so
    # two-argument calls written against the old signature still work.
    if y_train is None:
        y_train = globals()['y_train']
    if y_test is None:
        y_test = globals()['y_test']

    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    print(confusion_matrix(y_test, y_pred))
    print("Test_Set")
    print(classification_report(y_test, y_pred))
    print("Train_Set")
    print(classification_report(y_train, y_pred_train))
    print("---" * 20)
    # plot_confusion_matrix was removed from scikit-learn;
    # ConfusionMatrixDisplay.from_estimator is its replacement.
    ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, cmap="plasma")

In [None]:
def train_val(y_train, y_train_pred, y_test, y_pred):
    """Collect the same set of classification scores for the train and test splits.

    Args:
        y_train: True labels of the training split.
        y_train_pred: Predictions for the training split.
        y_test: True labels of the test split.
        y_pred: Predictions for the test split.

    Returns:
        pandas.DataFrame: Columns ``train_set`` and ``test_set``; rows
        ``Accuracy``, ``Precision``, ``Recall``, ``f1``, ``roc_auc`` and
        ``recall_auc`` (area under the precision-recall curve).
    """
    def _split_scores(y_true, y_hat):
        # The curve is computed per split. The original read `recall` and
        # `precision` from names that were never defined here and reported the
        # same area for both splits.
        precision, recall, _ = precision_recall_curve(y_true, y_hat)
        return {"Accuracy": accuracy_score(y_true, y_hat),
                "Precision": precision_score(y_true, y_hat),
                "Recall": recall_score(y_true, y_hat),
                "f1": f1_score(y_true, y_hat),
                "roc_auc": roc_auc_score(y_true, y_hat),
                "recall_auc": auc(recall, precision)}

    scores = {"train_set": _split_scores(y_train, y_train_pred),
              "test_set": _split_scores(y_test, y_pred)}

    return pd.DataFrame(scores)

# Correlation Visualization

In [None]:
# Horizontal bar chart of each column's correlation with 'class', sorted
# ascending, with the 'class' entry itself dropped.
# NOTE(review): `df_out` is not defined in the visible cells, and `.iplot` is
# not a built-in pandas method (presumably added by cufflinks/plotly, which the
# import cell does not load) — confirm before running on a fresh kernel.
# NOTE(review): the frame loaded above names its target 'Class' (capital C) —
# confirm that 'class' is the right label for df_out.
df_out.corr()['class'].sort_values().drop('class').iplot(kind = 'barh', title = 'Correlation Between the Columns');

In [None]:
# Rank the features by the model's importances, highest first.
# NOTE(review): `RF_smote_model` and `px` are not defined/imported in the
# visible cells — confirm they exist before running on a fresh kernel.
RFSmote_feature_imp = pd.DataFrame(
    index=X.columns,
    data=RF_smote_model.feature_importances_,
    columns=['Importance'],
).sort_values("Importance", ascending=False)

# The frame is already sorted above, so it is passed to the chart as-is
# instead of being re-sorted once for the data and once for the x axis.
fig = px.bar(
    RFSmote_feature_imp,
    x=RFSmote_feature_imp.index,
    y='Importance',
    title="RFSmote Feature_Importance",
    labels=dict(x="Features", y="Importance"),
)
fig.show()

# Save and Export the Model as .pkl

In [None]:
# Persist the model object to disk with pickle.
# The file is opened in a `with` block so the handle is flushed and closed even
# if pickling fails; the original left the handle open.
# pickle.dump() returns None, so the original `logistic_regression = ...`
# binding never held the model and is dropped.
# NOTE(review): `LogReg_Deploy` is not defined in the visible cells — confirm
# it is created before this cell runs.
# Security: only unpickle this file from a source you trust; loading a pickle
# can execute arbitrary code.
with open('logistic_regression_model', 'wb') as model_file:
    pickle.dump(LogReg_Deploy, model_file)

# Machine Learning

### Scaling

In [15]:
# Separate the target column 'Class' from the remaining feature columns.
y = df['Class']
X = df.drop(columns='Class')

In [16]:
# Fit a StandardScaler on the feature matrix and keep the scaled copy.
# NOTE(review): the scaler is fitted on all of X, before any train/test
# separation; if X_scaled is split afterwards, the held-out rows have already
# influenced the scaling statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [18]:
# Split the raw features first, then fit the scaler on the training rows only.
# Splitting an array that was scaled with statistics from every row lets
# information from the test rows leak into training.
# The split parameters are unchanged, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.3
)

# `scaler` is (re)bound to the training-only fit; X_train / X_test stay scaled
# arrays, which is what the cells below expect.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [23]:
# Oversample with SMOTE, then train a random forest, as one pipeline object.
# Both steps draw random numbers; their seeds are pinned (same value as the
# train/test split) so re-running the notebook reproduces the same model.
pl = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(random_state=42)),
])

In [None]:
# Fit the `pl` pipeline on the training split.
pl.fit(X_train, y_train)

In [None]:
# Predict labels for the test features with the fitted pipeline.
y_pred = pl.predict(X_test)

In [None]:
# Print the classification report comparing the test labels with the predictions.
print(classification_report(y_test, y_pred))

# Combine SMOTE with Undersampling

In [None]:
# Oversample class 1 and undersample class 0 to the requested 10000 samples
# each. Both samplers are seeded so the resampled data is the same on every
# run.
over = SMOTE(sampling_strategy={1: 10000}, random_state=42)
under = RandomUnderSampler(sampling_strategy={0: 10000}, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# NOTE(review): this rebinds X and y to the resampled data, so the original
# frames are no longer reachable under those names and re-running the cell
# resamples already-resampled data. The binding is kept because later cells may
# rely on it — consider separate names (e.g. X_resampled, y_resampled).
# NOTE(review): X and y here are the full data set; resampling before a
# train/test split puts synthetic samples into the evaluation data — confirm
# this is intended.
X, y = pipeline.fit_resample(X, y)