In [None]:
# df.duplicated()
# df.drop_duplicates(inplace=True)

#### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
%matplotlib inline
import seaborn as sns
import math
import time
import os
from os.path import exists
from collections import Counter
from tabulate import tabulate
from tkinter import *
from PIL import ImageTk, Image

# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification


# Standardisation methods
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import KernelCenterer
from sklearn.cluster import KMeans


# Metrics methods
import sklearn.metrics as metrics
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix


# Model selection methods
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold


#### Global Variables

In [None]:
# Names and values describing the label (target) column.
label = "model_class"   # name the label column carries after renaming
label_target = "normal" # depends on df: header of the label column once headers are lower-cased
label_dim1 = "Fraud" # depends on model logic: display name of the first class
label_dim2 = "Legal" # depends on model logic: display name of the second class
label_binary1 = 0  # label value counted as the first class
label_binary2 = 1  # label value counted as the second class
df_headers_list = []  # accumulator list for df_headers()
column_name_time = "time" # depends on df
column_name_amount = "amount" # depends on df
# Columns left out of the feature matrix when splitting for training.
model_train_exclusion_list = [column_name_time, column_name_amount, label]
# seaborn: plot theme for every figure in the notebook
sns.set_theme(style="whitegrid")

#### Standardisation Variables

In [None]:
# One instance of each scikit-learn preprocessing method that can be handed
# to df_scaler().
scaling_method_standard = StandardScaler()
scaling_method_minmax = MinMaxScaler()
scaling_method_norm = Normalizer()
scaling_method_kernel = KernelCenterer()
# NOTE: this binds the SAME list object as model_train_exclusion_list (no
# copy is made), so mutating one name also changes the other.
scaling_exclusion_list = model_train_exclusion_list # depends on df and model logic

#### Split Train Variables

In [None]:
# Arguments forwarded to train_test_split.
test_size = 0.2
random_state = 42
shuffle=True
# NOTE(review): the three split_* values below are not referenced by any
# visible cell, and 0.75/0.25 does not match test_size = 0.2 -- confirm which
# split is intended.
split_train = 0.75
split_test = 0.25
split_validation = 0

#### Hyperparameters

In [None]:
# Model hyperparameters, kept in one place so they can be tuned together.
n_neighbors = 2
criterion = "entropy"  # read by model_decision_tree
max_depth_dt = 4 # decision tree (read by model_decision_tree)
max_depth_xgb = 4 # xgb boost (not referenced in the visible cells)
max_depth_rf = 4 # random forest (not referenced in the visible cells)


#### Paths

In [None]:
# Dataset locations passed to df_builder().
# NOTE(review): both entries point at the same absolute, machine-specific
# file, so the backup cannot help when the primary is missing -- confirm the
# intended backup location.
path_df = r"C:\Users\edgar\OneDrive\Desktop\MSc Artificial Intelligence\Data Science\Unit 7 - Final Assesment\creditcard dataset small - original.csv" # df path
path_df_backup = r"C:\Users\edgar\OneDrive\Desktop\MSc Artificial Intelligence\Data Science\Unit 7 - Final Assesment\creditcard dataset small - original.csv" # backup df path

#### Controls

In [None]:
def df_builder(path, backup_path):
    """Load the dataset from ``path``, falling back to ``backup_path``.

    Parameters
    ----------
    path : str or path-like
        Preferred CSV location.
    backup_path : str or path-like
        CSV location read when ``path`` does not exist.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when neither location exists.
    """
    # Honour the arguments instead of the module-level path constants so the
    # function reads whatever locations the caller actually passes in.
    source = path if exists(path) else backup_path
    return pd.read_csv(source)


def df_lower_headers(df):
    """Lower-case every column header of ``df`` in place and return ``df``."""
    lowered = [str.lower(header) for header in df.columns]
    df.columns = lowered
    return df


def df_headers(df, df_headers_list):
    """Append every column header of ``df`` to ``df_headers_list``.

    The list passed in is extended in place and also returned.
    """
    df_headers_list.extend(df.columns)
    return df_headers_list


def df_rename_label_header(df, label_target, label_model):
    """Rename column ``label_target`` to ``label_model`` in place; return ``df``."""
    header_mapping = {label_target: label_model}
    df.rename(columns=header_mapping, inplace=True)
    return df


def df_overview(df):
    """Print the shape of ``df`` followed by its ``describe()`` summary."""
    print("DF Shape", df.shape, "DF Description", df.describe(), sep="\n")

    
def df_scaler(df, scaling_method, scaling_exclusion_list):
    """Scale every column of ``df`` that is not in ``scaling_exclusion_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale; the selected columns are overwritten in place.
    scaling_method : object
        Any object exposing ``fit_transform(frame)``.
    scaling_exclusion_list : list-like
        Column names that must be left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the selected columns replaced by their
        scaled values.
    """
    scaling_columns = df.columns[~df.columns.isin(scaling_exclusion_list)].tolist()
    if not scaling_columns:
        # Nothing to scale; avoid handing an empty frame to the scaler.
        return df
    # Transform all selected columns exactly once. Calling fit_transform
    # inside a per-column loop re-scales already-scaled data on every pass,
    # which corrupts the values for scalers that are not idempotent.
    df[scaling_columns] = scaling_method.fit_transform(df[scaling_columns])
    return df

    
def df_nan_remove(df):
    """Return a copy of ``df`` without the rows that contain any NaN value.

    ``dropna()`` already inspects every column, so a single call is enough;
    the input frame itself is not modified.
    """
    return df.dropna()


def df_duplicates(df):
    """Report whether ``df`` contains at least one fully duplicated row.

    Only rows that repeat across ALL columns are detected. Prints a notice
    when a duplicate is found and returns the boolean result.
    """
    found = df.duplicated().any()
    if found:
        print("Duplicate Values Found")
    return found


def df_shape(df):
    """Print the ``(rows, columns)`` shape of ``df``."""
    rows_and_columns = df.shape
    print(rows_and_columns)

    
def df_time_conversion(df, column_name):
    """Replace a seconds column with ``HH:MM:SS`` strings, in place.

    The values in ``column_name`` are read as a number of seconds, converted
    to datetimes and then formatted as text. Returns the same ``df``.
    """
    key = str(column_name)
    timestamps = pd.to_datetime(df[key], unit='s')
    df[key] = timestamps.dt.strftime("%H:%M:%S")
    return df


def df_replace_column_header(df, current_header, new_header):
    """Rename a single column of ``df`` in place and return ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose header is changed.
    current_header, new_header : object
        Existing and replacement header; both are converted with ``str()``.

    Notes
    -----
    This is the only definition of the function. ``rename`` is called with
    ``columns=`` alone, because pandas rejects ``columns=`` combined with
    ``axis=``.
    """
    header_mapping = {str(current_header): str(new_header)}
    df.rename(columns=header_mapping, inplace=True)
    return df


def df_correlation_matrix(df):
    """Plot a heatmap of the pairwise correlations of the numeric columns.

    Non-numeric columns (for example the time column after it has been
    formatted as ``HH:MM:SS`` text) are left out before correlating, because
    ``DataFrame.corr`` raises on them in recent pandas versions.
    """
    numeric_df = df.select_dtypes(include=["number", "bool"])
    corrmat = numeric_df.corr()
    fig, ax = plt.subplots(figsize=(12, 9))
    sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)
    plt.show()

#### Models

In [None]:
def model_binary_class(df, label_dim1, label_dim2):
    """Select the rows of each class and return ``df`` unchanged.

    The two subsets are bound to local names only, so callers never see
    them; the function currently has no effect beyond returning ``df``.
    TODO: decide what should be done with the per-class subsets.
    """
    # Use the module-level label_binary1 / label_binary2 values; the names
    # class_binary1 / class_binary2 are not defined anywhere in the notebook.
    # NOTE(review): the original pairing (dim1 <- binary2, dim2 <- binary1)
    # is kept, but the class-balance helpers pair binary1 with dim1 --
    # confirm which is intended.
    label_dim1 = df[df[label] == label_binary2]
    label_dim2 = df[df[label] == label_binary1]
    return df


def model_train_test_split(df, X_df=None, y_label=None, test_size=None, random_state=None, shuffle=True, *, X1_df=None, y1_label=None):
    """Split ``df`` into train/test features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    X_df : list-like
        Column names to leave OUT of the feature matrix.
    y_label : str
        Name of the target column.
    test_size, random_state, shuffle
        Forwarded unchanged to ``train_test_split``.
    X1_df, y1_label
        Keyword-only aliases of ``X_df`` / ``y_label``, accepted because
        existing calls in the notebook use these names.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    TypeError
        If neither spelling of the excluded columns or the target is given.
    """
    if X_df is None:
        X_df = X1_df
    if y_label is None:
        y_label = y1_label
    if X_df is None or y_label is None:
        raise TypeError("model_train_test_split() requires X_df and y_label")

    X = df.loc[:, ~df.columns.isin(X_df)]
    y = df[y_label]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=shuffle
    )
    return (X_train, X_test, y_train, y_test)


def model_decision_tree(X_train, y_train, X_test=None):
    """Fit a decision tree and optionally predict on ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed to ``fit``.
    X_test : optional
        Features to predict on. When omitted no prediction is made.

    Returns
    -------
    tuple
        ``(tree_model, tree_yhat)``; ``tree_yhat`` is ``None`` when
        ``X_test`` is not supplied.

    The depth and split criterion come from the module-level
    ``max_depth_dt`` and ``criterion`` settings.
    """
    # Imported here because the notebook's import cell does not bring
    # DecisionTreeClassifier into scope.
    from sklearn.tree import DecisionTreeClassifier

    tree_model = DecisionTreeClassifier(max_depth=max_depth_dt, criterion=criterion)
    tree_model.fit(X_train, y_train)
    tree_yhat = tree_model.predict(X_test) if X_test is not None else None
    return tree_model, tree_yhat
    
    
def model_label_definitions(label, label_target, label_dim1, label_dim2, label_binary1, label_binary2):
    """Print each label setting under a heading, one value per line."""
    report = (
        ("Class", label),
        ("Originial Class Name", label_target),
        ("Class 1 Binary Name", label_dim1),
        ("Class 2 Binary Name", label_dim2),
        ("Class 1 Binary Unique Identifier", label_binary1),
        ("Class 2 Binary Unique Identifier", label_binary2),
    )
    for heading, value in report:
        print(heading)
        print(value)
    
    
def model_class_balance_check_text(df, model_label, label_dim1, label_dim2, binary1=None, binary2=None):
    """Print how many rows belong to each class and their ratio.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the label column.
    model_label : str
        Name of the label column in ``df``.
    label_dim1, label_dim2 : str
        Display names of the first and second class.
    binary1, binary2 : optional
        Label values identifying the first and second class. They default
        to the module-level ``label_binary1`` / ``label_binary2``.

    The last printed line is the ratio ``count(class 1) / count(class 2)``
    rounded to two decimals; it is ``nan`` when the second class has no rows.
    """
    if binary1 is None:
        binary1 = label_binary1
    if binary2 is None:
        binary2 = label_binary2

    # Read the column named by the caller rather than a fixed attribute.
    class_values = df[model_label]
    count_binary1 = int((class_values == binary1).sum())
    count_binary2 = int((class_values == binary2).sum())
    count_total = len(df)
    # Guard the division: an absent second class must not crash the report.
    if count_binary2:
        balance_precentage = round(count_binary1 / count_binary2, 2)
    else:
        balance_precentage = float("nan")

    print("----------------Class Balance Check----------------")
    print("----------------------------------------------------")
    print("Number of {} Cases Are: {}".format(label_dim1, count_binary1))
    print("----------------------------------------------------")
    print("Number of {} Cases Are: {}".format(label_dim2, count_binary2))
    print("----------------------------------------------------")
    print("Number of Total Cases Are: {}".format(count_total))
    print("----------------------------------------------------")
    print("Class Balance Precentage Is: {}".format(balance_precentage))
    
    
def model_class_balance_check_charts(df):
    """Draw a pie chart of the class distribution of ``df``.

    Rows whose label equals ``label_binary1`` are named ``label_dim1``; all
    other rows are named ``label_dim2``. The names are held in a separate
    Series so that ``df`` itself gains no extra column.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the pie chart was drawn on.
    """
    class_names = pd.Series(
        np.where(df[label] == label_binary1, label_dim1, label_dim2),
        index=df.index,
    )
    # The blank name keeps the pie chart free of a y-axis caption.
    class_counts = class_names.value_counts().rename(" ")
    return class_counts.plot(kind="pie")
    
    

#### DataFrame Workings

In [None]:
# Load the CSV and run every preparation step in order; each helper returns
# the frame that the next one receives.
df = (
    df_builder(path_df, path_df_backup)  # create df
    .pipe(df_lower_headers)  # run first: every later step relies on lower-case headers
    .pipe(df_rename_label_header, label_target, label)  # change model_label name
    .pipe(df_nan_remove)  # remove NaN values
    .pipe(df_time_conversion, column_name_time)  # time column formatting
    .pipe(df_scaler, scaling_method_minmax, scaling_exclusion_list)  # scaling algorithm
)
model_class_balance_check_text(df, label, label_dim1, label_dim2)
#model_label_definitions(label, label_target, label_dim1, label_dim2, label_binary1, label_binary2)
df

In [None]:
# unbalanced check
# build pie, bar charts
# build table
# unbalanced solution: SMOTE

#### Models Workings

In [None]:
# Keep the four splits: the model cells below read X_train, X_test, y_train
# and y_test. Keyword names follow the function's declared signature.
X_train, X_test, y_train, y_test = model_train_test_split(
    df,
    X_df=scaling_exclusion_list,
    y_label=label,
    test_size=test_size,
    random_state=random_state,
    shuffle=shuffle,
)

In [None]:
# Fit a linear regression and show both fitted parameters. Only the last
# expression of a cell is displayed, so they are returned together.
model = LinearRegression().fit(X_train, y_train)
model.intercept_, model.coef_

In [None]:
def model_build(model):
    """Return a fitted model together with its intercept and coefficients.

    ``model`` must expose ``intercept_`` and ``coef_`` attributes. The result
    is the tuple ``(model, intercept, coefficients)``.
    """
    return (model, model.intercept_, model.coef_)


# Fit a fresh linear regression and show the (model, intercept, coefficients) tuple.
model_build(
    model=LinearRegression().fit(X_train, y_train),
)

In [None]:
# Drop-down menu for choosing a model.
# NOTE: root.mainloop() blocks this cell until the window is closed.
model_options = [
    "K-Nearest Neighbours",
    "Logistic Regression",
    "Linear Regression",
    "Random Forest",
    "Decision Tree",
]

root = Tk()
root.title("Model Options")
root.geometry("200x200")
clicked = StringVar(root)
clicked.set(model_options[0])  # StringVar.set() needs the value to store
first_model = clicked.get()  # the entry shown when the menu opens
drop = OptionMenu(root, clicked, *model_options)
drop.pack()

root.mainloop()



#### Graveyard of Codes

In [None]:
#model_df_split(df)
# Reload the data with the same header preparation used earlier so that the
# `label` column exists, and split BEFORE fitting so the classifier trains on
# the splits produced in this cell.
df = (
    df_builder(path_df, path_df_backup)
    .pipe(df_lower_headers)
    .pipe(df_rename_label_header, label_target, label)
    .pipe(df_nan_remove)
)
X = df.loc[:, df.columns != label] # needs control
y = df[label]  # the target values, not the column-name string
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

n = 5
knn = KNeighborsClassifier(n_neighbors = n)
knn.fit(X_train, y_train)
knn_yhat = knn.predict(X_test)

In [None]:
# Density plots of the amount and time columns, side by side. Each panel
# keeps its own x axis because the two quantities are unrelated.
# NOTE(review): a density estimate needs numeric values -- confirm the time
# column has not already been formatted as HH:MM:SS text when this cell runs.
fig, axes = plt.subplots(1, 2, figsize=(18, 4))

amount_value = df[column_name_amount].values # values
time_value = df[column_name_time].values # values

# kdeplot(fill=True) is the replacement for the deprecated
# distplot(hist=False, kde_kws={"shade": True}).
sns.kdeplot(x=amount_value, fill=True, color="m", ax=axes[0]).set_title('Distribution of Amount')
sns.kdeplot(x=time_value, fill=True, color="m", ax=axes[1]).set_title('Distribution of Time')

plt.show()

In [None]:
# Demo table with fixed values.
data = [["Mavs", 0.947257],
        ["Suns", 0.947257],
        ["Spurs", 0.947257],
        ["Nets", 0.947257]]  # rows
col_names = ["Team", "Points"]  # headers

# display table
print(tabulate(data, headers=col_names, tablefmt="fancy_grid"))


# Table built from DataFrame columns: zip pairs the columns value by value,
# giving one table row per DataFrame row.
preview_rows = 10  # keep the printed table short
test1 = df["v1"]
test2 = df["v2"]

data = list(zip(test1.head(preview_rows), test2.head(preview_rows)))

col_names = ["v1", "v2"]

print(tabulate(data, headers=col_names, tablefmt="fancy_grid"))

In [None]:
# Does this fix the duplicate-column issue?
# df.duplicated()
# df.drop_duplicates(inplace=True)