In [44]:
import pandas as pd


credit_card_df = pd.read_csv('Credit_card.csv')
credit_card_label_df = pd.read_csv('Credit_card_label.csv')


credit_card_df.head()
credit_card_label_df.head()


credits_df = pd.merge(credit_card_df, credit_card_label_df, on='Ind_ID')


credits_df.head()

credits_df.to_csv('merged_credit_data.csv', index=False)

credits_df

Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members,label
0,5008827,M,Y,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,-18772.0,365243,1,0,0,0,,2,1
1,5009744,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,1
2,5009746,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,,-586,1,1,1,0,,2,1
3,5009749,F,Y,N,0,,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,1
4,5009752,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,5028645,F,N,Y,0,,Commercial associate,Higher education,Married,House / apartment,-11957.0,-2182,1,0,0,0,Managers,2,0
1544,5023655,F,N,N,0,225000.0,Commercial associate,Incomplete higher,Single / not married,House / apartment,-10229.0,-1209,1,0,0,0,Accountants,1,0
1545,5115992,M,Y,Y,2,180000.0,Working,Higher education,Married,House / apartment,-13174.0,-2477,1,0,0,0,Managers,4,0
1546,5118219,M,Y,N,0,270000.0,Working,Secondary / secondary special,Civil marriage,House / apartment,-15292.0,-645,1,1,1,0,Drivers,2,0


In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_curve, auc
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# Function for additional data cleaning
def additional_cleaning(data):
    # Drop columns with more than 90% missing values
    missing_threshold = len(data) * 0.9
    cleaned_data = data.dropna(axis=1, thresh=missing_threshold)

    # Drop columns with low variance (assuming numeric columns only)
    numeric_columns = cleaned_data.select_dtypes(include=['number']).columns
    low_variance_columns = cleaned_data[numeric_columns].var() < 0.01
    cleaned_data = cleaned_data.drop(columns=low_variance_columns[low_variance_columns].index)

    return cleaned_data

# Function for combined, oversampling, or undersampling
def balance_dataset(data, labels):
    # Example: Using SMOTE for oversampling
    smote = SMOTE(random_state=42)
    balanced_data, balanced_labels = smote.fit_resample(data, labels)
    return balanced_data, balanced_labels

# Function to perform linear SVM and plot ROC curve
def linear_svm(data, labels):
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    classifier = SVC(kernel='linear', probability=True)  # Set probability=True for calculating probabilities for ROC curve
    classifier.fit(X_train_scaled, y_train)
    roc_auc = plot_roc_curve(classifier, X_test_scaled, y_test)
    return classifier, roc_auc

# Function to plot ROC curve and calculate AUC
def plot_roc_curve(classifier, X_test, y_test):
    y_score = classifier.decision_function(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    # Plot ROC curve
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    plt.show()
    return roc_auc

# Function to explain outlier detection
def explain_outlier_detection():
    # Outlier detection is the process of identifying observations that deviate significantly from the rest of the data.
    # It's important because outliers can skew statistical analyses and machine learning models.
    # Methods for outlier detection include statistical methods like Z-score, IQR, and machine learning-based approaches like isolation forests, one-class SVM, and DBSCAN.
    explanation = """
    Outlier detection is the process of identifying observations that deviate significantly from the rest of the data. 
    It's important because outliers can skew statistical analyses and machine learning models. 
    Methods for outlier detection include statistical methods like Z-score, IQR, and machine learning-based approaches like isolation forests, one-class SVM, and DBSCAN.
    """
    return explanation

# Load data
def load_data():
    credit_card_df = pd.read_csv('Credit_card.csv')
    credit_card_label_df = pd.read_csv('Credit_card_label.csv')
    credits_df = pd.merge(credit_card_df, credit_card_label_df, on='Ind_ID')
    return credits_df

# Main function to execute all steps
def main():
    # Load data
    credits_df = load_data()

    # Additional cleaning
    cleaned_data = additional_cleaning(credits_df)
    print(cleaned_data.columns)

    # Check if 'Target' column exists in cleaned_data
    if 'label' in cleaned_data.columns:
        # Drop the 'Target' column
        labels = cleaned_data['label']
        cleaned_data = cleaned_data.drop(columns=['label'])

        # Balance dataset
        balanced_data, balanced_labels = balance_dataset(cleaned_data, labels)  # Use 'label' column instead of 'Target'

        # Perform linear SVM and plot ROC curve
        svm_auc = linear_svm(balanced_data, balanced_labels)

        # Explain outlier detection
        outlier_explanation = explain_outlier_detection()

        print("Linear SVM ROC AUC:", svm_auc)
        print("Explanation on Outlier Detection:", outlier_explanation)
    else:
        print("'Target' column not found in the dataset.")

    # Execute other models
    print("Executing other models...")

    # Here you can add code to execute other models
    # For example:
    # model2_auc = model2(balanced_data, balanced_labels)
    # print("Model 2 ROC AUC:", model2_auc)

    # Ensure that the model execution is properly added here

# Execute main function
if __name__ == "__main__":
    main()


Index(['Ind_ID', 'GENDER', 'Car_Owner', 'Propert_Owner', 'CHILDREN',
       'Annual_income', 'Type_Income', 'EDUCATION', 'Marital_status',
       'Housing_type', 'Birthday_count', 'Employed_days', 'Work_Phone',
       'Phone', 'EMAIL_ID', 'Family_Members', 'label'],
      dtype='object')
0       1
1       1
2       1
3       1
4       1
       ..
1543    0
1544    0
1545    0
1546    0
1547    0
Name: label, Length: 1548, dtype: int64


ValueError: could not convert string to float: 'M'