In [1]:
# Numerical Operations
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# For Progress Bar
from tqdm import tqdm

# Pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# For plotting learning curve
#from torch.utils.tensorboard import SummaryWriter

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import pairwise_distances


In [3]:
data = pd.read_csv('/Users/linyinghsiao/Desktop/chatgpt_output拷貝.csv')

In [4]:
# Exploratory Data Analysis (EDA)

# Basic information about the dataset
info = data.info()

# Summary statistics for numerical columns
summary_stats = data.describe()

# Checking for missing values
missing_values = data.isnull().sum()

info, summary_stats, missing_values


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30111 entries, 0 to 30110
Data columns (total 26 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   txkey      30111 non-null  object 
 1   locdt      30111 non-null  int64  
 2   loctm      30111 non-null  int64  
 3   chid       30111 non-null  object 
 4   cano       30111 non-null  object 
 5   contp      30111 non-null  int64  
 6   etymd      29422 non-null  float64
 7   mchno      30111 non-null  object 
 8   acqic      30111 non-null  object 
 9   mcc        30090 non-null  float64
 10  conam      30111 non-null  float64
 11  ecfg       30111 non-null  int64  
 12  insfg      30111 non-null  int64  
 13  iterm      30111 non-null  float64
 14  bnsfg      30111 non-null  int64  
 15  flam1      30111 non-null  int64  
 16  stocn      30108 non-null  float64
 17  scity      29207 non-null  float64
 18  stscd      72 non-null     float64
 19  ovrlt      30111 non-null  int64  
 20  flbmk 

(None,
               locdt          loctm         contp         etymd           mcc  \
 count  30111.000000   30111.000000  30111.000000  29422.000000  30090.000000   
 mean      27.376407  143202.969579      4.905815      4.717286    317.386707   
 std       16.292025   53472.453958      0.473786      2.248508     68.802336   
 min        0.000000       6.000000      1.000000      0.000000      0.000000   
 25%       13.000000  104405.500000      5.000000      4.000000    276.000000   
 50%       27.000000  145225.000000      5.000000      5.000000    289.000000   
 75%       42.000000  184703.500000      5.000000      5.000000    324.000000   
 max       55.000000  235959.000000      6.000000      9.000000    501.000000   
 
                conam          ecfg         insfg         iterm         bnsfg  \
 count   30111.000000  30111.000000  30111.000000  30111.000000  30111.000000   
 mean     1397.426827      0.569194      0.011624      0.066321      0.001395   
 std      7532.1604

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Dropping the 'stscd' column due to a large number of missing values
data_cleaned = data.drop(columns=['stscd'])

# Imputing missing values for other columns
# Using median for numerical columns and most frequent value for categorical columns
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

num_cols = data_cleaned.select_dtypes(include=['float64', 'int64']).columns
cat_cols = data_cleaned.select_dtypes(include=['object']).columns

data_cleaned[num_cols] = imputer_num.fit_transform(data_cleaned[num_cols])
data_cleaned[cat_cols] = imputer_cat.fit_transform(data_cleaned[cat_cols])

# Checking if there are still any missing values
remaining_missing = data_cleaned.isnull().sum().sum()

# Encoding categorical variables
label_encoders = {}
for column in cat_cols:
    le = LabelEncoder()
    data_cleaned[column] = le.fit_transform(data_cleaned[column])
    label_encoders[column] = le

# Checking the first few rows of the cleaned and encoded data
data_cleaned_head = data_cleaned.head()
remaining_missing, data_cleaned_head


(0,
    txkey  locdt     loctm   chid   cano  contp  etymd  mchno  acqic    mcc  \
 0  28082   32.0  163654.0    778  20627    5.0    4.0   1866    108  320.0   
 1  21266   43.0  130338.0  19005  23474    5.0    8.0   3241    136  217.0   
 2  16076   13.0  153337.0   9560  11859    5.0    3.0    282    232  275.0   
 3  25096   18.0  130617.0  17882   7267    5.0    5.0   8219     88  276.0   
 4  22574   35.0  125644.0  18758   7519    5.0    4.0   8377     78  418.0   
 
    ...  flam1  stocn    scity  ovrlt  flbmk  hcefg  csmcu  csmam  flg_3dsmk  \
 0  ...  526.0    0.0  15742.0    0.0    0.0    6.0   70.0  526.0        0.0   
 1  ...  120.0    0.0  16115.0    0.0    0.0    6.0   70.0  120.0        0.0   
 2  ...   90.0    0.0  15759.0    0.0    0.0    6.0   70.0   90.0        0.0   
 3  ...   44.0    0.0  15759.0    0.0    0.0    6.0   70.0   44.0        0.0   
 4  ...   90.0    0.0  15760.0    0.0    0.0    6.0   70.0   90.0        0.0   
 
    label  
 0    0.0  
 1    0.0  
 2

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Defining features and target variable
X = data_cleaned.drop('label', axis=1)
y = data_cleaned['label']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Training the model
rf_classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = rf_classifier.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

accuracy, classification_rep, confusion_mat


(0.998007637390005,
 '              precision    recall  f1-score   support\n\n         0.0       1.00      1.00      1.00      6010\n         1.0       0.67      0.15      0.25        13\n\n    accuracy                           1.00      6023\n   macro avg       0.83      0.58      0.62      6023\nweighted avg       1.00      1.00      1.00      6023\n',
 array([[6009,    1],
        [  11,    2]]))