#import pustaka

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time

# Classifier Libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score,roc_curve, accuracy_score, auc, average_precision_score


# Other Libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")
from scipy.stats import f_oneway


#eksplorasi data dan pembersihan

In [None]:
# Load the five competition tables from the current working directory.
# The file order matches the order of the names on the left-hand side.
train, loan_activities, non_borrower_user, test, sample_submission = map(
    pd.read_csv,
    (
        'train.csv',
        'loan_activities.csv',
        'non_borrower_user.csv',
        'test.csv',
        'sample_submission.csv',
    ),
)

In [None]:
# Data exploration: for every table print its schema, summary statistics and
# first rows.
# NOTE: DataFrame.info() prints its own report and returns None, so wrapping it
# in print() also emits a literal "None" line after each report.
exploration_rule = '-----------------------------------------------------------------------------------------------------'
exploration_tables = (
    ("Train Data", train),
    ("\nLoan Activities Data", loan_activities),
    ("\nNon Borrower User Data", non_borrower_user),
    ("\nTest Data", test),
    ("\nSample Submission Data", sample_submission),
)

for position, (heading, table) in enumerate(exploration_tables):
    if position > 0:
        # A horizontal rule separates consecutive tables (none after the last).
        print(exploration_rule)

    print(heading)
    print(table.info())
    print(table.describe())
    print(table.head())

In [None]:
# Check for missing values: print the per-column NaN count of every table.
missing_value_sections = (
    ("\nMissing values in Train Data", train),
    ("\nMissing values in Loan Activities Data", loan_activities),
    ("\nMissing values in Non Borrower User Data", non_borrower_user),
    ("\nMissing values in Test Data", test),
    ("\nMissing values in Sample Submission Data", sample_submission),
)

for heading, table in missing_value_sections:
    print(heading)
    print(table.isnull().sum())

In [None]:
# Fill (not delete) missing values by forward fill: each NaN takes the last
# valid value above it in the same column. NaNs that precede the first valid
# value of a column are left untouched.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` keyword of fillna is deprecated since pandas 2.1.
train = train.ffill()
loan_activities = loan_activities.ffill()
non_borrower_user = non_borrower_user.ffill()
test = test.ffill()
sample_submission = sample_submission.ffill()

In [None]:
# Check for duplicates: print the number of fully duplicated rows per table.
duplicate_sections = (
    ("\nDuplicates in Train Data", train),
    ("\nDuplicates in Loan Activities Data", loan_activities),
    ("\nDuplicates in Non Borrower User Data", non_borrower_user),
    ("\nDuplicates in Test Data", test),
    ("\nDuplicates in Sample Submission Data", sample_submission),
)

for heading, table in duplicate_sections:
    print(heading)
    print(table.duplicated().sum())

#eksplorasi data dan analisis

In [None]:
# 1. Label distribution: bar chart of how many rows carry each label value.
plt.figure(figsize=(6, 4))
label_axes = sns.countplot(data=train, x='label')
label_axes.set_title('Distribusi Label (Fraud vs Non-Fraud)')
plt.show()

In [None]:
# 2. Distribution of the anonymised features pc0..pc16: one histogram with a
# KDE overlay per feature, each in its own figure.
for feature_name in (f'pc{i}' for i in range(17)):
    plt.figure(figsize=(6, 4))
    sns.histplot(train[feature_name], kde=True)
    plt.title(f'Distribusi {feature_name}')
    plt.show()

In [None]:
# 3. Correlation between features: annotated heatmap of the pairwise
# correlation matrix of every column except the user_id identifier.
plt.figure(figsize=(12, 10))
correlation_matrix = train.drop(columns='user_id').corr()
heatmap_axes = sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
heatmap_axes.set_title('Matriks Korelasi')
plt.show()

In [None]:
# 4. Loan activities analysis: summary statistics and the frequency of each
# loan type.
print("\nLoan Activities Summary")
print(loan_activities.describe())
loan_type_counts = loan_activities['loan_type'].value_counts()
print(loan_type_counts)

# Bar chart: number of loan activities per loan type.
plt.figure(figsize=(10, 6))
loan_type_axes = sns.countplot(data=loan_activities, x='loan_type')
loan_type_axes.set_title('Jumlah Pinjaman Berdasarkan Tipe')
plt.show()

# Line chart: number of loan activities per calendar month.
# The 'ts' column is converted to datetime in place so it can act as the
# resampling index.
# NOTE(review): 'M' is a deprecated alias in pandas >= 2.2, where 'ME' is the
# replacement; it is kept here so the cell still runs on older pandas.
loan_activities['ts'] = pd.to_datetime(loan_activities['ts'])
monthly_activity_counts = loan_activities.set_index('ts').resample('M').size()
monthly_activity_counts.plot()
plt.title('Jumlah Pinjaman dari Waktu ke Waktu')
plt.show()

In [None]:
# 5. Non-borrower user analysis: summary statistics first.
print("\nNon Borrower User Summary")
print(non_borrower_user.describe())

# Then the distribution of each anonymised feature pc0..pc16 among
# non-borrower users, one histogram (with KDE overlay) per figure.
for feature_name in (f'pc{i}' for i in range(17)):
    plt.figure(figsize=(6, 4))
    sns.histplot(non_borrower_user[feature_name], kde=True)
    plt.title(f'Distribusi {feature_name} (Non Borrower User)')
    plt.show()

In [None]:
# 6. Feature comparison between the label classes: one box plot per
# anonymised feature pc0..pc16, grouped by label.
for feature_name in (f'pc{i}' for i in range(17)):
    plt.figure(figsize=(6, 4))
    sns.boxplot(data=train, x='label', y=feature_name)
    plt.title(f'Perbandingan {feature_name} Antara Fraud dan Non-Fraud')
    plt.show()

In [None]:
# 7. Correlation of every feature with the label, strongest positive first.
# user_id is dropped before correlating, consistent with the correlation-matrix
# cell: it is an identifier rather than a feature, and a non-numeric identifier
# column would make DataFrame.corr() raise on pandas >= 2.0.
correlation_with_label = (
    train.drop('user_id', axis=1)
    .corr()['label']
    .sort_values(ascending=False)
)
print("\nKorelasi Fitur dengan Label")
print(correlation_with_label)

In [None]:
# # correlation matrics
# sns.heatmap(train.corr(),cmap = colors,cbar = True)

**Anova**

#pemodelan dan evaluasi

In [None]:
# Separate features and target. user_id is excluded from the feature matrix.
X = train.drop(columns=['user_id', 'label'])
y = train['label']
X_test = test.drop(columns=['user_id'])

# Hold out 20% of the training data for validation, stratified on the label
# so both splits keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit a Random Forest with default hyper-parameters (fixed seed).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Validation predictions: hard labels plus the probability of class index 1.
y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)[:, 1]

# Confusion matrix shown as an annotated heatmap.
cm = confusion_matrix(y_val, y_pred)
plt.figure(figsize=(8, 6))
confusion_axes = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
confusion_axes.set_title('Confusion Matrix')
confusion_axes.set_xlabel('Predicted Label')
confusion_axes.set_ylabel('True Label')
plt.show()

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

# ROC AUC computed directly from the predicted probabilities.
roc_auc = roc_auc_score(y_val, y_proba)
print("\nROC AUC Score:", roc_auc)

# ROC curve; the area is recomputed from the curve points for the legend.
fpr, tpr, thresholds = roc_curve(y_val, y_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
roc_axes = plt.gca()
roc_axes.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line.
roc_axes.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_axes.set_xlim([0.0, 1.0])
roc_axes.set_ylim([0.0, 1.05])
roc_axes.set_xlabel('False Positive Rate')
roc_axes.set_ylabel('True Positive Rate')
roc_axes.set_title('Receiver Operating Characteristic (ROC)')
roc_axes.legend(loc="lower right")
plt.show()

# Overall accuracy on the validation split.
accuracy = accuracy_score(y_val, y_pred)
print("\nAccuracy Score:", accuracy)

# Average precision (a summary of the precision-recall curve).
average_precision = average_precision_score(y_val, y_proba, average='macro')
print("\nAverage Precision Score:", average_precision)

In [None]:
# Predict labels for the test set with the fitted model.
y_test_pred = model.predict(X_test)

# Submission table: the test user ids alongside their predicted label.
submission = test[['user_id']].assign(label=y_test_pred)

# Write the submission to CSV without the index column.
submission.to_csv('submission.csv', index=False)

In [None]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc, average_precision_score
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the raw tables from disk. This replaces the in-memory frames,
# including any cleaning applied to them earlier in the notebook.
train, test, loan_activities, non_borrower_user = map(
    pd.read_csv,
    (
        'train.csv',
        'test.csv',
        'loan_activities.csv',
        'non_borrower_user.csv',
    ),
)

# Build a directed graph linking users to the reference contacts recorded in
# loan_activities.
G = nx.DiGraph()

# Register every user from train, test and non_borrower_user as a node, so
# users that never appear in loan_activities are still part of the graph.
for df in [train, test, non_borrower_user]:
    G.add_nodes_from(df['user_id'])

# One edge per loan-activity row: user_id -> reference_contact.
# Rows with a missing endpoint are skipped so that NaN never becomes a graph
# node (extra nodes would change the normalisation of degree centrality).
edges = loan_activities[['user_id', 'reference_contact']].dropna()
G.add_edges_from(zip(edges['user_id'], edges['reference_contact']))

# Network feature: degree centrality of each user.
# The centrality of ALL nodes is computed once and then looked up per user;
# recomputing it inside the per-row lambda repeats a whole-graph pass for
# every single row.
degree_centrality = nx.degree_centrality(G)
for df in [train, test, non_borrower_user]:
    # Column assignment mutates train / test / non_borrower_user in place.
    df['degree_centrality'] = df['user_id'].apply(
        lambda x: degree_centrality.get(x, 0)
    )

# Left-join the non_borrower_user columns onto train and test by user_id.
# Overlapping column names coming from non_borrower_user receive the
# '_non_borrower' suffix. Values missing after the join are replaced with 0.
combined_train = train.merge(
    non_borrower_user,
    on='user_id',
    how='left',
    suffixes=('', '_non_borrower'),
).fillna(0)
combined_test = test.merge(
    non_borrower_user,
    on='user_id',
    how='left',
    suffixes=('', '_non_borrower'),
).fillna(0)

# Separate features and target. user_id is excluded from the feature matrix.
X = combined_train.drop(columns=['user_id', 'label'])
y = combined_train['label']
X_test = combined_test.drop(columns=['user_id'])

# Hold out 20% of the training data for validation, stratified on the label
# so both splits keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit a Random Forest with default hyper-parameters (fixed seed).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Validation predictions: hard labels plus the probability of class index 1.
y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)[:, 1]

# Confusion matrix shown as an annotated heatmap.
cm = confusion_matrix(y_val, y_pred)
plt.figure(figsize=(8, 6))
confusion_axes = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
confusion_axes.set_title('Confusion Matrix')
confusion_axes.set_xlabel('Predicted Label')
confusion_axes.set_ylabel('True Label')
plt.show()

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

# ROC AUC computed directly from the predicted probabilities.
roc_auc = roc_auc_score(y_val, y_proba)
print("\nROC AUC Score:", roc_auc)

# ROC curve; the area is recomputed from the curve points for the legend.
fpr, tpr, thresholds = roc_curve(y_val, y_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
roc_axes = plt.gca()
roc_axes.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line.
roc_axes.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_axes.set_xlim([0.0, 1.0])
roc_axes.set_ylim([0.0, 1.05])
roc_axes.set_xlabel('False Positive Rate')
roc_axes.set_ylabel('True Positive Rate')
roc_axes.set_title('Receiver Operating Characteristic (ROC)')
roc_axes.legend(loc="lower right")
plt.show()

# Overall accuracy on the validation split.
accuracy = accuracy_score(y_val, y_pred)
print("\nAccuracy Score:", accuracy)

# Average precision (a summary of the precision-recall curve).
average_precision = average_precision_score(y_val, y_proba, average='macro')
print("\nAverage Precision Score:", average_precision)

# Predict labels for the test set with the fitted model.
y_test_pred = model.predict(X_test)

# Submission table: the test user ids alongside their predicted label.
submission = combined_test[['user_id']].assign(label=y_test_pred)

# Write the submission to CSV without the index column.
# NOTE(review): the target is a hard-coded absolute path; the directory must
# already exist in the runtime environment.
submission.to_csv('/mnt/data/submission1.csv', index=False)

