# Downloading dataset

In [None]:
# Install the Kaggle API token where the kaggle CLI looks for it.
# Create the config directory (-p: no error if it already exists).
!mkdir -p ~/.kaggle
# Move the token out of the working directory.
# NOTE(review): kaggle.json must already be in the working directory; on a
# second run of this cell the file has been moved and this line will fail.
!mv kaggle.json ~/.kaggle
# Restrict the token to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the ieee-fraud-detection competition archive into the working directory.
# NOTE(review): the next cell expects the archive at /content/ — presumably the
# working directory on Colab; confirm when running elsewhere.
!kaggle competitions download -c ieee-fraud-detection

In [None]:
# Extract the competition archive next to where the later cells read the CSVs.
# -o overwrites existing files without prompting, so re-running the notebook
#    does not stall on an interactive "replace?" question.
# -d /content extracts to the directory the read_csv calls use, regardless of
#    the current working directory.
!unzip -o '/content/ieee-fraud-detection.zip' -d /content

# Importing modules

In [None]:
# Data handling.
import pandas as pd
# Show every row when displaying a Series/DataFrame (used below to list the
# null count of every column without truncation).
pd.set_option('display.max_rows',None)
# Plotting.
import matplotlib.pyplot as plt
import seaborn as sns
# Modelling and evaluation.
# NOTE(review): MinMaxScaler is not used in the cells visible in this notebook.
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Loading and merging the data into a DataFrame

In [None]:
# Read the two training tables and join them on TransactionID.
# An outer join keeps every row from both tables, matched where possible.
df = pd.read_csv('/content/train_transaction.csv').merge(
    pd.read_csv('/content/train_identity.csv'),
    how='outer',
    on='TransactionID',
)


In [None]:
# Preview the first five rows of the merged training frame.
df.head(n=5)

In [None]:
# Print a summary of the merged frame (index, column dtypes, memory usage).
df.info()

# Checking for null values

In [None]:
# Number of missing values per column, worst columns first.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Names of the 232 columns with the most missing values (to be dropped).
null_counts = df.isna().sum()
dfdroplist = null_counts.sort_values(ascending=False).index[:232].tolist()

Some features have a high number of missing values. We cannot work with them, so let's drop the 232 columns with the most nulls.

In [None]:
# Remove the sparsest columns selected above from the training frame.
df = df.drop(columns=dfdroplist)

In [None]:
# Re-check the per-column missing-value counts after the drop, worst first.
df.isna().sum().sort_values(ascending=False)

Now let's collect the numeric and categorical feature names into two lists.

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# int64/float64 columns as numeric.
dfcategorical = list(df.select_dtypes(include='object').columns)
dfnumeric = list(df.select_dtypes(include=['int64', 'float64']).columns)

In [None]:
# Impute the remaining missing values:
#   - numeric columns get their column mean,
#   - categorical columns get their most frequent value.
#
# The filled column is assigned back to the frame. Calling
# fillna(inplace=True) on df[feature] acts on an intermediate selection; that
# pattern is deprecated in pandas 2.x and leaves df unchanged once
# Copy-on-Write is enabled.
for feature in dfnumeric:
    df[feature] = df[feature].fillna(df[feature].mean())

for feature in dfcategorical:
    modes = df[feature].mode()
    # mode() is empty when the column holds no non-null value; there is then
    # nothing sensible to fill with, so the column is left as-is instead of
    # raising on modes[0].
    if not modes.empty:
        df[feature] = df[feature].fillna(modes.iloc[0])



In [None]:
# Verify the imputation: per-column count of values that are still missing.
df.isna().sum()

# Distribution of the isFraud variable

In [None]:
# Histogram (with KDE overlay) of the target column to show the class balance.
ax = sns.histplot(df['isFraud'], kde=True, color='blue')
ax.set_title('isfraud')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

The data is highly imbalanced. We can expect outliers, but removing them is risky precisely because the dataset is so imbalanced.



# Converting categorical into numeric

In [None]:
# One-hot encode the categorical columns; drop_first removes the first level
# of each so the remaining dummies are not redundant.
df0 = pd.get_dummies(df, drop_first=True)


# confirming shapes

In [None]:
# Separate the predictors from the target.
# TransactionID is a row identifier rather than a property of the transaction,
# so it is kept out of the feature matrix along with the target itself; a model
# that splits on an identifier learns patterns that do not carry over to rows
# with unseen identifiers.
X = df0.drop(columns=['isFraud', 'TransactionID'])
y = df0['isFraud']

# Confirm that features and target line up row for row, then show the shapes.
assert len(X) == len(y)
X.shape, y.shape

# Feature importance

In [None]:
# Hold out 20% of the rows for evaluation; stratify on the target so both
# parts keep the same fraud ratio, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Fit a class-weighted random forest on every feature and rank the features
# by the importance scores the fitted forest reports.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
rf_clf.fit(X_train, y_train)

feature_importance = rf_clf.feature_importances_

# One row per feature, most important first.
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# Bar order for the plot: features from most to least important.
ranked_features = feature_importance_df.sort_values('Importance', ascending=False)['Feature']

fig, ax = plt.subplots(figsize=(30, 30))
sns.barplot(
    x='Importance',
    y='Feature',
    data=feature_importance_df,
    order=ranked_features,
    ax=ax,
)
ax.set_title('Feature Importance - Random Forest')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Features')
plt.show()

In [None]:
# Score the held-out rows with the all-features model and report ROC-AUC.
class_probabilities = rf_clf.predict_proba(X_test)
# Column 1 holds the probability of the second class listed in rf_clf.classes_.
y_pred_proba = class_probabilities[:, 1]
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)

print("ROC-AUC Score:", roc_auc)

Pick only the most important features.

In [None]:
# Keep the features whose importance reaches the cut-off.
# Retrain with different cut-off values to find the one giving the highest ROC-AUC.
picked_features = [
    column
    for column, score in zip(X_train.columns, feature_importance)
    if score >= 0.0010
]

Train with the picked features.

In [None]:
# Refit the same forest configuration using only the selected features.
# This rebinds rf_clf, replacing the all-features model.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
rf_clf.fit(X_train[picked_features], y_train)


In [None]:

# Score the held-out rows with the reduced-feature model and report ROC-AUC.
class_probabilities = rf_clf.predict_proba(X_test[picked_features])
# Column 1 holds the probability of the second class listed in rf_clf.classes_.
y_pred_proba = class_probabilities[:, 1]
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)

print("ROC-AUC Score:", roc_auc)


Delete the variables that are no longer needed to save memory.

In [None]:
# Release the training frames and splits; only the fitted model and the
# selected feature names are needed from here on.
del df, df0, X, y, X_train, X_test, y_train, y_test

Do the same for the test dataframe.

In [None]:
# Read the two test tables and join them on TransactionID.
# An outer join keeps every row from both tables, matched where possible.
test = pd.read_csv('/content/test_transaction.csv').merge(
    pd.read_csv('/content/test_identity.csv'),
    how='outer',
    on='TransactionID',
)

In [None]:
# Number of missing values per test column, worst columns first.
test.isna().sum().sort_values(ascending=False)

In [None]:
# Drop from the test frame exactly the columns that were dropped from the
# training frame (dfdroplist), instead of re-ranking the test columns by their
# own null counts. The 232 sparsest test columns are not guaranteed to be the
# same 232 as in training, and any difference leaves the test frame with a
# different feature set from the one the model was fitted on.
#
# Names are compared with '-' normalised to '_' in case the train and test
# files spell the same column differently — TODO confirm against the CSV headers.
train_dropped = {column.replace('-', '_') for column in dfdroplist}
droplist = [
    column
    for column in test.columns
    if column.replace('-', '_') in train_dropped
]

In [None]:
# Remove the columns listed in droplist from the test frame.
test = test.drop(columns=droplist)

In [None]:
# Split the test column names by dtype: object columns are treated as
# categorical, int64/float64 columns as numeric.
categorical = list(test.select_dtypes(include='object').columns)
numeric = list(test.select_dtypes(include=['int64', 'float64']).columns)

In [None]:
# Impute the remaining missing values in the test frame:
#   - numeric columns get their column mean,
#   - categorical columns get their most frequent value.
#
# The filled column is assigned back to the frame. Calling
# fillna(inplace=True) on test[feature] acts on an intermediate selection;
# that pattern is deprecated in pandas 2.x and leaves the frame unchanged once
# Copy-on-Write is enabled.
# NOTE(review): the fill values are computed from the test rows themselves,
# not from the training rows the model was fitted on.
for feature in numeric:
    test[feature] = test[feature].fillna(test[feature].mean())

for feature in categorical:
    modes = test[feature].mode()
    # mode() is empty when the column holds no non-null value; there is then
    # nothing sensible to fill with, so the column is left as-is instead of
    # raising on modes[0].
    if not modes.empty:
        test[feature] = test[feature].fillna(modes.iloc[0])

In [None]:
# Verify the imputation: per-column count of values still missing, worst first.
test.isna().sum().sort_values(ascending=False)

In [None]:
# One-hot encode the categorical test columns, dropping the first level of each.
test0 = pd.get_dummies(test, drop_first=True)

# Make submission file

In [None]:
# Build the feature matrix in exactly the column order the model was fitted on.
# The test frame was one-hot encoded separately from the training frame, so a
# dummy column for a category that never occurs in the test data does not
# exist in test0; plain test0[picked_features] would raise KeyError. reindex
# creates any such column filled with 0 ("category absent") and ignores
# columns the model does not use.
test_features = test0.reindex(columns=picked_features, fill_value=0)

# Submit the predicted probability of the positive class (column 1) rather
# than hard 0/1 labels, consistent with the ROC-AUC evaluation used earlier in
# this notebook, which ranks rows by score.
test_predictions = rf_clf.predict_proba(test_features)[:, 1]

submission_df = pd.DataFrame({'TransactionID': test['TransactionID'], 'isFraud': test_predictions})

submission_df.to_csv('submission.csv', index=False)