In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the transaction log CSV into a DataFrame
DATA_PATH = '../input/paysim1/PS_20174392719_1491204439457_log.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first ten rows to get a feel for the columns
df.head(n=10)

In [None]:
# Print the frame summary (column names, dtypes, non-null counts, memory usage)
df.info()

In [None]:
# Count missing values per column
df.isnull().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Count fraudulent vs. non-fraudulent transactions, and how many were flagged
(
    df["isFraud"].eq(1).sum(),
    df["isFraud"].eq(0).sum(),
    df["isFlaggedFraud"].eq(1).sum(),
)

In [None]:
# Plot every transaction amount against its row position to eyeball the spread
fig, ax = plt.subplots()
ax.scatter(df.index, df.amount)
# Label the figure so it can be read on its own when the notebook is skimmed
ax.set(title="Transaction amounts", xlabel="Row index", ylabel="amount");

In [None]:
# Only a cell's last expression is echoed, so print the counts explicitly;
# as a bare statement they would be computed and silently discarded.
print(df["isFlaggedFraud"].value_counts())
# Pass the variable by keyword: recent seaborn releases do not accept the data
# vector as a bare positional argument.
sns.countplot(x=df["isFlaggedFraud"]);

In [None]:
# Only a cell's last expression is echoed, so print the counts explicitly;
# as a bare statement they would be computed and silently discarded.
print(df["type"].value_counts())
# Pass the variable by keyword: recent seaborn releases do not accept the data
# vector as a bare positional argument.
sns.countplot(x=df["type"]);

### Data Processing

In [None]:
# Replace the `type` column with integer codes produced by a LabelEncoder.
# The fitted encoder is kept in `le`.
le = LabelEncoder()
encoded_type = le.fit_transform(df["type"])
df["type"] = encoded_type

df.head()

In [None]:
# Explore correlations to label.
# The frame still holds the string columns nameOrig / nameDest, and pandas >= 2.0
# raises when corr() meets non-numeric data, so restrict to numeric columns first.
# Older pandas dropped those columns silently, so the result is unchanged there.
numeric_corr = df.select_dtypes(include="number").corr()
numeric_corr["isFlaggedFraud"].sort_values(ascending=False)

In [None]:
# Explore correlations visually.
# Only numeric columns are correlated: pandas >= 2.0 raises when corr() meets
# the string columns (nameOrig / nameDest) that are still in the frame.
f, ax = plt.subplots(figsize=(12,6))
# Draw on the axes created above and suppress the Axes repr with the trailing `;`
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, fmt='.2f', ax=ax);

In [None]:
# Hold out 20% of the rows for validation and train on the remaining 80%.
# The label is isFlaggedFraud; the two name columns are excluded from the features.
# NOTE(review): 'isFraud' is not dropped, so it remains a feature - confirm this is intended.
y = df['isFlaggedFraud']
X = df.drop(columns=['isFlaggedFraud', 'nameDest', 'nameOrig'])

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Modelling

In [None]:
# Seed the forest so a fresh Restart & Run All rebuilds the same trees
# (same seed value the train/validation split uses).
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out validation features
y_predict = model.predict(X=X_val)

In [None]:
# Put true labels and predictions side by side (indexed like y_val),
# then look at ten randomly drawn rows.
actual_vs_predict = y_val.to_frame('Actual').assign(Prediction=y_predict)
actual_vs_predict.sample(n=10)

In [None]:
 # Score on the held-out split. The split cell names it X_val / y_val; it defines
 # no X_test / y_test, so the original call raised NameError.
 # For a classifier, score() reports mean accuracy.
 model_score = model.score(X_val, y_val)
 # End on the value so the cell actually displays it
 model_score