## Load the dataset into a pandas DataFrame

In [None]:
import pandas

# Path of the CSV dataset, resolved against the notebook's working directory
data_file = "data.csv"
df = pandas.read_csv(data_file)

# A single pre-formatted argument inside print(...) emits the same text under
# the Python 2 print statement and the Python 3 print function, so this cell
# no longer raises a SyntaxError on Python 3 kernels.
print("No. of rows = %d" % df.shape[0])
print("No. of columns = %d" % df.shape[1])

### Let's take a look at the first rows of our data

In [None]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

### Some statistics about our dataset

In [None]:
# Display pandas' summary-statistics table for the loaded frame
df.describe()

### Before we proceed it's important to shuffle our data

In [None]:
import numpy as np

# reindex() returns a NEW frame and leaves the original untouched, so the
# result must be assigned back -- otherwise the shuffle is silently discarded.
# The labels are still a column of df at this point, so they move with their rows.
df = df.reindex(np.random.permutation(df.index))

### Split labels from features

In [None]:
# Detach the target column; pop() returns it and removes it from df in place
y = df.pop('Fraud Instance')

# Discard the '#' index column so it is not used as a feature
df = df.drop(labels=['#'], axis=1)

### Transform categorical features to numerical

In [None]:
def _claim_to_int(amount):
    """Strip every "$" from a claim-amount string and parse the rest as an int."""
    return int(amount.replace("$", ""))


# Replace each categorical column with its integer category codes
for _column in ('Marital Status', 'Accomodation Type'):
    df[_column] = df[_column].astype('category').cat.codes

# Convert the claim amounts from "$"-prefixed strings to integers
df['Claim Amount'] = df['Claim Amount'].apply(_claim_to_int)

### Let's see our data after the changes we have made

In [None]:
# Preview the first five rows again, now with the encoded features
df.head(n=5)

### It will be useful to see the label distribution in our data

In [None]:
# Number of samples per label. A single parenthesised argument prints
# identically under Python 2 and Python 3.
print(y.value_counts())

### Split data to train and test set
We will use 80% of the data for training and the remaining 20% for testing.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# stratify=y keeps the label proportions the same in the train and test
# parts, which matters when one label is much rarer than the other.
# random_state fixes the split so it is repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, stratify=y, random_state=42
)

### Classifier

Because this is a relatively easy problem and there is a lack of training data, we will use a decision tree as our classifier.

### Let's plot a learning curve that will help us visualise problems like overfitting, lack of training data, etc.

In [None]:
from sklearn.model_selection import learning_curve
from sklearn import tree
%matplotlib inline
import matplotlib.pyplot as plt

# Decision tree with balanced class weights, used only for the learning curve
classifier = tree.DecisionTreeClassifier(class_weight="balanced")

# Evaluate the tree with 5-fold cross-validation at ten training-set
# fractions, evenly spaced from 10% to 100% of X_train
train_sizes, train_scores, validation_scores = learning_curve(
    classifier,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
)

# Collapse axis 1 so there is one mean and one std per training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
validation_scores_mean = np.mean(validation_scores, axis=1)
validation_scores_std = np.std(validation_scores, axis=1)

#example taken from http://scikit-learn.org/0.15/auto_examples/plot_learning_curve.html
fig, ax = plt.subplots()
ax.set_title("Learning curves (Desicion Tree)")
ax.set_ylim((0.95, 1.01))
ax.set_xlabel("Training examples")
ax.set_ylabel("Score")
ax.grid()

# (mean, std, colour, legend label) for each of the two curves
_curves = (
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (validation_scores_mean, validation_scores_std, "g", "Cross-validation score"),
)

# Shaded band of one standard deviation around each mean
for _mean, _std, _color, _label in _curves:
    ax.fill_between(train_sizes, _mean - _std, _mean + _std, alpha=0.1, color=_color)

# The mean scores themselves, drawn as marked lines
for _mean, _std, _color, _label in _curves:
    ax.plot(train_sizes, _mean, 'o-', color=_color, label=_label)

ax.legend(loc="best")

plt.show()

As we see from the learning curve, our model is overfitted, and adding more data does not affect the prediction score. One way to solve the overfitting problem would be to include more features in our dataset or to reduce the training data.

### Train our classifier
There are several tuning parameters for our classifier, but for this problem we will use the defaults.

In [None]:
# Build a decision tree with default settings and fit it on the training
# split; fit() returns the fitted estimator, so the calls can be chained
classifier = tree.DecisionTreeClassifier().fit(X_train, y_train)

### See how well our trained classifier predicts our test data

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict labels for the held-out test rows
y_pred = classifier.predict(X_test)

# Single-argument print(...) calls produce the same output under Python 2
# and Python 3; %s formats the score with str(), exactly as the original
# print statement did.
print(classification_report(y_test, y_pred))
print("accuracy = %s" % accuracy_score(y_test, y_pred))