# Problem statement of fraud check

## Importing libraries

In [1]:
import pandas as pd
from  sklearn.tree import DecisionTreeClassifier
import seaborn as sns

In [2]:
df=pd.read_csv('Fraud_check.csv')
df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


## Initial analysis

In [3]:
df.shape

(600, 6)

In [4]:
df.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

### Data preparation

In [5]:
df=pd.get_dummies(df,columns=['Undergrad','Marital.Status','Urban'])


In [6]:
df.head()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_NO,Undergrad_YES,Marital.Status_Divorced,Marital.Status_Married,Marital.Status_Single,Urban_NO,Urban_YES
0,68833,50047,10,1,0,0,0,1,0,1
1,33700,134075,18,0,1,1,0,0,0,1
2,36925,160205,30,1,0,0,1,0,0,1
3,50190,193264,15,0,1,0,0,1,0,1
4,81002,27533,28,1,0,0,1,0,1,0


In [7]:
df['Taxable.Income']=df['Taxable.Income'].map(lambda x:'risky' if x<30000 else 'good')

In [8]:
df

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_NO,Undergrad_YES,Marital.Status_Divorced,Marital.Status_Married,Marital.Status_Single,Urban_NO,Urban_YES
0,good,50047,10,1,0,0,0,1,0,1
1,good,134075,18,0,1,1,0,0,0,1
2,good,160205,30,1,0,0,1,0,0,1
3,good,193264,15,0,1,0,0,1,0,1
4,good,27533,28,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
595,good,39492,7,0,1,1,0,0,0,1
596,good,55369,2,0,1,1,0,0,0,1
597,good,154058,0,1,0,1,0,0,0,1
598,good,180083,17,0,1,0,1,0,1,0


## Splitting x and y

In [9]:
X = df.iloc[:,1:10]
y = df.iloc[:,0]

In [10]:
X.head()

Unnamed: 0,City.Population,Work.Experience,Undergrad_NO,Undergrad_YES,Marital.Status_Divorced,Marital.Status_Married,Marital.Status_Single,Urban_NO,Urban_YES
0,50047,10,1,0,0,0,1,0,1
1,134075,18,0,1,1,0,0,0,1
2,160205,30,1,0,0,1,0,0,1
3,193264,15,0,1,0,0,1,0,1
4,27533,28,1,0,0,1,0,1,0


In [11]:
y.head()

0    good
1    good
2    good
3    good
4    good
Name: Taxable.Income, dtype: object

In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=12)

## Model building,training and testing

In [13]:
from sklearn.ensemble import RandomForestClassifier
rf_model=RandomForestClassifier(n_estimators=100,max_depth=15)
rf_model.fit(X_train,y_train)

RandomForestClassifier(max_depth=15)

In [14]:
y_train_pred=rf_model.predict(X_train)

In [15]:
y_test_pred=rf_model.predict(X_test)

# Model evaluation

## For train data

In [16]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [17]:
accuracy_score(y_train,y_train_pred)

1.0

In [18]:
confusion_matrix(y_train,y_train_pred)

array([[369,   0],
       [  0, 111]], dtype=int64)

## For test data

In [19]:
accuracy_score(y_test,y_test_pred)


0.7666666666666667

In [20]:
confusion_matrix(y_test,y_test_pred)

array([[92, 15],
       [13,  0]], dtype=int64)

# Comparison between actual and predicted values

In [21]:
final=pd.DataFrame({'Actual':y_test,'Predicted':y_test_pred})
final

Unnamed: 0,Actual,Predicted
326,risky,good
360,good,good
348,good,good
567,risky,good
244,good,good
...,...,...
554,good,good
472,good,good
540,risky,good
531,good,good
