## Predict whether a person needs tax deduction

### Preprocessing 

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## Load the dataset
data = pd.read_csv('dataset.csv')
## Classify the data into features and target

X = data.drop(['donation_id','tax','name'], axis=1)
Y = data['tax']
labels = ['Not for tax deduction','For tax deduction']
## Split the data into training and testing data
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)


print (data['category'].isnull().sum())

0


### Try out SVC, Decision Tree, Random Forest and XGBoost, compare the performance (focus on accuracy)



### SVC

In [20]:
from sklearn import svm

ratio = 100
ratiovalues = [i for i in range(10, ratio, 10)]
kernelvalues = ["rbf", "poly"]#, "sigmoid"]
gammavalues =["scale"]#, "auto"]
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_kernel = ""
relative_best_gamma = ""


for k in ratiovalues:
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = k/100, random_state=71)
    for i in kernelvalues:
        for j in gammavalues:
            clf = svm.SVC(random_state = 71, kernel = i, gamma = j)
            clf.fit(X_train, Y_train)
            Y_pred_train = clf.predict(X_train) #train
            train_acc = accuracy_score(Y_pred_train, Y_train)
            Y_pred_test = clf.predict(X_test) #train
            test_acc = accuracy_score(Y_pred_test, Y_test)

            if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
                relative_best_train_score = train_acc
                relative_best_test_score = test_acc
                relative_best_ratio = k
                relative_best_kernel = i
                relative_best_gamma = j

print("best ratio of testing data:", relative_best_ratio, "best kernel:", relative_best_kernel, "best gamma:", relative_best_gamma, \
      "\nTraining score:", relative_best_train_score, "Testing score:", relative_best_test_score)

best ratio of testing data: 10 best kernel: rbf best gamma: scale 
Training score: 0.6032765891678258 Testing score: 0.6173256507053447


Best Performance
- ratio of testing data: 10
- kernal: rbf
- best gamma: scale
- Training score: 0.6032765
- Testing score: 0.6173256

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = relative_best_ratio/100, random_state=71)
clf = svm.SVC(random_state = 71, kernel = relative_best_kernel, gamma = relative_best_gamma)
clf.fit(X_train, Y_train)
Y_predict = clf.predict(X_test)
print(classification_report(Y_test, Y_predict, target_names=labels))


                       precision    recall  f1-score   support

Not for tax deduction       0.56      0.37      0.44      2101
    For tax deduction       0.64      0.80      0.71      2932

             accuracy                           0.62      5033
            macro avg       0.60      0.58      0.58      5033
         weighted avg       0.61      0.62      0.60      5033



### Decision Tree

In [22]:
from sklearn import tree


ratio = 100
ratiovalues = [i for i in range(10, ratio, 10)]
depth = 21
depthvalues = [i for i in range(1, depth)]
leaf = 10
leafvalues = [i for i in range(1, leaf)]
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_depth = 0
relative_best_leaf = 0

for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = k/100, random_state=71)
    for i in depthvalues:
        for j in leafvalues:
            clf = tree.DecisionTreeClassifier(random_state=2023, max_depth = i, min_samples_leaf = j)
            clf.fit(X_train, y_train)
            y_pred_train = clf.predict(X_train)
            train_acc = accuracy_score(y_pred_train, y_train)
            y_pred_test = clf.predict(X_test)
            test_acc = accuracy_score(y_pred_test, y_test)


            if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
                relative_best_train_score = train_acc
                relative_best_test_score = test_acc
                relative_best_ratio = k
                relative_best_depth = i
                relative_best_leaf = j


print("best ratio of testing data:", relative_best_ratio, "best depth:", relative_best_depth, "best min_sample_leaf:", relative_best_leaf, \
      "\nTraining score:", relative_best_train_score, "Testing score:", relative_best_test_score)

best ratio of testing data: 10 best depth: 15 best min_sample_leaf: 3 
Training score: 0.8026760283500033 Testing score: 0.793562487581959


Best Performance
- ratio of testing data: 10
- depth: 15
- min_sample_leaf: 3
- Training score: 0.80267
- Testing score: 0.79356

Classification Report

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = relative_best_ratio/100, random_state=71)
clf = tree.DecisionTreeClassifier(random_state = 71, max_depth = relative_best_depth, min_samples_leaf = relative_best_leaf)
clf.fit(X_train, Y_train)
Y_predict = clf.predict(X_test)
print(classification_report(Y_test, Y_predict, target_names=labels))


                       precision    recall  f1-score   support

Not for tax deduction       0.77      0.71      0.74      2101
    For tax deduction       0.81      0.85      0.83      2932

             accuracy                           0.79      5033
            macro avg       0.79      0.78      0.79      5033
         weighted avg       0.79      0.79      0.79      5033



### Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier


ratio = 100
ratiovalues = [i for i in range(10, ratio, 10)]
depth = 21
depthvalues = [i for i in range(1, depth)]
leaf = 10
leafvalues = [i for i in range(1, leaf)]
n_estimator = 250
n_estimatorvalues = [i for i in range(100, n_estimator, 50)]
relative_best_estimators = 0
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_depth = 0
relative_best_leaf = 0

for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = k/100, random_state=71)
    for l in n_estimatorvalues:
        for i in depthvalues:
            for j in leafvalues:
                clf = RandomForestClassifier(random_state=2023,n_estimators = l,  max_depth = i, min_samples_leaf = j)
                clf.fit(X_train, y_train)
                y_pred_train = clf.predict(X_train)
                train_acc = accuracy_score(y_pred_train, y_train)
                y_pred_test = clf.predict(X_test)
                test_acc = accuracy_score(y_pred_test, y_test)


                if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
                    relative_best_train_score = train_acc
                    relative_best_test_score = test_acc
                    relative_best_ratio = k
                    relative_best_estimators = l
                    relative_best_depth = i
                    relative_best_leaf = j


print("best ratio of testing data:", relative_best_ratio, "best depth:", relative_best_depth, "best min_sample_leaf:", relative_best_leaf, \
      "\nTraining score:", relative_best_train_score, "Testing score:", relative_best_test_score)

best ratio of testing data: 10 best depth: 12 best min_sample_leaf: 2 
Training score: 0.792320770130931 Testing score: 0.79256904430757


Best Performance
- ratio of testing data: 10
- depth: 12
- min_sample_leaf: 2
- Training score: 0.792320770130931
- Testing score: 0.79256904430757

Classification Report

In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = relative_best_ratio/100, random_state=71)
clf = RandomForestClassifier(random_state = 71, max_depth = relative_best_depth, min_samples_leaf = relative_best_leaf)
clf.fit(X_train, Y_train)
Y_predict = clf.predict(X_test)
print(classification_report(Y_test, Y_predict, target_names=labels))

                       precision    recall  f1-score   support

Not for tax deduction       0.80      0.67      0.73      2101
    For tax deduction       0.79      0.88      0.83      2932

             accuracy                           0.79      5033
            macro avg       0.80      0.77      0.78      5033
         weighted avg       0.79      0.79      0.79      5033



### XGBoost

In [26]:
from xgboost.sklearn import XGBClassifier


ratio = 100
ratiovalues = [i for i in range(10, ratio, 10)]
n_estimator = 250
n_estimatorvalues = [i for i in range(100, n_estimator, 50)]
depth = 8
depthvalues = [i for i in range(1, depth)]
rate = 3
ratevalues = [i for i in range(1, rate)]
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_estimators = 0
relative_best_depth = 0
relative_best_rate = 0

for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = k/100, random_state=71)
    for l in n_estimatorvalues:
        for i in depthvalues:
            for j in ratevalues:
                clf = XGBClassifier(random_state = 71, n_estimators = l, max_depth = i, learning_rate = j)
                clf.fit(X_train, y_train)
                y_pred_train = clf.predict(X_train) #train
                train_acc = accuracy_score(y_pred_train, y_train)
                y_pred_test = clf.predict(X_test) #train
                test_acc = accuracy_score(y_pred_test, y_test)

                if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
                    relative_best_train_score = train_acc
                    relative_best_test_score = test_acc
                    relative_best_ratio = k
                    relative_best_estimators = l
                    relative_best_depth = i
                    relative_best_rate = j

print("best ratio of testing data:", relative_best_ratio, "best no. of estimators:", relative_best_estimators, "best depth:", relative_best_depth, "best learning_rate:", relative_best_rate, \
      "\nTraining score:", relative_best_train_score, "Testing score:", relative_best_test_score)

best ratio of testing data: 10 best no. of estimators: 100 best depth: 4 best learning_rate: 1 
Training score: 0.8036475237905986 Testing score: 0.8025034770514604


Best Performance
- ratio of testing data: 10
- no. of estimators: 100
- depth: 4
- learning rate: 1
- Training score: 0.803647
- Testing score: 0.8025034

Classification Report

In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = relative_best_ratio/100, random_state=71)
clf = XGBClassifier(random_state = 71, max_depth = relative_best_depth, n_estimators = relative_best_estimators, learning_rate = relative_best_rate)
clf.fit(X_train, Y_train)
Y_predict = clf.predict(X_test)
print(classification_report(Y_test, Y_predict, target_names=labels))


                       precision    recall  f1-score   support

Not for tax deduction       0.78      0.74      0.76      2101
    For tax deduction       0.82      0.85      0.83      2932

             accuracy                           0.80      5033
            macro avg       0.80      0.79      0.80      5033
         weighted avg       0.80      0.80      0.80      5033

