## Predict whether a person regularly donates

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Load Data
df = pd.read_csv('dataset.csv')
X = df.drop(['donation_id','donater_type','name'], axis=1)
y = df['donater_type']

#### Decision Tree

In [2]:
from sklearn.tree import DecisionTreeClassifier

ratiovalues = [10, 20, 30]
depthvalues = [i for i in range(1, 21)]
leafvalues = [i for i in range(1, 10)]
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_depth = 0
relative_best_leaf = 0

for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k/100, random_state=71)
    for i in depthvalues:
        for j in leafvalues:
            clf = DecisionTreeClassifier(random_state = 71, max_depth = i, min_samples_leaf = j)
            clf.fit(X_train, y_train)
            y_pred_train = clf.predict(X_train)
            train_acc = accuracy_score(y_pred_train, y_train)
            y_pred_test = clf.predict(X_test)
            test_acc = accuracy_score(y_pred_test, y_test)

            if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
                relative_best_train_score = train_acc
                relative_best_test_score = test_acc
                relative_best_ratio = k
                relative_best_depth = i
                relative_best_leaf = j


print("Best ratio of testing data:", relative_best_ratio)
print(f"Best Parameters: 'max_depth': {relative_best_depth}, 'min_samples_leaf': {relative_best_leaf}")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = relative_best_ratio/100, random_state=71)
clf = DecisionTreeClassifier(random_state = 71, max_depth = relative_best_depth, min_samples_leaf = relative_best_leaf)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

Best ratio of testing data: 20
Best Parameters: 'max_depth': 12, 'min_samples_leaf': 1
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.44      0.55      1042
           1       0.94      0.98      0.96      9023

    accuracy                           0.92     10065
   macro avg       0.82      0.71      0.75     10065
weighted avg       0.91      0.92      0.92     10065



#### Random Forest

In [3]:
from sklearn.ensemble import RandomForestClassifier

ratiovalues = [10, 20, 30]
n_estimatorvalues = [i for i in range(100, 250, 50)]
depthvalues = [i for i in range(1, 8)]
leafvalues = [i for i in range(1, 5)]
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_estimators = 0
relative_best_depth = 0
relative_best_leaf = 0

for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k/100, random_state=71)
    for l in n_estimatorvalues:
        for i in depthvalues:
            for j in leafvalues:
                clf = RandomForestClassifier(random_state = 71, n_estimators = l, max_depth = i, min_samples_leaf = j)
                clf.fit(X_train, y_train)
                y_pred_train = clf.predict(X_train) #train
                train_acc = accuracy_score(y_pred_train, y_train)
                y_pred_test = clf.predict(X_test) #train
                test_acc = accuracy_score(y_pred_test, y_test)

                if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
                    relative_best_train_score = train_acc
                    relative_best_test_score = test_acc
                    relative_best_ratio = k
                    relative_best_estimators = l
                    relative_best_depth = i
                    relative_best_leaf = j

print("Best ratio of testing data:", relative_best_ratio)
print(f"Best Parameters: 'n_estimators': {relative_best_estimators}, 'max_depth': {relative_best_depth}, 'min_samples_leaf': {relative_best_leaf}")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = relative_best_ratio/100, random_state=71)
clf = RandomForestClassifier(random_state = 71, n_estimators = relative_best_estimators, max_depth = relative_best_depth, min_samples_leaf = relative_best_leaf)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

Best ratio of testing data: 30
Best Parameters: 'n_estimators': 100, 'max_depth': 7, 'min_samples_leaf': 1
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.32      0.45      1565
           1       0.93      0.99      0.96     13533

    accuracy                           0.92     15098
   macro avg       0.84      0.65      0.70     15098
weighted avg       0.91      0.92      0.90     15098



#### SVC

In [4]:
from sklearn import svm

ratiovalues = [10, 20, 30]
kernelvalues = ["rbf", "poly"]
gammavalues =["scale"]
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_kernel = ""
relative_best_gamma = ""

for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k/100, random_state=71)
    for i in kernelvalues:
        for j in gammavalues:
            clf = svm.SVC(random_state = 71, kernel = i, gamma = j)
            clf.fit(X_train, y_train)
            y_pred_train = clf.predict(X_train)
            train_acc = accuracy_score(y_pred_train, y_train)
            y_pred_test = clf.predict(X_test)
            test_acc = accuracy_score(y_pred_test, y_test)

            if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
                relative_best_train_score = train_acc
                relative_best_test_score = test_acc
                relative_best_ratio = k
                relative_best_kernel = i
                relative_best_gamma = j

print("Best ratio of testing data:", relative_best_ratio)
print(f"Best Parameters: 'kernel': {relative_best_kernel}, 'gamma': {relative_best_gamma}")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = relative_best_ratio/100, random_state=71)
clf = svm.SVC(random_state = 71, kernel = relative_best_kernel, gamma = relative_best_gamma)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

Best ratio of testing data: 10
Best Parameters: 'kernel': rbf, 'gamma': scale
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.19      0.30       540
           1       0.91      0.99      0.95      4493

    accuracy                           0.91      5033
   macro avg       0.83      0.59      0.63      5033
weighted avg       0.89      0.91      0.88      5033



#### XGBoost

In [5]:
from xgboost.sklearn import XGBClassifier

ratiovalues = [10, 20, 30]
n_estimatorvalues = [i for i in range(100, 250, 50)]
depthvalues = [i for i in range(1, 8)]
ratevalues = [i for i in range(1, 3)]
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_estimators = 0
relative_best_depth = 0
relative_best_rate = 0

for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k/100, random_state=71)
    for l in n_estimatorvalues:
        for i in depthvalues:
            for j in ratevalues:
                clf = XGBClassifier(random_state = 71, n_estimators = l, max_depth = i, learning_rate = j)
                clf.fit(X_train, y_train)
                y_pred_train = clf.predict(X_train)
                train_acc = accuracy_score(y_pred_train, y_train)
                y_pred_test = clf.predict(X_test)
                test_acc = accuracy_score(y_pred_test, y_test)

                if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
                    relative_best_train_score = train_acc
                    relative_best_test_score = test_acc
                    relative_best_ratio = k
                    relative_best_estimators = l
                    relative_best_depth = i
                    relative_best_rate = j

print("Best ratio of testing data:", relative_best_ratio)
print(f"Best Parameters: 'n_estimators': {relative_best_estimators}, 'max_depth': {relative_best_depth}, 'learning_rate': {relative_best_rate}")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = relative_best_ratio/100, random_state=71)
clf = XGBClassifier(random_state = 71, n_estimators = relative_best_estimators, max_depth = relative_best_depth, learning_rate = relative_best_rate)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

Best ratio of testing data: 20
Best Parameters: 'n_estimators': 100, 'max_depth': 6, 'learning_rate': 1
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.46      0.56      1042
           1       0.94      0.98      0.96      9023

    accuracy                           0.93     10065
   macro avg       0.84      0.72      0.76     10065
weighted avg       0.92      0.93      0.92     10065

