# Initial Data Exploration 

In [None]:
import pandas as pd
import numpy as np

# Load the raw donation-history export; every later cell works on this frame.
data = pd.read_csv('DonationHistory.csv')

# First rows, for a quick look at the columns and typical values.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
data.info()
# Summary statistics as produced by describe() with its default settings.
print(data.describe())

# Data Cleaning and Preprocessing 

In [None]:
import pandas as pd
import numpy as np

# NOTE: every encoding below overwrites its source column, so this cell is
# meant to run once per load of `data`. A second run would operate on the
# already-encoded 0/1 values (MI would become all 1, the others all 0).

# MI: 1 when a middle initial is present, 0 when the value is missing
data['MI'] = np.where(data['MI'].notna(), 1, 0)

# Remaining flags: 1 when the column equals the listed value, 0 for anything
# else (including missing values).
#   ALUMNI_TYPE     -> 'Graduate' is 1 (assuming 'Undergraduate' is the 0 case)
#   GENDER          -> 'M' is 1, 'F' is 0
#   MARRIED_TO_ALUM -> 'Y' is 1, 'N' is 0
positive_value_by_column = {
    'ALUMNI_TYPE': 'Graduate',
    'GENDER': 'M',
    'MARRIED_TO_ALUM': 'Y',
}
for column, positive_value in positive_value_by_column.items():
    data[column] = np.where(data[column] == positive_value, 1, 0)

# Feature Engineering 

In [None]:
# Alumni holding both degrees: both class years must be present (> 0).
# A missing year compares as False here whether it is stored as NaN or as 0.
has_ug_and_grad = (data['UG_CLASS_YEAR'] > 0) & (data['GRAD_CLASS_YEAR'] > 0)

# UG_AND_GRAD: 1 when both an undergraduate and a graduate degree were completed
data['UG_AND_GRAD'] = np.where(has_ug_and_grad, 1, 0)

# TIME_PERIOD: years between the undergraduate and the graduate class year.
# The gap is only defined when both years exist, so it is gated on the same
# mask as UG_AND_GRAD and everyone else gets the sentinel -1. Gating on the
# graduate year alone would subtract a missing undergraduate year and produce
# NaN (or the graduate year itself) instead of the sentinel.
data['TIME_PERIOD'] = np.where(has_ug_and_grad,
                               data['GRAD_CLASS_YEAR'] - data['UG_CLASS_YEAR'],
                               -1)

In [None]:
# Columns that are not used in the rest of the analysis.
# (GENDER is encoded in the cleaning cell but is still removed here.)
columns_to_drop = ['CITY', 'ZIP', 'UG_SCHOOL', 'GRAD_DEGREE', 'GRAD_SCHOOL', 'GENDER']

# Rebind instead of dropping in place, and skip columns that are already gone,
# so that re-running this cell does not raise a KeyError.
data = data.drop(columns=columns_to_drop, errors='ignore')
print(data.head())

# Model Training and Evaluation

### Splitting Dataset into Training and Test Sets 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Target variable: 1 when the alumnus has made at least one donation, else 0
data['DONATED'] = np.where(data['NUMBER_OF_DONATIONS'] > 0, 1, 0)

# Identifier, target and donation-outcome columns are not used as features.
# STATE is excluded from the feature matrix too but stays in `data`, where a
# later cell reads it for plotting.
non_feature_columns = ['ACCOUNT_ID', 'NUMBER_OF_DONATIONS', 'VALUE_OF_DONATIONS', 'DONATED', 'STATE']
X = data.drop(columns=non_feature_columns)

# A later cell stores the model's own predictions in data['donation_probability'].
# If this cell is re-run afterwards, that column must not leak into the features.
X = X.drop(columns=['donation_probability'], errors='ignore')
y = data['DONATED']

# stratify=y keeps the donor / non-donor ratio the same in the train and test
# sets, so the held-out metrics are not distorted by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


### Hyperparameter Tuning 

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Random Forest: candidate values sampled by the randomized search
rf_param_grid = dict(
    n_estimators=[50, 100, 200],       # trees in the forest
    max_depth=[None, 10, 20, 30],      # depth limit per tree (None = unlimited)
    min_samples_split=[2, 5, 10],      # samples needed to split an internal node
    min_samples_leaf=[1, 2, 4],        # samples needed at a leaf node
    bootstrap=[True, False],           # whether each tree is fit on a bootstrap sample
)

# 20 sampled configurations, each scored by ROC AUC with 3-fold cross-validation
# on the training set; all available cores are used.
rf_base_estimator = RandomForestClassifier(random_state=42)
rf_random_search = RandomizedSearchCV(
    rf_base_estimator,
    rf_param_grid,
    n_iter=20,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
).fit(X_train, y_train)

print("Best Random Forest parameters:", rf_random_search.best_params_)

In [None]:
# XGBoost: candidate values sampled by the randomized search
xgb_param_grid = dict(
    n_estimators=[50, 100, 200],        # boosting rounds
    max_depth=[3, 6, 10],               # depth limit per tree
    learning_rate=[0.01, 0.1, 0.2],     # step size shrinkage
    subsample=[0.6, 0.8, 1.0],          # fraction of rows used per base learner
    colsample_bytree=[0.6, 0.8, 1.0],   # fraction of features used per tree
    gamma=[0, 0.1, 0.2],                # minimum loss reduction to split further
)

# Same search protocol as for the Random Forest: 20 sampled configurations,
# 3-fold cross-validation, ROC AUC as the selection metric.
xgb_base_estimator = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_random_search = RandomizedSearchCV(
    xgb_base_estimator,
    xgb_param_grid,
    n_iter=20,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
).fit(X_train, y_train)

print("Best XGBoost parameters:", xgb_random_search.best_params_)

### Model Evaluation

In [None]:
# Best estimator found by each search
rf_best_model = rf_random_search.best_estimator_
xgb_best_model = xgb_random_search.best_estimator_

# --- Random Forest on the held-out test set ---
rf_best_predictions = rf_best_model.predict(X_test)
rf_test_probabilities = rf_best_model.predict_proba(X_test)[:, 1]  # positive-class probability
rf_best_accuracy = accuracy_score(y_test, rf_best_predictions)
rf_best_auc = roc_auc_score(y_test, rf_test_probabilities)

# --- XGBoost on the held-out test set ---
xgb_best_predictions = xgb_best_model.predict(X_test)
xgb_test_probabilities = xgb_best_model.predict_proba(X_test)[:, 1]  # positive-class probability
xgb_best_accuracy = accuracy_score(y_test, xgb_best_predictions)
xgb_best_auc = roc_auc_score(y_test, xgb_test_probabilities)

# Report accuracy (hard labels) and ROC AUC (probabilities) for both models
print("Random Forest Accuracy:", rf_best_accuracy)
print("Random Forest AUC Score:", rf_best_auc)
print("XGBoost Accuracy:", xgb_best_accuracy)
print("XGBoost AUC Score:", xgb_best_auc)


### Chosen Best Model: XGBoost

In [None]:
rf_best_model = rf_random_search.best_estimator_
xgb_best_model = xgb_random_search.best_estimator_

# Score every alumnus with the tuned XGBoost model (positive-class probability).
# NOTE(review): X includes the rows the model was trained on, so these scores
# are not out-of-sample estimates for those rows.
data['donation_probability'] = xgb_best_model.predict_proba(X)[:, 1]

# Sort all alumni by predicted donation probability in descending order
sorted_alumni = data.sort_values(by='donation_probability', ascending=False)

# Top 10,000 alumni selected. Take an explicit copy: later cells add columns to
# this frame, and assigning to a bare head() slice triggers pandas'
# SettingWithCopyWarning.
top_10000_alumni = sorted_alumni.head(10000).copy()

print(top_10000_alumni)

# Clustering

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Number of clusters; K=4 was chosen by the elbow method. Defined once so the
# model and the per-cluster summary loop cannot drift apart.
N_CLUSTERS = 4

# Select features for clustering within the top 10,000 donors
clustering_columns = ['MI', 'TIME_PERIOD', 'JC', 'UG_AND_GRAD', 'ROTC', 'MARRIED_TO_ALUM']
top_donors_features = top_10000_alumni[clustering_columns]

# Standardize the features so each one has a comparable scale for K-Means
scaler = StandardScaler()
scaled_top_donors_features = scaler.fit_transform(top_donors_features)

# K-Means clustering. n_init is set explicitly because its default differs
# between scikit-learn releases; pinning it keeps the clustering reproducible.
kmeans_top_donors = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
cluster_labels = kmeans_top_donors.fit_predict(scaled_top_donors_features)

# Work on an independent copy before adding the column, so the assignment never
# targets a slice of another frame (avoids SettingWithCopyWarning).
top_10000_alumni = top_10000_alumni.copy()
top_10000_alumni['Cluster'] = cluster_labels

print(top_10000_alumni[['ACCOUNT_ID', 'donation_probability', 'Cluster']].head())

# Analyse clusters: summary statistics of the members of each cluster
for cluster in range(N_CLUSTERS):
    print(f"Cluster {cluster}")
    cluster_data = top_10000_alumni[top_10000_alumni['Cluster'] == cluster]
    print(cluster_data.describe())


### Number of Donors in each Cluster

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Cluster sizes, ordered by cluster label
cluster_counts = top_10000_alumni['Cluster'].value_counts().sort_index()

# One bar per cluster; height is the number of alumni assigned to it
plt.figure(figsize=(8, 5))
ax = sns.barplot(x=cluster_counts.index, y=cluster_counts.values, palette='viridis')
ax.set_xlabel('Cluster')
ax.set_ylabel('Number of Donors')
ax.set_title('Number of Donors in Each Cluster')
plt.show()


# Visualisations

### JC (Junior Counselor) and MI (Middle Initial) Across Clusters

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Flags whose per-cluster average is plotted, one figure each
features_to_plot = ['JC', 'MI']

for feature in features_to_plot:
    plt.figure(figsize=(10, 6))
    # Bar height is seaborn's aggregate of `feature` within each cluster;
    # ci=None turns the error bars off.
    ax = sns.barplot(data=top_10000_alumni, x='Cluster', y=feature, palette='viridis', ci=None)
    ax.set_title(f'Average {feature} across Clusters')
    plt.show()

### Distribution of Alumni Across US States

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Number of top-10,000 alumni per state; value_counts() returns the counts
# in descending order, so the first ten rows are the ten largest states.
state_counts = top_10000_alumni['STATE'].value_counts()
top_states = state_counts.head(10)

plt.figure(figsize=(14, 7))
ax = top_states.plot(kind='bar', title='Top States for Donors')
ax.set_xlabel('State')
ax.set_ylabel('Number of Donors')
plt.xticks(rotation=90)
plt.show()


### Number of Donors in each Time Period

In [None]:
import matplotlib.pyplot as plt

# Bin edges in years. pd.cut uses right-closed intervals by default, so the
# bins are (-1, 0], (0, 5], (5, 10], ..., (45, 50]. TIME_PERIOD == -1 (the
# sentinel written by the feature-engineering cell) and gaps above 50 years
# fall outside every bin and are therefore left out of the chart.
bins = [-1, 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

# Labels are derived from the edges so they always match the bins. For whole
# years, (0, 5] is 1-5 and (5, 10] is 6-10, so the ranges do not overlap.
labels = ['No gap'] + [f'({low + 1}-{high})' for low, high in zip(bins[1:-1], bins[2:])]

# Categorize the time periods into bins. Work on an independent copy before
# adding the column so the assignment never targets a slice of another frame
# (avoids SettingWithCopyWarning).
top_10000_alumni = top_10000_alumni.copy()
top_10000_alumni['TIME_PERIOD_BIN'] = pd.cut(top_10000_alumni['TIME_PERIOD'], bins=bins, labels=labels)

# Number of donors in each time period category, in bin order
time_period_counts = top_10000_alumni['TIME_PERIOD_BIN'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
time_period_counts.plot(kind='bar', color='navy')
plt.title('Distribution of Time Gap Between Studies')
plt.xlabel('Time Gap (Years)')
plt.ylabel('Number of Donors')
plt.xticks(rotation=45)
plt.show()
