In [1]:
import pandas as pd
import numpy as np

numerical = pd.read_csv('./files_for_lab/numerical.csv')
categorical = pd.read_csv('./files_for_lab/categorical.csv')
targets = pd.read_csv('./files_for_lab/target.csv')
data = pd.concat([numerical, categorical, targets], axis = 1)

In [2]:
category_1 = data[data['TARGET_B']==1].sample(len(data[data['TARGET_B']==0]), replace=True)
print(category_1.shape)

category_0 = data[data['TARGET_B'] == 0 ]
data = pd.concat([category_0, category_1], axis = 0)
data = data.dropna()
data = data.sample(frac =1) #randomize the rows
data = data.reset_index(drop=True)
print(data.shape)

(90569, 339)
(181138, 339)


In [3]:
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis = 1)

numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(np.object)

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
X = pd.concat([numericalX, encoded_categorical], axis = 1)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [4]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_train_regression = X_train['TARGET_D']
y_test_regression = X_test['TARGET_D']

X_train = X_train.drop(['TARGET_D'], axis = 1)
X_test = X_test.drop(['TARGET_D'], axis = 1)

In [5]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.616106548892416
0.614497074086342


In [6]:
# # Runs for too long...

# from sklearn.model_selection import cross_val_score

# clf = RandomForestClassifier(max_depth=5,
#                              min_samples_split=20,
#                              min_samples_leaf =20)
# cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
# print(np.mean(cross_val_scores))

In [7]:
from sklearn.metrics import confusion_matrix

y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

# True positive | False positive
# False negative | True negative

array([[11150,  7028],
       [ 6938, 11112]])

Discuss the output and its impact in the bussiness scenario. Is the cost of a false positive equals to the cost of the false negative? How would you change your algorithm or data in order to maximize the return of the bussiness?

We got a model that shows a stable score of approx. 0.61, similar to the downscaling one.
The false negative's cost is much more adverse in this situation - these are people who will not be targeted with the fundraising campaign and then are likely not to contribute.
Given the mean donation amount of 13 USD, a total of approx. 7k false negatives will translate to a 90k USD loss.
On the flip side, directing the false positives (people who will not donate despite targetted campaign) will only cost approx. 0.6 YSD x 7k, or 4.2k USD in total.

To minimize the losses, we should aim to reduce the number of false negatives. 
We should try out different classification models (there are 29), see below, combined with imbalance treatments (incl. SMOTE) to finds out a model that shows the best results.

In [8]:
# from lazypredict.Supervised import LazyClassifier

# clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
# models,predictions = clf.fit(X_train, X_test, y_train, y_test)

# print(models)

In [9]:
X_test.reset_index(drop=True, inplace=True)
y_test_regression.reset_index(drop=True, inplace=True)
data = pd.DataFrame([])
data = pd.concat([X_test,pd.Series(y_pred)], axis = 1)
data.columns = [*data.columns[:-1], 'predicted_donor']
data = pd.concat([data,y_test_regression], axis = 1)
data = data.rename(columns={'TARGET_D': 'actual donation'})

In [None]:
# ---- Lab 2 Starts here

In [10]:
# Building regression model

from sklearn.tree import DecisionTreeRegressor

regr = DecisionTreeRegressor(max_depth=15)
model = regr.fit(X_train, y_train)

print("test data accuracy was: ",regr.score(X_test, y_test_regression))
print("train data accuracy was: ",regr.score(X_train, y_train_regression))

#-0.36.. interesting

test data accuracy was:  -0.3489348526815028
train data accuracy was:  -0.3566195534503511


In [None]:
# Treat with RFE first

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

y = y_train_regression

from sklearn.feature_selection import RFE
from sklearn import linear_model
lm = linear_model.LinearRegression()
rfe = RFE(lm, n_features_to_select=20, verbose=False)
rfe.fit(X_train_scaled, y)

In [None]:
df = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
df['Column_name'] = pd.DataFrame(X_train).columns
df[df['Rank']==1]