# Lab | Random Forests

For this lab, you will be using the CSV files provided in the `files_for_lab` folder.



In [217]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder

In [218]:
numerical = pd.read_csv('files_for_lab/numerical.csv')
categorical = pd.read_csv('files_for_lab/categorical.csv')
targets = pd.read_csv('files_for_lab/target.csv')


In [219]:
data = pd.concat([numerical, categorical, targets], axis = 1)
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [220]:
data.isna().sum().sum()

0



- Apply the Random Forests algorithm but this time only by upscaling the data.


In [221]:
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis = 1)

In [222]:
numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes('object')

In [223]:
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
X = pd.concat([numericalX, encoded_categorical], axis = 1)


In [224]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [225]:
y_train.value_counts()

0    72464
1     3865
Name: TARGET_B, dtype: int64

In [226]:
# for downsampling we need to temporarily concat X_train and y_train
trainset = pd.concat([X_train, y_train], axis=1)

category_0 = trainset[trainset['TARGET_B'] == 0] #Highest population
category_1 = trainset[trainset['TARGET_B'] == 1] #lowest population

In [227]:
from sklearn.utils import resample

category_1_oversampled = resample(category_1, 
                                  replace=True, # the difference
                                  n_samples = len(category_0))

In [228]:
print(category_0.shape)
print(category_1_oversampled.shape)

(72464, 356)
(72464, 356)


The data is upscaled.

In [229]:
# category_0 = trainset[trainset['TARGET_B']== 1 ]
trainset_new = pd.concat([category_0, category_1_oversampled], axis = 0)
trainset_new = trainset_new.sample(frac =1) #randomize the rows
X_train = trainset_new.drop(['TARGET_B'], axis=1)
y_train = trainset_new['TARGET_B']
#data = data.reset_index(drop=True)
print(X_train.shape)

(144928, 355)


In [230]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_train_regression = X_train['TARGET_D']
y_test_regression = X_test['TARGET_D']

# Now we can remove the column target d from the set of features
X_train = X_train.drop(['TARGET_D'], axis = 1)
X_test = X_test.drop(['TARGET_D'], axis = 1)

In [231]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.2,
                             random_state = 42)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

y_pred = clf.predict(X_test)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))



0.6183829211746522




0.6027354189592832




0    18105
1      978
Name: TARGET_B, dtype: int64

array([[10976,  7129],
       [  452,   526]], dtype=int64)

Now I will try to use feature selection.

In [232]:
numerical = pd.read_csv('files_for_lab/numerical.csv')
categorical = pd.read_csv('files_for_lab/categorical.csv')
targets = pd.read_csv('files_for_lab/target.csv')


Limiting the amount of categories.

In [233]:
from sklearn.feature_selection import VarianceThreshold 
var_threshold = 0.02
sel = VarianceThreshold(threshold=(var_threshold))

In [234]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numerical_scaled = scaler.fit_transform(numerical)
# I need the numerical data scaled to use variance threshold

In [235]:
# This drops the columns that have a variance less than this threshold
sel = sel.fit(numerical_scaled)
temp = sel.transform(numerical_scaled)
temp = pd.DataFrame(temp)
print(numerical_scaled.shape)
print(temp.shape)

(95412, 315)
(95412, 79)


In [236]:
# get columns to keep/drop
sel.get_support()
var_list = list(sel.get_support())
[col[0] for col in zip(numerical.columns, var_list) if col[1] == False]

removed_columns = pd.DataFrame(data=(numerical.columns,sel.variances_,sel.get_support()), index=('column_name','variance','statement')).T
removed_columns.loc[(removed_columns['statement'] == False),:]

Unnamed: 0,column_name,variance,statement
0,TCODE,0.000175,False
4,HIT,0.001491,False
5,MALEMILI,0.002618,False
6,MALEVET,0.013424,False
9,LOCALGOV,0.001968,False
...,...,...,...
306,MINRAMNT,0.000077,False
307,MAXRAMNT,0.000025,False
308,LASTGIFT,0.000195,False
309,TIMELAG,0.000051,False


In [237]:
removed_columns.loc[(removed_columns['statement'] == True),:]['column_name']

1           AGE
2        INCOME
3       WEALTH1
7      VIETVETS
8      WWIIVETS
         ...   
299    CARDPROM
311    CONTROLN
312    HPHONE_D
313      RFA_2F
314    CLUSTER2
Name: column_name, Length: 79, dtype: object

In [238]:
numerical_selected = numerical[removed_columns.loc[(removed_columns['statement'] == True),:]['column_name']]
numerical_selected

Unnamed: 0,AGE,INCOME,WEALTH1,VIETVETS,WWIIVETS,WEALTH2,POP90C1,POP90C2,POP90C3,ETH1,...,HC17,HC18,HC19,MHUC1,MHUC2,CARDPROM,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,60.000000,5,9,34,18,5,0,35,65,92,...,33,65,40,6,2,27,95515,0,4,39
1,46.000000,6,9,55,11,9,99,0,0,67,...,99,0,99,20,4,12,148535,0,2,1
2,61.611649,3,1,29,33,1,0,2,98,96,...,22,77,17,9,2,26,15078,1,4,60
3,70.000000,1,4,14,31,0,0,8,92,61,...,23,77,22,16,2,27,172556,1,4,41
4,78.000000,3,2,9,53,9,99,0,0,2,...,99,1,21,6,2,43,7112,1,2,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,61.611649,5,9,47,11,9,99,0,0,78,...,93,7,98,16,4,6,184568,0,1,12
95408,48.000000,7,9,43,19,9,96,0,4,91,...,69,31,67,18,5,4,122706,1,1,2
95409,60.000000,5,9,46,20,9,99,0,0,82,...,99,0,99,5,2,14,189641,1,3,34
95410,58.000000,7,9,35,20,7,99,0,0,92,...,99,0,99,12,3,36,4693,1,4,11


In [239]:
data = pd.concat([numerical_selected, categorical, targets], axis = 1)
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis = 1)
numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes('object')

encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
X = pd.concat([numericalX, encoded_categorical], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

trainset = pd.concat([X_train, y_train], axis=1)

category_0 = trainset[trainset['TARGET_B'] == 0] #Highest population
category_1 = trainset[trainset['TARGET_B'] == 1] #lowest population

category_1_oversampled = resample(category_1, 
                                  replace=True, # the difference
                                  n_samples = len(category_0))

In [240]:
trainset_new = pd.concat([category_0, category_1_oversampled], axis = 0)
trainset_new = trainset_new.sample(frac =1) #randomize the rows
X_train = trainset_new.drop(['TARGET_B'], axis=1)
y_train = trainset_new['TARGET_B']
print(X_train.shape)

(144928, 119)


In [241]:
category_0.shape

(72464, 120)

In [242]:
category_1.shape

(3865, 120)

In [243]:
category_1_oversampled.shape

(72464, 120)

In [244]:
trainset_new.isna().sum().sum()

0

In [245]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_train_regression = X_train['TARGET_D']
y_test_regression = X_test['TARGET_D']

# Now we can remove the column target d from the set of features
X_train = X_train.drop(['TARGET_D'], axis = 1)
X_test = X_test.drop(['TARGET_D'], axis = 1)

In [246]:
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.2,
                             random_state = 42)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

y_pred = clf.predict(X_test)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))



0.6211774122322808




0.591206833307132




0    18105
1      978
Name: TARGET_B, dtype: int64

array([[10713,  7392],
       [  409,   569]], dtype=int64)

- Discuss the output and its impact in the bussiness scenario. Is the cost of a false positive equals to the cost of the false negative? How would you change your algorithm or data in order to maximize the return of the bussiness?

As we can see, the amount of false negatives is very low compared to the actual positives, but the false positives might be too many.  
In the case of this dataset, the main cost would be the targeted marketing plans: 7k more ads could cost too much, but those extra ads could work to convince people that are close enough to the target to be convinced by that marketing stunt.

The algorithm itself could be improved by using a different model for the feature selection, or even more advanced techniques like bootstrapping

# Lab | Final regression model in "Health Care for All" Case

### Instructions

At this point, we have created a model to predict who will make a donation and who won't. But, what about the ammount of money that each person will give?
In this lab, subset those that made a donation and use that subset to create a model to predict how much money will they give.

Evaluate the result of your model and estimate how much better the result are for the bussiness in comparison with the naive scenario we discuss on Monday.

In [247]:
#I only want data from the people who donated
subset = data[data['TARGET_B'] == 1]
subset

Unnamed: 0,AGE,INCOME,WEALTH1,VIETVETS,WWIIVETS,WEALTH2,POP90C1,POP90C2,POP90C3,ETH1,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
20,62.000000,3,8,40,27,9,99,0,0,97,...,88,1,94,4,96,3,87,1,1,4.0
30,61.611649,5,9,58,16,9,99,0,0,94,...,90,4,93,1,95,12,90,4,1,7.0
45,66.000000,5,9,24,39,9,89,0,11,99,...,93,12,94,4,96,2,87,4,1,5.0
78,69.000000,6,9,20,54,9,99,0,0,97,...,90,1,95,3,95,11,90,1,1,13.0
93,73.000000,1,7,53,8,7,99,0,0,7,...,92,9,95,9,95,9,92,9,1,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95298,45.000000,5,9,28,37,2,0,99,1,94,...,89,6,96,1,96,1,86,8,1,20.0
95309,51.000000,5,6,43,24,6,99,0,0,90,...,93,10,94,2,95,12,93,10,1,15.0
95398,86.000000,5,9,21,26,9,99,0,0,85,...,89,6,95,11,96,2,87,11,1,3.0
95403,58.000000,4,9,46,20,5,0,1,99,99,...,90,3,93,12,96,1,90,3,1,10.0


In [248]:
subset.dtypes.value_counts()

int64      92
object      7
float64     4
dtype: int64

In [249]:
len(subset.drop(['TARGET_D', 'TARGET_B'], axis = 1) )

4843

In [250]:
y = subset['TARGET_D']
len(y)

4843

In [251]:
X = subset.drop(['TARGET_D', 'TARGET_B'], axis = 1) #I also don't need the column TARGET_B, as it will be 1 in every row

In [252]:
X.shape

(4843, 101)

In [253]:
numericalX = X.select_dtypes(np.number).reset_index()
categoricalX = X.select_dtypes('object')

encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical).reset_index()
X = pd.concat([numericalX, encoded_categorical], axis = 1)

In [254]:
X.shape

(4843, 120)

In [255]:
y

20        4.0
30        7.0
45        5.0
78       13.0
93       10.0
         ... 
95298    20.0
95309    15.0
95398     3.0
95403    10.0
95410    18.0
Name: TARGET_D, Length: 4843, dtype: float64

In [256]:
# linear regression
from sklearn import linear_model

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [257]:

lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

from sklearn.metrics import r2_score
predictions = lm.predict(X_train)
r2_score(y_train, predictions)




0.3963402296244126

In [260]:
predictions = lm.predict(X_test)
r2_score(y_test, predictions)



0.36463844703898707

In [261]:

from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test,predictions))

9.720668509159967