# Lab | Random Forests

For this lab, you will be using the CSV files provided in the `files_for_lab` folder.  These are cleaned versions of the learningSet data from the Case Study 'Healthcare for All'.   
Begin a new Jupyter Notebook after Forking and Cloning this Repo.

### Instructions

- Apply the Random Forests algorithm but this time only by upscaling the data.
- Use Feature Selections that you have learned in class to decide if you want to use all of the features (Variance Threshold, RFE, PCA, etc.)
- Discuss the output and its impact in the bussiness scenario. Is the cost of a false positive equals to the cost of the false negative? How would you change your algorithm or data in order to maximize the return of the business?

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

#### Importing datasets

In [2]:
numerical = pd.read_csv('./files_for_lab/numerical.csv')
categorical = pd.read_csv('./files_for_lab/categorical.csv')
target = pd.read_csv('./files_for_lab/target.csv')

data = pd.concat([categorical,numerical,target],axis=1)

In [21]:
display(data.head(),data.shape)

Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,TARGET_B,TARGET_D
0,IL,36,H,F,3,L,E,C,T,2,...,12.0,10.0,4,7.741935,95515,0,4,39,0,0.0
1,CA,14,H,M,3,L,G,A,S,1,...,25.0,25.0,18,15.666667,148535,0,2,1,0,0.0
2,NC,43,U,M,3,L,E,C,R,2,...,16.0,5.0,12,7.481481,15078,1,4,60,0,0.0
3,CA,44,U,F,3,L,E,C,R,2,...,11.0,10.0,9,6.8125,172556,1,4,41,0,0.0
4,FL,16,H,F,3,L,F,A,S,2,...,15.0,15.0,14,6.864865,7112,1,2,26,0,0.0


(95412, 339)

###### Target D (Amount donated)

In [4]:
target_d = data[data['TARGET_D']!=0]

###### Checking NaN

In [5]:

nulls = [column for column in data if data[column].isna().sum() > 0]
nulls       

[]

###### Upsampling - Imbalanced data

In [6]:
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [7]:
# Upsampling
no_donate = data[data['TARGET_B']==0]
yes_donate = data[data['TARGET_B']==1]

from sklearn.utils import resample
yes_donate_oversampled = resample(yes_donate, 
                                    replace=True,
                                    n_samples = len(no_donate),
                                    random_state=42)


display(no_donate.shape)
display(yes_donate_oversampled.shape)

(90569, 339)

(90569, 339)

In [8]:
oversampled = pd.concat([no_donate,yes_donate_oversampled])

In [9]:
#X-Y Split, train-test split

X = oversampled.drop(['TARGET_B','TARGET_D'],axis=1)
y = oversampled['TARGET_B']


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [11]:
#cat/num X-train
#make a split between the continuous and discrete variables as I don't want to scale or encode the discrete ones, I want to keep their current values
X_train_num = X_train.select_dtypes(include = np.number)
X_train_cat = X_train.select_dtypes(object)

#cat/num X-test
X_test_num = X_test.select_dtypes(include = np.number)
X_test_cat = X_test.select_dtypes(object)


from sklearn.preprocessing import StandardScaler
#Scale X-train
transformer = StandardScaler().fit(X_train_num)
X_scaled_train = pd.DataFrame(transformer.transform(X_train_num),columns=X_train_num.columns)

#Scale X-test
X_scaled_test = pd.DataFrame(transformer.transform(X_test_num),columns=X_test_num.columns)
X_scaled_test

Unnamed: 0,CLUSTER,DATASRCE,DOMAIN_B,ODATEW_YR,ODATEW_MM,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,1.182838,0.626022,0.159428,0.531017,-0.029559,-1.320474,-0.311571,1.334569,-0.617088,0.963573,...,-0.329634,0.286884,0.237411,0.623210,-0.444501,0.537244,1.554770,-0.998298,0.829085,-1.159685
1,-0.000362,0.626022,0.159428,1.112893,-0.029559,-1.320474,-0.311571,0.960959,-0.099493,0.412726,...,-0.964663,0.832180,-0.167817,-0.092231,0.148719,0.218165,-0.954379,-0.998298,-0.949825,-0.947620
2,-1.392363,-0.973903,1.535152,-0.050858,-0.029559,0.994498,2.598269,-0.159871,1.194494,0.412726,...,-0.752987,0.286884,0.237411,0.623210,0.593634,0.386909,0.868392,-0.998298,-0.949825,-0.152375
3,0.138838,0.626022,1.535152,1.403831,-0.029559,-1.320474,-0.311571,1.334569,-1.393481,0.963573,...,-0.964663,1.377477,0.034797,0.265490,0.148719,0.678375,1.082045,-0.998298,-0.949825,1.013985
4,-0.000362,-2.573828,0.159428,-0.050858,-0.029559,0.346306,-0.634886,-0.159871,0.676899,0.412726,...,-0.964663,0.286884,-0.127294,-0.020687,-1.186026,-0.057962,-1.699336,-0.998298,-0.949825,-1.053652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54337,1.113238,0.626022,0.159428,-1.505548,-0.029559,-1.320474,-0.311571,1.334569,-1.393481,0.412726,...,2.633833,-0.258413,-0.370430,-0.807672,-0.444501,-0.677155,-1.219564,-0.998298,1.718541,-1.530799
54338,1.391639,0.626022,0.159428,-0.923672,-0.029559,-1.320474,-0.311571,-0.533481,0.935697,-0.138120,...,0.093718,-0.476531,-0.167817,-0.163775,2.224989,-0.426131,0.260573,-0.998298,-0.949825,1.491132
54339,1.391639,-0.973903,0.159428,-0.923672,-0.029559,-1.320474,-0.311571,-0.907091,-0.358291,-2.341506,...,0.305394,-0.694650,0.034797,-0.449952,0.297024,-0.173015,0.771088,-0.998298,-0.060370,1.491132
54340,1.252439,0.626022,0.159428,-1.505548,-0.029559,-0.301886,0.981691,-0.159871,0.935697,0.963573,...,0.728747,-0.258413,-0.137425,-0.038573,-0.444501,-0.266974,1.406762,1.001705,0.829085,1.491132


In [12]:
from sklearn.preprocessing import OneHotEncoder
#Create encoder
encoder = OneHotEncoder(drop='first').fit(X_train_cat)

#encode train
encoded_train = encoder.transform(X_train_cat).toarray()
encoded_train = pd.DataFrame(encoded_train)

#encode test
encoded_test = encoder.transform(X_test_cat).toarray()
encoded_test = pd.DataFrame(encoded_test)

In [13]:
train_scaled = pd.concat([encoded_train,X_scaled_train],axis=1)
test_scaled = pd.concat([encoded_test,X_scaled_test],axis=1)
df = pd.concat([test_scaled,train_scaled],axis=0)

In [14]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(train_scaled, y_train)
print(clf.score(train_scaled, y_train))
print(clf.score(test_scaled, y_test))



0.6157528628663365




0.6124728570902801


In [15]:
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
cross_val_scores = cross_val_score(clf, train_scaled, y_train, cv=10)
print(np.mean(cross_val_scores))
print(cross_val_scores)



0.6119120870522134
[0.60615142 0.6101735  0.60780757 0.6112776  0.61600946 0.61506309
 0.61211452 0.61140468 0.61455951 0.61455951]




In [16]:
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

In [17]:
clf = RandomForestClassifier(max_depth=10,
                             min_samples_split=20,
                             min_samples_leaf =20).fit(train_scaled, y_train)

pred_RF = clf.predict(test_scaled)

print('accuracy:', accuracy_score(y_test, pred_RF))
print("precision: ",precision_score(y_test,pred_RF))
print("recall: ",recall_score(y_test,pred_RF))
print("f1: ",f1_score(y_test,pred_RF))



accuracy: 0.8018843620036068
precision:  0.7887343484443828
recall:  0.8294729162867044
f1:  0.8085908331259113


In [18]:
round(target_d['TARGET_D'].mean(),2)


15.62

In [19]:
donation_gained = 11236 * 15.62
donation_lost = 7018 * 15.62
spend_marketing = (7072+11236) * 0.68
no_return_marketing = 7072 * 0.68

print('Donation amount gained:',round(donation_gained,2))
print('Donation amount lost:',round(donation_lost,2))
print('Amount spend on marketing:',round(spend_marketing,2))
print('Amount lost on marketing:',round(no_return_marketing,2))

Donation amount gained: 175506.32
Donation amount lost: 109621.16
Amount spend on marketing: 12449.44
Amount lost on marketing: 4808.96


# Lab | Final regression model in "Health Care for All" Case

Instructions
At this point, we have created a model to predict who will make a donation and who won't (Classification Model). But, what about the ammount of money that each person will give?

In this lab, subset those that have made a donation (Target B) and use that subset to create a model to predict how much money will they give (Target D) (Regression Model).

- Only look at people who have donated (Target B = 1)
- Use this new dataframe to predict how much they will donate (Target D)
- Using the regression model, make predictions on all of the people our classification model predicted will donate.
- Evaluate the result of your model and estimate how much better the result are for the business in comparison with the naive scenario we discuss on Monday. (Just sending donation cards to everyone)

In [262]:
numerical = data.select_dtypes(include = np.number)
numerical = numerical.drop(['TARGET_B','TARGET_D'], axis = 1)
categorical = data.select_dtypes(include = object)
display(numerical.shape,categorical.shape)

(95412, 330)

(95412, 7)

In [263]:
num_scaled = pd.DataFrame(transformer.transform(numerical), columns = numerical.columns, index = numerical.index)

In [264]:
encoded = pd.DataFrame(encoder.transform(categorical).toarray(), index=categorical.index)

In [265]:
#column_names = encoder.get_feature_names_out(X_train_cat.columns)

In [266]:
X_all = pd.concat([encoded , num_scaled], axis=1)

In [267]:
X_all.shape

(95412, 354)

In [268]:
X_all.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.787128,-0.258413,-0.289385,-0.449952,-0.592806,-0.449883,-0.029524,-0.998298,1.718541,0.430805
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.964663,0.286884,0.237411,0.62321,1.483464,0.279526,0.90458,-0.998298,-0.06037,-1.583816
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.787128,-0.58559,-0.127294,-0.807672,0.593634,-0.473856,-1.446659,1.001705,1.718541,1.544148
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.305394,-0.58559,-0.329908,-0.449952,0.148719,-0.535431,1.32778,1.001705,1.718541,0.536838
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.517071,-0.476531,-0.167817,-0.092231,0.890244,-0.530611,-1.587004,1.001705,-0.06037,-0.258407


In [269]:
#X_all_selected = X_all.drop(droplist_full, axis=1)

###### Column 'Predicted_Donnors'

In [270]:
y_pred = clf.predict(X_all)
X_all['Predicted_Donnors'] = y_pred
display(X_all, X_all.shape)



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,Predicted_Donnors
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.258413,-0.289385,-0.449952,-0.592806,-0.449883,-0.029524,-0.998298,1.718541,0.430805,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.286884,0.237411,0.623210,1.483464,0.279526,0.904580,-0.998298,-0.060370,-1.583816,0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.585590,-0.127294,-0.807672,0.593634,-0.473856,-1.446659,1.001705,1.718541,1.544148,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.585590,-0.329908,-0.449952,0.148719,-0.535431,1.327780,1.001705,1.718541,0.536838,1
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.476531,-0.167817,-0.092231,0.890244,-0.530611,-1.587004,1.001705,-0.060370,-0.258407,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.922773,0.237411,0.623210,0.148719,1.138586,1.539407,-0.998298,-0.949825,-1.000636,0
95408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.377477,0.034797,0.265490,0.148719,0.678375,0.449525,1.001705,-0.949825,-1.530799,0
95409,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.476531,-0.370430,-0.449952,-0.741111,-0.399833,1.628783,1.001705,0.829085,0.165724,1
95410,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.258413,0.075320,0.122401,-0.592806,-0.044492,-1.629622,1.001705,1.718541,-1.053652,1


(95412, 355)

In [271]:
X_all.to_csv("Full_data.csv", index=False)

###### Saving dataframe with 'B_predictions' (predicted donnors)

In [272]:
Full_data = pd.read_csv("Full_data.csv")

##### Adding TARGET_B and TARGET_D columns ( Subset and Prediction target)

In [273]:
Full_data['TARGET_B'] = data['TARGET_B']
Full_data['TARGET_D'] = data['TARGET_D']
display(Full_data.head(),Full_data.shape)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,Predicted_Donnors,TARGET_B,TARGET_D
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.449952,-0.592806,-0.449883,-0.029524,-0.998298,1.718541,0.430805,1,0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.62321,1.483464,0.279526,0.90458,-0.998298,-0.06037,-1.583816,0,0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.807672,0.593634,-0.473856,-1.446659,1.001705,1.718541,1.544148,1,0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.449952,0.148719,-0.535431,1.32778,1.001705,1.718541,0.536838,1,0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.092231,0.890244,-0.530611,-1.587004,1.001705,-0.06037,-0.258407,0,0,0.0


(95412, 357)

###### PREPARING TO TRAIN MODEL (subset TARGET B = 1)

In [274]:
subset_b1 = Full_data[Full_data['TARGET_B'] == 1]
subset_b1.shape

(4843, 357)

In [275]:
y = subset_b1['TARGET_D']
X = subset_b1.drop(['TARGET_D', 'TARGET_B'], axis = 1) # TARGET B IS NOT USEFUL ANYMORE

###### TRAIN-TEST SPLIT

In [276]:
# As dataset is already scaled an encoded, we just proceed to Train-Test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

###### LINEAR REGRESSION MODEL

In [277]:
# Fit LM with train data
from sklearn import linear_model

lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

LinearRegression()

In [278]:
# Predictions for train data
from sklearn.metrics import r2_score

predictions = lm.predict(X_train)
r2_score(y_train, predictions)

0.5755666244101929

In [279]:
# Predictions for test data
predictions_test = lm.predict(X_test)
r2_score(y_test, predictions_test)

0.4847344367875447

In [280]:
# Comparing Y_Test against Predictions_test.
display("y_test results", y_test[:10] )
display ("predictions_test results", predictions_test[:10])

'y_test results'

37309      8.0
31837    100.0
30842      5.0
3940       4.0
24300     12.0
59951     23.0
87924     12.0
19175     10.0
71769     10.0
81751     10.0
Name: TARGET_D, dtype: float64

'predictions_test results'

array([18.94598235, 20.47121306, 16.86260359,  4.70430201,  5.17566846,
       30.09882015, 15.13281263,  8.78492873, 15.04378475, 10.32333754])

###### Column predicted_donation

In [281]:
y_pred = lm.predict(X)
X['Predicted_Donnations'] = y_pred
pred_donation=X
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,Predicted_Donnors,Predicted_Donnations
20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.491999,-0.807672,0.593634,-0.788163,-0.251017,1.001705,0.829085,-1.477783,1,4.239255
30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-0.370430,-0.664584,0.148719,-0.593480,1.640622,1.001705,0.829085,-0.894603,1,6.126871
45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.532522,-0.807672,-0.741111,-0.715406,-0.363032,1.001705,0.829085,-1.053652,1,4.807571
78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.086771,-0.449952,1.928379,-0.150004,1.042757,-0.998298,-0.060370,-1.530799,1,17.408309
93,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.289385,-0.306863,-0.296196,-0.297272,-1.260561,1.001705,0.829085,-0.470473,1,16.704272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.086771,0.050857,-0.147891,-0.432052,1.010446,-0.998298,-0.949825,1.120018,1,18.806763
95309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.167817,-0.092231,-0.592806,-0.088643,1.305687,1.001705,-0.949825,-0.576505,1,14.608666
95398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.237411,0.265490,1.038549,0.162939,-0.323462,-0.998298,0.829085,-1.477783,1,23.096479
95403,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.034797,0.265490,0.297024,-0.096313,-0.220450,-0.998298,-0.949825,1.332083,0,18.148775


In [282]:
subset_b2 = Full_data[Full_data['Predicted_Donnors'] == 1]
subset_b2.shape

(22932, 357)

In [283]:
X = subset_b2.drop(['TARGET_D', 'TARGET_B'], axis = 1)

In [284]:
X.shape

(22932, 355)

In [285]:
y_pred = lm.predict(X)
X['Predicted_Donnations'] = y_pred
pred_donation=X
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,Predicted_Donnors,Predicted_Donnations
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.289385,-0.449952,-0.592806,-0.449883,-0.029524,-0.998298,1.718541,0.430805,1,8.354649
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.127294,-0.807672,0.593634,-0.473856,-1.446659,1.001705,1.718541,1.544148,1,7.627283
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.329908,-0.449952,0.148719,-0.535431,1.327780,1.001705,1.718541,0.536838,1,8.568910
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.491999,-0.736128,-0.444501,-0.671576,-0.205122,1.001705,1.718541,1.385099,1,3.276017
20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.491999,-0.807672,0.593634,-0.788163,-0.251017,1.001705,0.829085,-1.477783,1,4.239255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95387,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.532522,-0.807672,-0.147891,-0.741704,-1.156051,1.001705,1.718541,0.218740,1,9.176595
95398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.237411,0.265490,1.038549,0.162939,-0.323462,-0.998298,0.829085,-1.477783,1,23.096479
95406,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.410953,-1.022305,-0.741111,-0.851826,-1.094124,-0.998298,1.718541,0.642871,1,3.005724
95409,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.370430,-0.449952,-0.741111,-0.399833,1.628783,1.001705,0.829085,0.165724,1,13.616540


In [286]:
pred_donation[['Predicted_Donnors','Predicted_Donnations']]


Unnamed: 0,Predicted_Donnors,Predicted_Donnations
0,1,8.354649
2,1,7.627283
3,1,8.568910
12,1,3.276017
20,1,4.239255
...,...,...
95387,1,9.176595
95398,1,23.096479
95406,1,3.005724
95409,1,13.616540


# PCA

In [287]:
from sklearn.decomposition import PCA

pca = PCA(n_components=90)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [288]:
lm_pca = linear_model.LinearRegression()
lm_pca.fit(X_train_pca, y_train)

predict_pca = lm_pca.predict(X_train_pca)
r2_score(y_train, predict_pca)

0.3952085505651508

# subset B_predictions = 1 ( predicted donnor)

In [253]:
New_pred = Full_data[Full_data['B_predictions'] == 1]
New_pred.shape

(22932, 357)

In [254]:
###### Fit and train regressor 

In [255]:
New_pred['B_predictions'] # predicted donnors // NO TARGET_B

0        1
2        1
3        1
12       1
20       1
        ..
95387    1
95398    1
95406    1
95409    1
95410    1
Name: B_predictions, Length: 22932, dtype: int64

###### X-y split

In [256]:
y = New_pred['TARGET_D']
X = New_pred.drop(['TARGET_D', 'TARGET_B'], axis = 1)