# Lesson 1

## Loading libraries

In [5]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_boston
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

from scipy.stats import t, norm

## Loading the dataset

In [6]:
X, y = load_boston(return_X_y=True)
print(X.shape)
print(y.shape)

(506, 13)
(506,)


## Train test split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

## Training different models

In [8]:
model1 = DecisionTreeRegressor()
model2 = LinearRegression()
model3 = KNeighborsRegressor()

In [9]:
X_train.shape

(339, 13)

## Automating model creation

In [10]:
model_pipeline = [model1, model2, model3]
model_names = ['Regression Tree', 'Linear Regression', 'KNN']


def confidence_intervals(model_pipeline, model_names, X_train, y_train, alpha = 0.05, K = 10):
# We set the significance level
#alpha = 0.05
#K = 10
    scores = {}
    i=0
    for model in model_pipeline:
        mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=K))
        if (K < 30):
            # t.ppf(area) gives us the critical value corresponding to the area for the t-student distribution.
            t_critical = abs(t.ppf(1-alpha/2, K-1)) 
            interval = t_critical*(np.std(cross_val_score(model, X_train, y_train, cv=K))/np.sqrt(K))
        else:
            # norm.ppf(area) gives us the critical value corresponding to the area for the normal distribution
            z_critical = abs(norm.ppf(1-alpha/2)) 
            interval = z_critical*(np.std(cross_val_score(model, X_train, y_train, cv=K))/np.sqrt(K))
        scores[model_names[i]] = [mean_score, mean_score - interval, mean_score + interval]
        print("The rmse of the {} model is (CV witk K={}) = {:4.2f} +/- {:4.2f}".format(model_names[i], K, mean_score, interval))
        i = i+1

confidence_intervals(model_pipeline, model_names, X_train, y_train, 0.05, 5)

The rmse of the Regression Tree model is (CV witk K=5) = 0.80 +/- 0.06
The rmse of the Linear Regression model is (CV witk K=5) = 0.71 +/- 0.07
The rmse of the KNN model is (CV witk K=5) = 0.51 +/- 0.05


## Activity 1

## Loading data

In [62]:
numerical = pd.read_csv('/Users/carolinvogt/Becoming_Data_Analyst/Week_08/Day_04/lab-random-forests/files_for_lab/numerical.csv')
categorical = pd.read_csv('/Users/carolinvogt/Becoming_Data_Analyst/Week_08/Day_04/lab-random-forests/files_for_lab/categorical.csv')
targets = pd.read_csv('/Users/carolinvogt/Becoming_Data_Analyst/Week_08/Day_04/lab-random-forests/files_for_lab/target.csv')
data = pd.concat([numerical, categorical, targets], axis = 1)
#data['TARGET_B'].value_counts()

In [63]:
categorical


Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,IL,36,H,F,3,L,E,C,T,2,...,37,12,92,8,94,2,95,12,89,11
1,CA,14,H,M,3,L,G,A,S,1,...,52,2,93,10,95,12,95,12,93,10
2,NC,43,U,M,3,L,E,C,R,2,...,0,2,91,11,92,7,95,12,90,1
3,CA,44,U,F,3,L,E,C,R,2,...,28,1,87,11,94,11,95,12,87,2
4,FL,16,H,F,3,L,F,A,S,2,...,20,1,93,10,96,1,96,1,79,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,other,27,H,M,3,L,G,C,C,2,...,0,2,96,2,96,2,96,2,96,2
95408,TX,24,H,M,3,L,F,A,C,1,...,50,1,96,3,96,3,96,3,96,3
95409,MI,30,H,M,3,L,E,B,C,3,...,38,1,96,3,95,1,96,10,94,10
95410,CA,24,H,F,2,L,F,A,C,1,...,40,5,90,11,96,8,97,1,86,12


In [64]:
len(categorical["CLUSTER"].unique())


53

In [65]:
max(categorical["CLUSTER"].unique())

53

In [66]:
categorical[['STATE', 'CLUSTER', 'HOMEOWNR', 'GENDER', 'DATASRCE', 'RFA_2R',
       'RFA_2A', 'GEOCODE2', 'DOMAIN_A', 'DOMAIN_B', 'ODATEW_YR', 'ODATEW_MM',
       'DOB_YR', 'DOB_MM', 'MINRDATE_YR', 'MINRDATE_MM', 'MAXRDATE_YR',
       'MAXRDATE_MM', 'LASTDATE_YR', 'LASTDATE_MM', 'FIRSTDATE_YR',
       'FIRSTDATE_MM']].nunique()

STATE           12
CLUSTER         53
HOMEOWNR         2
GENDER           3
DATASRCE         3
RFA_2R           1
RFA_2A           4
GEOCODE2         4
DOMAIN_A         5
DOMAIN_B         4
ODATEW_YR       15
ODATEW_MM       12
DOB_YR          96
DOB_MM          12
MINRDATE_YR     20
MINRDATE_MM     12
MAXRDATE_YR     18
MAXRDATE_MM     12
LASTDATE_YR      3
LASTDATE_MM     12
FIRSTDATE_YR    26
FIRSTDATE_MM    12
dtype: int64

In [67]:
categorical.columns

Index(['STATE', 'CLUSTER', 'HOMEOWNR', 'GENDER', 'DATASRCE', 'RFA_2R',
       'RFA_2A', 'GEOCODE2', 'DOMAIN_A', 'DOMAIN_B', 'ODATEW_YR', 'ODATEW_MM',
       'DOB_YR', 'DOB_MM', 'MINRDATE_YR', 'MINRDATE_MM', 'MAXRDATE_YR',
       'MAXRDATE_MM', 'LASTDATE_YR', 'LASTDATE_MM', 'FIRSTDATE_YR',
       'FIRSTDATE_MM'],
      dtype='object')

In [68]:
to_drop=['STATE', 'CLUSTER', 'ODATEW_MM', 'DOB_MM', 'MINRDATE_MM',
       'MAXRDATE_MM', 'LASTDATE_MM','FIRSTDATE_MM']

categorical.drop(to_drop,axis=1,inplace=True)

In [69]:
categorical.drop(["RFA_2R"],axis=1,inplace=True)

In [70]:
categorical

Unnamed: 0,HOMEOWNR,GENDER,DATASRCE,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,ODATEW_YR,DOB_YR,MINRDATE_YR,MAXRDATE_YR,LASTDATE_YR,FIRSTDATE_YR
0,H,F,3,E,C,T,2,89,37,92,94,95,89
1,H,M,3,G,A,S,1,94,52,93,95,95,93
2,U,M,3,E,C,R,2,90,0,91,92,95,90
3,U,F,3,E,C,R,2,87,28,87,94,95,87
4,H,F,3,F,A,S,2,86,20,93,96,96,79
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,H,M,3,G,C,C,2,96,0,96,96,96,96
95408,H,M,3,F,A,C,1,96,50,96,96,96,96
95409,H,M,3,E,B,C,3,95,38,96,95,96,94
95410,H,F,2,F,A,C,1,86,40,90,96,97,86


In [71]:
categorical.columns

Index(['HOMEOWNR', 'GENDER', 'DATASRCE', 'RFA_2A', 'GEOCODE2', 'DOMAIN_A',
       'DOMAIN_B', 'ODATEW_YR', 'DOB_YR', 'MINRDATE_YR', 'MAXRDATE_YR',
       'LASTDATE_YR', 'FIRSTDATE_YR'],
      dtype='object')

In [72]:
years=['ODATEW_YR', 'DOB_YR', 'MINRDATE_YR', 'MAXRDATE_YR',
       'LASTDATE_YR', 'FIRSTDATE_YR']

for year in years:
    categorical[year] = pd.to_numeric(categorical[year],errors="coerce")

In [73]:
categorical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   HOMEOWNR      95412 non-null  object
 1   GENDER        95412 non-null  object
 2   DATASRCE      95412 non-null  int64 
 3   RFA_2A        95412 non-null  object
 4   GEOCODE2      95412 non-null  object
 5   DOMAIN_A      95412 non-null  object
 6   DOMAIN_B      95412 non-null  int64 
 7   ODATEW_YR     95412 non-null  int64 
 8   DOB_YR        95412 non-null  int64 
 9   MINRDATE_YR   95412 non-null  int64 
 10  MAXRDATE_YR   95412 non-null  int64 
 11  LASTDATE_YR   95412 non-null  int64 
 12  FIRSTDATE_YR  95412 non-null  int64 
dtypes: int64(8), object(5)
memory usage: 9.5+ MB


In [74]:
categorical[['HOMEOWNR', 'GENDER', 'RFA_2A', 'GEOCODE2', 'DOMAIN_A','DOMAIN_B']].nunique()

HOMEOWNR    2
GENDER      3
RFA_2A      4
GEOCODE2    4
DOMAIN_A    5
DOMAIN_B    4
dtype: int64

In [75]:
dummies=['HOMEOWNR', 'GENDER', 'RFA_2A', 'GEOCODE2', 'DOMAIN_A','DOMAIN_B']

df_dumm=categorical[dummies]
df_dumm = pd.get_dummies(df_dumm, drop_first = True)

dummies2=list(df_dumm.columns)

#df_dumm=df[dummies]
#categoricals= pd.get_dummies(df_dumm, drop_first = True)

In [82]:
dummies2=list(df_dumm.columns)

for dumm in dummies2:
    categorical[dumm] = df_dumm[dumm]
    
categorical=categorical.drop(dummies,axis=1)
    
#df_dumm=df[dummies]
#categoricals= pd.get_dummies(df_dumm, drop_first = True)

In [84]:
categorical.shape

(95412, 20)

In [89]:
def variance_threshold_selector(numerical, threshold=0.9):
    selector = VarianceThreshold(threshold)
    selector.fit(numerical)
    return numerical[numerical.columns[selector.get_support(indices=True)]]

In [90]:
temp=variance_threshold_selector(numerical, threshold=0.9)
temp

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,NGIFTALL,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,RFA_2F,CLUSTER2
0,0,60.000000,5,9,0,0,39,34,18,10,...,31,14,5.0,12.0,10.0,4,7.741935,95515,4,39
1,1,46.000000,6,9,16,0,15,55,11,6,...,3,1,10.0,25.0,25.0,18,15.666667,148535,2,1
2,1,61.611649,3,1,2,0,20,29,33,6,...,27,14,2.0,16.0,5.0,12,7.481481,15078,4,60
3,0,70.000000,1,4,2,0,23,14,31,3,...,16,7,2.0,11.0,10.0,9,6.812500,172556,4,41
4,0,78.000000,3,2,60,1,28,9,53,26,...,37,8,3.0,15.0,15.0,14,6.864865,7112,2,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1,61.611649,5,9,0,14,36,47,11,7,...,1,0,25.0,25.0,25.0,9,25.000000,184568,1,12
95408,1,48.000000,7,9,1,0,31,43,19,4,...,1,0,20.0,20.0,20.0,9,20.000000,122706,1,2
95409,1,60.000000,5,9,0,0,18,46,20,7,...,7,4,3.0,10.0,10.0,3,8.285714,189641,3,34
95410,0,58.000000,7,9,0,0,28,35,20,9,...,41,18,5.0,21.0,18.0,4,12.146341,4693,4,11


In [92]:
numerical=temp

## SelectKBest

In [95]:
targets.isnull().sum()

TARGET_B    0
TARGET_D    0
dtype: int64

In [99]:
y2 = pd.DataFrame([targets["TARGET_D"]])

In [101]:
X = numerical
y1 = targets["TARGET_B"]

from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2


kbest1 = SelectKBest(chi2, k=10).fit_transform(X,y1)
#kbest2 = SelectKBest(chi2, k=10).fit_transform(X,y2)
# Here we chose 10 so that is easier to analyze results later, as we will see
selected1 = pd.DataFrame(kbest1)
selected1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,992.0,264.0,479.0,635.0,0.0,318.0,12883.0,240.0,95515.0
1,1.0,3611.0,940.0,5468.0,5218.0,4480.0,1096.0,36175.0,47.0,148535.0
2,1.0,7001.0,2040.0,497.0,546.0,0.0,292.0,11576.0,202.0,15078.0
3,0.0,640.0,160.0,1000.0,1263.0,9340.0,388.0,15130.0,109.0,172556.0
4,0.0,2520.0,627.0,576.0,594.0,5000.0,250.0,9836.0,254.0,7112.0
...,...,...,...,...,...,...,...,...,...,...
95407,1.0,27380.0,7252.0,988.0,1025.0,380.0,481.0,18807.0,25.0,184568.0
95408,1.0,1254.0,322.0,1679.0,1723.0,3360.0,836.0,26538.0,20.0,122706.0
95409,1.0,552.0,131.0,376.0,377.0,4040.0,264.0,12178.0,58.0,189641.0
95410,0.0,1746.0,432.0,2421.0,2459.0,8735.0,544.0,15948.0,498.0,4693.0


In [None]:
selected2 = pd.DataFrame(kbest2)
selected2

In [105]:
# To check the scores

#(1) saving the model, which fitted the kbest before
model = SelectKBest(chi2, k=30).fit(X,y1)
#try more matrix for num
#unique values of numericals (Bin)
df =pd.DataFrame(data = model.scores_, columns = ['score'])
df['Column'] = numerical.columns
# Sorting data
print(df.sort_values(by = ['score'], ascending = False).head(30))

             score    Column
302  527716.426176  CONTROLN
139  187983.976667       IC5
82    49855.611718       HV1
83    49561.067003       HV2
0     39087.069814     TCODE
132   26891.429352       MSA
13    17167.230879    POP901
136    2921.367106       IC2
14     2811.233301    POP902
294    2756.199364  RAMNTALL
138    2751.447661       IC4
109    2620.983688      HVP1
110    2549.991693      HVP2
135    2448.754088       IC1
137    2415.716681       IC3
15     2166.139007    POP903
111    2132.962948      HVP3
114    1906.878033      HVP6
295    1883.560588  NGIFTALL
112    1472.531558      HVP4
299    1354.814567  LASTGIFT
22     1290.383905      ETH2
128    1245.572226       RP1
291    1178.545945   NUMPROM
296    1128.176823  CARDGIFT
129    1091.241022       RP2
134     975.874975       DMA
297     892.237811  MINRAMNT
301     872.722150   AVGGIFT
298     847.342102  MAXRAMNT


In [109]:
list_to_keep = list(df.sort_values(by = ['score'], ascending = False).head(30)['Column'].values)

In [115]:
type(list_to_keep)

list

In [116]:
num_final=numerical[list_to_keep]

In [119]:
X = pd.concat([num_final,categorical],axis=1)

In [120]:
X

Unnamed: 0,CONTROLN,IC5,HV1,HV2,TCODE,MSA,POP901,IC2,POP902,RAMNTALL,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,95515,12883,479,635,0,0.0,992,318,264,240.0,...,1,0,0,0,1,0,0,0,1,0
1,148535,36175,5468,5218,1,4480.0,3611,1096,940,47.0,...,0,0,1,0,0,0,0,1,0,0
2,15078,11576,497,546,1,0.0,7001,292,2040,202.0,...,1,0,0,0,1,0,1,0,0,0
3,172556,15130,1000,1263,0,9340.0,640,388,160,109.0,...,1,0,0,0,1,0,1,0,0,0
4,7112,9836,576,594,0,5000.0,2520,250,627,254.0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,184568,18807,988,1025,1,380.0,27380,481,7252,25.0,...,0,0,1,0,1,0,0,0,0,0
95408,122706,26538,1679,1723,1,3360.0,1254,836,322,20.0,...,0,1,0,0,0,0,0,0,0,0
95409,189641,12178,376,377,1,4040.0,552,264,131,58.0,...,1,0,0,1,0,0,0,0,0,0
95410,4693,15948,2421,2459,0,8735.0,1746,544,432,498.0,...,0,1,0,0,0,0,0,0,0,0


In [121]:
#TRAIN-TEST-SPLIT
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, y1, test_size=0.3, random_state=100)

In [123]:
#-- 4.2 --
#BALANCING
from imblearn.over_sampling import SMOTE

sa = SMOTE()
X_s, Y_s = sa.fit_sample(X_train, Y_train)

In [126]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_s, Y_s)
print("The accuracy of the Random forest is: {:4.2f}".format(clf.score(X_test, Y_test)))
print()

alpha = 0.05
K = 10
# For cross validation
clf = RandomForestClassifier(max_depth=2, random_state=0)
cross_val_scores = cross_val_score(clf, X_s, Y_s, cv=K)

if (K < 30):
    t_critical = abs(t.ppf(1-alpha/2, K-1))
    interval = t_critical*(np.std(cross_val_score(clf, X_s, Y_s, cv=10))/np.sqrt(K))
else:
    z_critical = abs(norm.ppf(1-alpha/2))
    interval = z_critical*(np.std(cross_val_score(clf, X_s, Y_s , cv=10))/np.sqrt(K)) 
print("The accuracy of the Random Forest model (CV witk K={}) is: {:4.2f} +/- {:4.2f}".format(K,np.mean(cross_val_scores),interval))

The accuracy of the Random forest is: 0.72

The accuracy of the Random Forest model (CV witk K=10) is: 0.77 +/- 0.03


## Downsampling to balance data

In [None]:
category_0 = data[data['TARGET_B']==0].sample(len(data[data['TARGET_B']==1]))
print(category_0.shape)

category_1 = data[data['TARGET_B']== 1 ]
data = pd.concat([category_0, category_1], axis = 0)
data = data.sample(frac =1)
data = data.reset_index(drop=True)
print(data.shape)

(4843, 339)
(9686, 339)


## Data processing

In [None]:
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis = 1)

numericalX = X.select_dtypes(np.number)
categorcalX = X.select_dtypes(np.object)

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first').fit(categorcalX)
encoded_categorical = encoder.transform(categorcalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
X = pd.concat([numericalX, encoded_categorical], axis = 1)

## Train - test split

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

## Retaining info for regression model

In [None]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_train_regression = X_train['TARGET_D']
y_test_regression = X_test['TARGET_D']

# Now we can remove the column target d from the set of features 
X_train = X_train.drop(['TARGET_D'], axis = 1)
X_test = X_test.drop(['TARGET_D'], axis = 1)

## Building the model

In [None]:
X_train.shape

(7264, 354)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
print("The accuracy of the Random forest is: {:4.2f}".format(clf.score(X_test, y_test)))
print()

alpha = 0.05
K = 10
# For cross validation
clf = RandomForestClassifier(max_depth=2, random_state=0)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=K)

if (K < 30):
    t_critical = abs(t.ppf(1-alpha/2, K-1))
    interval = t_critical*(np.std(cross_val_score(clf, X_train, y_train, cv=10))/np.sqrt(K))
else:
    z_critical = abs(norm.ppf(1-alpha/2))
    interval = z_critical*(np.std(cross_val_score(clf, X_train, y_train, cv=10))/np.sqrt(K)) 
print("The accuracy of the Random Forest model (CV witk K={}) is: {:4.2f} +/- {:4.2f}".format(K,np.mean(cross_val_scores),interval))

The accuracy of the Random forest is: 0.56

The accuracy of the Random Forest model (CV witk K=10) is: 0.58 +/- 0.01
