In [24]:
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA

In [25]:
data = pd.read_csv("donors.csv", header=0, sep=',')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [26]:
data.head()

Unnamed: 0.1,Unnamed: 0,ODATEDW,OSOURCE,TCODE,STATE,ZIP,MAILCODE,PVASTATE,DOB,NOEXCH,...,AVGGIFT,CONTROLN,HPHONE_D,RFA_2R,RFA_2F,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,GEOCODE2
0,0,2009-01-01,GRI,0,IL,61081,,,1957-12-01,0,...,7.741935,95515,0,L,4,E,X,X,X,C
1,1,2014-01-01,BOA,1,CA,91326,,,1972-02-01,0,...,15.666667,148535,0,L,2,G,X,X,X,A
2,2,2010-01-01,AMH,1,NC,27017,,,,0,...,7.481481,15078,1,L,4,E,X,X,X,C
3,3,2007-01-01,BRY,0,CA,95953,,,1948-01-01,0,...,6.8125,172556,1,L,4,E,X,X,X,C
4,4,2006-01-01,,0,FL,33176,,,1940-01-01,0,...,6.864865,7112,1,L,2,F,X,X,X,A


In [27]:
data.columns

Index(['Unnamed: 0', 'ODATEDW', 'OSOURCE', 'TCODE', 'STATE', 'ZIP', 'MAILCODE',
       'PVASTATE', 'DOB', 'NOEXCH',
       ...
       'AVGGIFT', 'CONTROLN', 'HPHONE_D', 'RFA_2R', 'RFA_2F', 'RFA_2A',
       'MDMAUD_R', 'MDMAUD_F', 'MDMAUD_A', 'GEOCODE2'],
      dtype='object', length=476)

In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Columns: 476 entries, Unnamed: 0 to GEOCODE2
dtypes: float64(49), int64(302), object(125)
memory usage: 346.5+ MB


# Data Preparation

## Problems:
- Duplicates?
- Data types?
- Missing values?
- Strange values?
- Descriptive statistics?


In [29]:
# replace "" by nans
data.replace("", np.nan, inplace=True)

In [30]:
data.dtypes

Unnamed: 0     int64
ODATEDW       object
OSOURCE       object
TCODE          int64
STATE         object
               ...  
RFA_2A        object
MDMAUD_R      object
MDMAUD_F      object
MDMAUD_A      object
GEOCODE2      object
Length: 476, dtype: object

In [31]:
# check descriptive statistics again
data.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Unnamed: 0,95412,,,,47705.5,27543.2,0,23852.8,47705.5,71558.2,95411
ODATEDW,95412,54,2015-01-01,15358,,,,,,,
OSOURCE,95412,896,MBC,4539,,,,,,,
TCODE,95412,,,,54.2231,953.844,0,0,1,2,72002
STATE,95412,57,CA,17343,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
RFA_2A,95412,4,F,46964,,,,,,,
MDMAUD_R,95412,5,X,95118,,,,,,,
MDMAUD_F,95412,4,X,95118,,,,,,,
MDMAUD_A,95412,5,X,95118,,,,,,,


## Missing values


### Data Reduction

In order to deal with missing values it was necessary to calculate it dimension (percentage) of each column in the data set. 
A copy of data was created to assess an overview of our data exploration in every step of the data cleansing. 
After the missing values assessment, the data set will have a reduction of features which percentage is above 30% in order to correct inconsistencies in data. 

In [32]:
data_central = data.copy()

In [33]:
# Percentage of missing values in each column
nan_percentage = data_central.isna().sum()/len(data_central)*100.00
nan_percentage

Unnamed: 0    0.000000
ODATEDW       0.000000
OSOURCE       0.000000
TCODE         0.000000
STATE         0.000000
                ...   
RFA_2A        0.000000
MDMAUD_R      0.000000
MDMAUD_F      0.000000
MDMAUD_A      0.000000
GEOCODE2      0.138347
Length: 476, dtype: float64

In [34]:
# Total value of missing values
data_central.isna().values.sum()

5158884

In [35]:
# Select the columns with a "nan" percentage above 30%
above_na = nan_percentage[nan_percentage>=30]
above_na

NUMCHLD     87.018404
WEALTH1     46.882992
MBCRAFT     55.395548
MBGARDEN    55.395548
MBBOOKS     55.395548
              ...    
RAMNT_20    91.732696
RAMNT_21    90.029556
RAMNT_22    78.123297
RAMNT_23    91.763091
RAMNT_24    81.409047
Length: 69, dtype: float64

In [36]:
# Check the number of columns above 30%
len(above_na)

69

In [37]:
# Drop the list above_na which contains the columns names of data 
data_central = data_central.drop(columns=above_na.index)

In [38]:
# Check the missing values
data_central.isna().sum().unique()

array([    0, 23883, 21286,   132,  1950,  2191,  3557,  8874,  3511,
       11245, 10422,  8923, 18867, 20364, 27650, 21263, 24480, 25648,
           2,  9973])

### Fill Missing Values

In order to fill the missing values, we have to divide the data into metric and non-metric features to 

In [39]:
# Check the data types in data
data.dtypes.unique()


array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [40]:
# Check the object columns = non-metric columns
non_metric_features = data_central.select_dtypes(include=['object']).columns

# Drop the non metric features
metric_features = data_central.columns.drop(non_metric_features).to_list()

In [41]:
# Calculate the sum of missing values
data_central.isna().sum()

Unnamed: 0      0
ODATEDW         0
OSOURCE         0
TCODE           0
STATE           0
             ... 
RFA_2A          0
MDMAUD_R        0
MDMAUD_F        0
MDMAUD_A        0
GEOCODE2      132
Length: 407, dtype: int64

In [42]:
#Calculate the median of each quantitive feature
data_central.median()

Unnamed: 0    47705.500000
TCODE             1.000000
INCOME            4.000000
HIT               0.000000
MALEMILI          0.000000
                  ...     
TIMELAG           6.000000
AVGGIFT          11.636364
CONTROLN      95681.500000
HPHONE_D          1.000000
RFA_2F            2.000000
Length: 312, dtype: float64

In [None]:
# Não tenho a certeza disto
# Get the mode(s) of each element along the non-metric features.
# The mode of a set of values is the value that appears most often. It can be multiple values.
modes = data_central[non_metric_features]
modes

In [47]:
modes = data_central[non_metric_features].mode().loc[0]
modes

ODATEDW     2015-01-01
OSOURCE            MBC
STATE               CA
ZIP              85351
MAILCODE              
               ...    
RFA_2A               F
MDMAUD_R             X
MDMAUD_F             X
MDMAUD_A             X
GEOCODE2             A
Name: 0, Length: 95, dtype: object

Filling the non-metric and metric missing values

In [48]:
# Fill NaNs on df_central
data_central.fillna(data_central.median(), inplace=True)
data_central.fillna(modes, inplace=True)
data_central.isna().sum()  # checking how many NaNs we still have

Unnamed: 0    0
ODATEDW       0
OSOURCE       0
TCODE         0
STATE         0
             ..
RFA_2A        0
MDMAUD_R      0
MDMAUD_F      0
MDMAUD_A      0
GEOCODE2      0
Length: 407, dtype: int64

In [49]:
# Check the number of missing values 
data_central.isna().values.sum()

0

É suposto dar um valor tão grande de missing values?

### Outlier Removal

In [51]:
data_original = data.copy()

In [52]:
data = data_central.copy()

In [53]:
# Compute the IQR
#only for metric variables 
q25 = data.quantile(.25)
q75 = data.quantile(.75)
iqr = (q75 - q25) * 1.5

# Compute upper and lower limit (lower_limit = Q1 -1.5*IQR | upper_limit = Q3 + 1.5*IQR)
upper_lim = q75 + 1.5 * iqr
lower_lim = q25 - 1.5 * iqr

filters = []
for metric in metric_features:
    llim = lower_lim[metric]
    ulim = upper_lim[metric]
    filters.append(data[metric].between(llim, ulim, inclusive=True))

data_2 = data[np.all(filters, 0)]
print('Percentage of data kept after removing outliers:', np.round(data_2.shape[0] / data_original.shape[0], 4))

Percentage of data kept after removing outliers: 0.0034


ISTO ESTÁ MAL!!

## Data Normalization

In [54]:
data_standard = data.copy()

In [55]:
scaler = StandardScaler()
scaled_feat = scaler.fit_transform(data_standard[metric_features])
scaled_feat

array([[-1.73203265, -0.05684721,  0.05402941, ..., -0.00476043,
        -1.00123751,  1.94822609],
       [-1.73199635, -0.05579882,  1.27676515, ...,  0.95428208,
        -1.00123751,  0.08384741],
       [-1.73196004, -0.05579882, -0.55733846, ..., -1.4597303 ,
         0.99876402,  1.94822609],
       ...,
       [ 1.73196004, -0.05579882,  0.05402941, ...,  1.6978204 ,
         0.99876402,  1.01603675],
       [ 1.73199635, -0.05684721,  1.88813301, ..., -1.64757746,
         0.99876402,  1.94822609],
       [ 1.73203265, -0.05475042,  0.66539728, ...,  1.61593459,
         0.99876402, -0.84834193]])

In [56]:
# See what the fit method is doing (notice the trailing underscore):
print("Parameters fitted:\n", scaler.mean_, "\n", scaler.var_)

Parameters fitted:
 [4.77055000e+04 5.42231166e+01 3.91162537e+00 3.32143756e+00
 1.04844254e+00 3.04453318e+01 2.97023121e+01 3.26377185e+01
 6.84232591e+00 4.56856580e+00 3.11086656e+00 3.25588020e+03
 8.64993083e+02 1.22257346e+03 5.85896533e+01 1.36224794e+01
 2.61405274e+01 4.82117239e+01 5.09513898e+01 8.48545047e+01
 7.46740452e+00 7.76904373e-01 2.90570369e+00 7.45807655e+00
 2.16010565e-01 3.97350438e-01 6.12805517e-01 5.61491217e-01
 2.50586928e-01 2.10308976e-01 6.83876242e-02 5.13816920e+00
 3.02477676e-01 3.27432608e-01 1.51361464e+00 3.44752756e+01
 4.19113214e+01 4.51086446e+01 3.59173794e+01 4.46942313e+01
 4.78894374e+01 2.45150924e+01 3.95927451e+01 3.83576385e+01
 2.10147151e+01 1.21968725e+01 2.21898189e+01 2.06529787e+01
 1.40577810e+01 1.18577852e+01 1.05432755e+01 7.66796629e+00
 1.61756592e+01 1.61394269e+01 3.16346476e+01 1.97541714e+01
 1.52568545e+01 2.46032889e+01 9.37184002e+00 2.23084727e+01
 2.28217101e+01 3.36304448e+01 4.26974280e+01 2.56154153e+01
 1.0

In [57]:
data_standard[metric_features] = scaled_feat
data_standard.head()

Unnamed: 0.1,Unnamed: 0,ODATEDW,OSOURCE,TCODE,STATE,ZIP,MAILCODE,PVASTATE,DOB,NOEXCH,...,AVGGIFT,CONTROLN,HPHONE_D,RFA_2R,RFA_2F,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,GEOCODE2
0,-1.732033,2009-01-01,GRI,-0.056847,IL,61081,,,1957-12-01,0,...,-0.520509,-0.00476,-1.001238,L,1.948226,E,X,X,X,C
1,-1.731996,2014-01-01,BOA,-0.055799,CA,91326,,,1972-02-01,0,...,0.21531,0.954282,-1.001238,L,0.083847,G,X,X,X,A
2,-1.73196,2010-01-01,AMH,-0.055799,NC,27017,,,1968-01-01,0,...,-0.544692,-1.45973,0.998764,L,1.948226,E,X,X,X,C
3,-1.731924,2007-01-01,BRY,-0.056847,CA,95953,,,1948-01-01,0,...,-0.606808,1.388782,0.998764,L,1.948226,E,X,X,X,C
4,-1.731887,2006-01-01,,-0.056847,FL,33176,,,1940-01-01,0,...,-0.601946,-1.603822,0.998764,L,0.083847,F,X,X,X,A


In [58]:
# Checking mean and variance of standardized variables
data_standard[metric_features].describe().round(2)

Unnamed: 0.1,Unnamed: 0,TCODE,INCOME,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,...,NGIFTALL,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F
count,95412.0,95412.0,95412.0,95412.0,95412.0,95412.0,95412.0,95412.0,95412.0,95412.0,...,95412.0,95412.0,95412.0,95412.0,95412.0,95412.0,95412.0,95412.0,95412.0,95412.0
mean,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,...,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.73,-0.06,-1.78,-0.36,-0.21,-2.65,-1.97,-1.84,-1.56,-0.89,...,-1.01,-1.12,-0.9,-0.6,-1.24,-1.01,-1.12,-1.73,-1.0,-0.85
25%,-0.87,-0.06,-0.56,-0.36,-0.21,-0.56,-0.64,-0.66,-0.65,-0.7,...,-0.77,-0.68,-0.56,-0.24,-0.52,-0.5,-0.46,-0.87,-1.0,-0.85
50%,0.0,-0.06,0.05,-0.36,-0.21,0.05,-0.05,-0.04,-0.19,-0.31,...,-0.3,-0.23,-0.33,-0.12,-0.17,-0.24,-0.16,-0.0,1.0,0.08
75%,0.87,-0.05,0.67,-0.03,-0.21,0.57,0.62,0.59,0.49,0.28,...,0.4,0.43,0.24,0.12,0.19,0.27,0.2,0.87,1.0,1.02
max,1.73,75.43,1.89,25.54,19.34,5.98,4.59,3.75,20.98,18.42,...,26.58,7.94,112.94,198.66,70.41,138.5,91.61,1.74,1.0,1.95


In [59]:
data = data_standard.copy()

## One-hot encoding

In [60]:
data_ohc = data.copy()
data_ohc

Unnamed: 0.1,Unnamed: 0,ODATEDW,OSOURCE,TCODE,STATE,ZIP,MAILCODE,PVASTATE,DOB,NOEXCH,...,AVGGIFT,CONTROLN,HPHONE_D,RFA_2R,RFA_2F,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,GEOCODE2
0,-1.732033,2009-01-01,GRI,-0.056847,IL,61081,,,1957-12-01,0,...,-0.520509,-0.004760,-1.001238,L,1.948226,E,X,X,X,C
1,-1.731996,2014-01-01,BOA,-0.055799,CA,91326,,,1972-02-01,0,...,0.215310,0.954282,-1.001238,L,0.083847,G,X,X,X,A
2,-1.731960,2010-01-01,AMH,-0.055799,NC,27017,,,1968-01-01,0,...,-0.544692,-1.459730,0.998764,L,1.948226,E,X,X,X,C
3,-1.731924,2007-01-01,BRY,-0.056847,CA,95953,,,1948-01-01,0,...,-0.606808,1.388782,0.998764,L,1.948226,E,X,X,X,C
4,-1.731887,2006-01-01,,-0.056847,FL,33176,,,1940-01-01,0,...,-0.601946,-1.603822,0.998764,L,0.083847,F,X,X,X,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1.731887,2016-01-01,ASE,-0.055799,AK,99504,,,1968-01-01,0,...,1.081920,1.606058,-1.001238,L,-0.848342,G,X,X,X,C
95408,1.731924,2016-01-01,DCD,-0.055799,TX,77379,,,1970-01-01,0,...,0.617665,0.487079,0.998764,L,-0.848342,F,X,X,X,A
95409,1.731960,2015-01-01,MBC,-0.055799,MI,48910,,,1958-01-01,0,...,-0.470019,1.697820,0.998764,L,1.016037,E,X,X,X,B
95410,1.731996,2006-01-01,PRV,-0.056847,CA,91320,,,1960-05-01,0,...,-0.111555,-1.647577,0.998764,L,1.948226,F,X,X,X,A


In [62]:
# Use OneHotEncoder to encode the categorical features. Get feature names and create a DataFrame 
# with the one-hot encoded categorical features (pass feature names)
ohc = OneHotEncoder(sparse=False, drop="first")
ohc_feat = ohc.fit_transform(data_ohc[non_metric_features])

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['int', 'str']

### Redo data exploration