In [15]:
import pandas as pd
import numpy as np

In [16]:
# Load the train and test feature tables; both paths are relative to the
# directory the notebook is run from.
train = pd.read_csv(filepath_or_buffer='./data/trainFeatures.csv')
test = pd.read_csv(filepath_or_buffer='./data/testFeatures.csv')

## Samples

In [17]:
# DataFrame.shape is (rows, columns): unpack both counts for each table at once.
train_samples, train_features = train.shape
test_samples, test_features = test.shape

# Print sample counts and feature counts side by side so the two tables can be compared.
print("train samples={}, test_samples={}".format(train_samples, test_samples))
print("train features={}, test_features={}".format(train_features, test_features))

train samples=34189, test_samples=14653
train features=14, test_features=14


## Does the data contain missing values?

In [18]:
# One boolean per column: True when that column holds at least one missing value.
train_miss = train.isnull().any(axis=0)
test_mis = test.isnull().any(axis=0)

In [19]:
# Show the per-column missing-value flags computed for the training table.
train_miss

age               False
workclass         False
fnlwgt            False
education         False
education-num     False
Marital-status    False
occupation        False
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country    False
dtype: bool

# Show the per-column missing-value flags computed for the test table.
test_mis

## Continuous Variable Analysis

#For the continuous variables age, fnlwgt, education-num, capital-gain, capital-loss, and hours-per-week, we need to transform them into categorical variables.

#### Transform Age

In [20]:
# Summarise the age column: its extremes and its mean truncated to an integer.
age = train['age'].values
min_age, max_age = np.min(age), np.max(age)
avg_age = int(np.mean(age))
print(
    'max age={}, min_age={}, avg_age={}'.format(max_age, min_age, avg_age)
)

max age=90, min_age=17, avg_age=38


#### Based on general knowledge, it is better to bin age into five groups: amature, adult, wrinkly, old, and long-life-age. The age range for each group is:
- amature: 0~18
- adult: 18~30
- wrinkly 30~55
- old: 55~70
- long-life-age: 70~90

#### Transform fnlwgt (final weight): the number of people the census believes the entry represents

In [21]:
# Summarise the fnlwgt column: extremes, integer-truncated mean, and overall span.
fnlwgt = train['fnlwgt']
min_fnlwgt, max_fnlwgt = fnlwgt.min(), fnlwgt.max()
avg_fnlwgt = int(fnlwgt.mean())
differ_fnlwgt = max_fnlwgt - min_fnlwgt  # gap between the largest and smallest weight
print(
    'max_fnlwgt={}, min_fnlwgt={}, mean_fnlwgt={}, fnlwgt_gap={}'.format(
        max_fnlwgt, min_fnlwgt, avg_fnlwgt, differ_fnlwgt
    )
)

max_fnlwgt=1490400, min_fnlwgt=12285, mean_fnlwgt=189792, fnlwgt_gap=1478115


#### Split the value range at the quartiles Q1, Q2, Q3 and at the two outlier fences
#### $outlier_{up} = Q3+1.5IQR$
#### $outlier_{down} = Q1-1.5IQR$
#### $IQR = Q3-Q1$

In [22]:
def split_region(soted_vector):
    """Return the quartile cut points and 1.5*IQR outlier fences of a 1-D sample.

    Quartiles are chosen by position, without interpolation: the element at
    index ``int(q * n)`` of the ascending-sorted data for q = 0.25, 0.5, 0.75.

    Parameters
    ----------
    soted_vector : array-like
        One-dimensional numeric data. The values are sorted here as well, so
        passing an unsorted vector (or a plain list) gives the same result as
        passing a pre-sorted array.

    Returns
    -------
    tuple
        ``(Q1, Q2, Q3, outlier_up, outlier_down)`` where
        ``outlier_up = Q3 + 1.5 * IQR``, ``outlier_down = Q1 - 1.5 * IQR``
        and ``IQR = Q3 - Q1``.

    Raises
    ------
    ValueError
        If the input contains no elements.
    """
    # Sorting is a no-op for already-sorted input, and it protects callers that
    # forget to sort: positional quartiles are meaningless on unsorted data.
    values = np.sort(np.asarray(soted_vector))
    samples = values.shape[0]
    if samples == 0:
        raise ValueError("split_region() requires at least one value")

    # int() truncates, so every index stays within [0, samples - 1].
    Q1 = values[int(0.25 * samples)]
    Q2 = values[int(0.5 * samples)]
    Q3 = values[int(0.75 * samples)]

    IQR = Q3 - Q1
    outlier_up = Q3 + 1.5 * IQR
    outlier_down = Q1 - 1.5 * IQR

    return Q1, Q2, Q3, outlier_up, outlier_down
# Hand split_region an ascending copy of the fnlwgt values and unpack its
# five cut points.
Q1, Q2, Q3, outlier_up, outlier_down = split_region(
    np.sort(fnlwgt.values)
)

In [23]:
# Print the three quartiles followed by the upper and lower outlier fences.
print(*(Q1, Q2, Q3, outlier_up, outlier_down))

117847 178449 237624 417289.5 -61818.5


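#### Given the spread of the quartiles and outlier fences, we conclude that 'fnlwgt' can be binned into 6 regions delimited by outlier_down, Q1, Q2, Q3 and outlier_up. (Note that outlier_down is negative while the smallest fnlwgt is 12285, so the lowest region is empty on the training data.)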

#### Transform education-num: the highest level of education in numerical form
##### It looks categorical; let's check

In [24]:
# List the distinct education-num values in ascending order.
education_num = train['education-num']
education_num_unique = np.unique(education_num.values)
print(education_num_unique)

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]


##### It takes only a limited set of values, so we don't have to transform it into a categorical variable

#### Transform capital-gain

In [25]:
# Pull the capital-gain column and display its distinct values in ascending order.
captial_gain = train['capital-gain']
np.unique(captial_gain.values)

array([    0,   114,   401,   594,   914,   991,  1055,  1086,  1111,
        1151,  1173,  1264,  1409,  1424,  1455,  1471,  1506,  1639,
        1731,  1797,  1831,  1848,  2009,  2036,  2050,  2062,  2105,
        2174,  2176,  2202,  2228,  2290,  2329,  2346,  2354,  2387,
        2407,  2414,  2463,  2538,  2580,  2597,  2635,  2653,  2829,
        2885,  2907,  2936,  2961,  2964,  2977,  2993,  3103,  3137,
        3273,  3325,  3411,  3418,  3432,  3456,  3464,  3471,  3674,
        3781,  3818,  3887,  3908,  3942,  4064,  4101,  4386,  4416,
        4508,  4650,  4687,  4787,  4865,  4931,  4934,  5013,  5060,
        5178,  5455,  5556,  5721,  6360,  6418,  6497,  6514,  6612,
        6723,  6767,  6849,  7262,  7298,  7430,  7443,  7688,  7896,
        7978,  8614,  9386,  9562, 10520, 10566, 10605, 11678, 13550,
       14084, 14344, 15020, 15024, 15831, 20051, 22040, 25124, 25236,
       27828, 34095, 41310, 99999], dtype=int64)

##### Just like education-num, capital-gain takes a countable set of values, so we don't have to bin it. However, the values are too large to feed to the model directly, so we hash them: each distinct value is mapped to a small number (e.g. 114 ---> hash ---> 1).

#### Transform capital-loss

In [26]:
# Pull the capital-loss column and display its distinct values in ascending order.
captial_loss = train['capital-loss']
np.unique(captial_loss.values)

array([   0,  155,  213,  323,  419,  625,  653,  810,  880,  974, 1092,
       1138, 1258, 1340, 1380, 1408, 1411, 1429, 1485, 1504, 1510, 1539,
       1564, 1573, 1579, 1590, 1594, 1602, 1617, 1628, 1648, 1651, 1668,
       1669, 1672, 1719, 1721, 1726, 1740, 1741, 1755, 1762, 1816, 1825,
       1844, 1848, 1870, 1876, 1887, 1902, 1911, 1944, 1974, 1977, 1980,
       2001, 2002, 2042, 2051, 2057, 2080, 2129, 2149, 2163, 2174, 2179,
       2205, 2206, 2231, 2238, 2246, 2258, 2267, 2282, 2339, 2352, 2377,
       2392, 2415, 2444, 2457, 2465, 2467, 2472, 2489, 2547, 2559, 2603,
       2754, 2824, 3004, 3175, 3683, 3770, 3900, 4356], dtype=int64)

##### Same as capital-gain: hash it too.

#### Transform hours-per-week, hours-worked-per-week

In [27]:
# Pull the hours-per-week column and display its distinct values in ascending order.
hours_per_week = train['hours-per-week']
np.unique(hours_per_week.values)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 88,
       89, 90, 91, 92, 94, 95, 96, 97, 98, 99], dtype=int64)

###### Just like the variables above, it takes a limited set of values; its values are already small, so it doesn't need to be hashed

#### For the remaining features (variables), nominal or not, we are going to replace each category with a distinct number so that the model can use them