In [20]:
import pandas as pd
# Read train.csv, then replace missing Age entries with the mean of the observed ages.
data = pd.read_csv("train.csv")
data = data.assign(Age=data['Age'].fillna(data['Age'].mean()))

In [21]:
# Display the DataFrame after the Age imputation (last expression of the cell is rendered).
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C


In [22]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# Part I: 
    Automatic generation of a concept hierarchy for nominal data based on the number of distinct values
    of each attribute in the given schema

Note that Name, Sex, Ticket, Cabin and Embarked are all nominal attributes. Let us automatically generate a 
concept hierarchy for these attributes.

In [23]:
# Keep only the nominal columns that the concept hierarchy is built from.
nomial_data = data.loc[:, ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']]

In [24]:
# Pair every nominal attribute with its number of distinct values.
# Series.nunique(dropna=False) counts a missing value as exactly one distinct
# value. len(set(...)) only does so when every NaN happens to be the same
# Python object, so it can over-count on columns where that does not hold.
# NOTE: the name `list` shadows the builtin; it is kept because the following
# cells read and sort `list`.
list = [[na, nomial_data[na].nunique(dropna=False)] for na in nomial_data]

In [25]:
# Show the [attribute, distinct-value count] pairs in column order.
list

[['Name', 891], ['Sex', 2], ['Ticket', 681], ['Cabin', 148], ['Embarked', 4]]

In [26]:
# Sort the pairs in place by ascending distinct-value count.
list.sort(key=lambda entry: entry[1])

In [27]:
# Show the pairs again, now ordered from fewest to most distinct values.
list

[['Sex', 2], ['Embarked', 4], ['Cabin', 148], ['Ticket', 681], ['Name', 891]]

Now we rank these nominal attributes from the highest hierarchy level (fewest distinct values) to the lowest (most distinct values)

In [28]:
# Print the attribute names only, one per line, in their sorted order.
for entry in list:
    attribute = entry[0]
    print(attribute)

Sex
Embarked
Cabin
Ticket
Name


# Part II: 
    Automatic generation of a concept hierarchy for numeric data based on equal-width partitioning 

Let us use the Age attribute as an example

In [29]:
# Equal-width partitioning setup: the covered range, the bin width,
# and the (initially empty) list that will hold one row per bin.
range_min, range_max = 0, 80
step = 10
concept_hierarchy = []

Initialize the concept_hierarchy list with one [level, bin_min, bin_max] row per bin

In [30]:
# Build one [name, bin_min, bin_max] row per equal-width bin.
# - Starts at range_min rather than a hard-coded 0, so the configured lower
#   bound is honoured.
# - Clears the list first, so re-running this cell does not append a second
#   copy of every level.
# - Uses lower/upper instead of min/max to avoid shadowing the builtins.
if step <= 0:
    # A non-positive width would keep the loop below from ever terminating.
    raise ValueError("step must be positive")
concept_hierarchy.clear()
lower = range_min
level = 0
while lower < range_max:
    upper = lower + step
    concept_hierarchy.append(['Level_' + str(level), lower, upper])
    lower = upper
    level += 1

In [31]:
# Show the bin rows: [name, bin_min, bin_max].
concept_hierarchy

[['Level_0', 0, 10],
 ['Level_1', 10, 20],
 ['Level_2', 20, 30],
 ['Level_3', 30, 40],
 ['Level_4', 40, 50],
 ['Level_5', 50, 60],
 ['Level_6', 60, 70],
 ['Level_7', 70, 80]]

In [32]:
# For every bin, count the ages that fall in (bin_min, bin_max] and record
# their count, sum and mean on the bin's row.
# - An empty bin gets a mean of NaN instead of raising ZeroDivisionError.
# - Statistics are written to positions 3.. of the row, so re-running this
#   cell replaces them instead of appending a second set.
# - The accumulator is not called `sum`, so the builtin is not shadowed.
# NOTE: the lower edge is exclusive, so an age exactly equal to the first
# bin's bin_min is not counted in any bin.
for c_hierarchy in concept_hierarchy:
    lower, upper = c_hierarchy[1], c_hierarchy[2]
    bin_count = 0
    bin_sum = 0
    for age in data['Age']:
        if lower < age <= upper:
            bin_count += 1
            bin_sum += age
    bin_mean = bin_sum / bin_count if bin_count else float('nan')
    c_hierarchy[3:] = [bin_count, bin_sum, bin_mean]

In [33]:
# Show the bin rows with statistics: [name, bin_min, bin_max, count, bin_sum, bin_mean].
concept_hierarchy

[['Level_0', 0, 10, 64, 273.16999999999996, 4.268281249999999],
 ['Level_1', 10, 20, 115, 1991.5, 17.317391304347826],
 ['Level_2', 20, 30, 407, 11104.24382352941, 27.28315435756612],
 ['Level_3', 30, 40, 155, 5433.0, 35.05161290322581],
 ['Level_4', 40, 50, 86, 3902.0, 45.372093023255815],
 ['Level_5', 50, 60, 42, 2305.5, 54.892857142857146],
 ['Level_6', 60, 70, 17, 1086.0, 63.88235294117647],
 ['Level_7', 70, 80, 5, 366.5, 73.3]]

Note that we divide the Age attribute into 8 levels (bins) of width 10, each covering the interval (bin_min, bin_max]. Each row holds 
level, bin_min, bin_max, count, bin_sum, bin_mean

# Part III: 
    Automatic generation of a concept hierarchy for numeric data based on equal-frequency partitioning 

We again use the Age attribute as an example

In [34]:
# Equal-frequency partitioning setup: start from an empty hierarchy and
# put bin_depth values into each bin.
range_min, range_max = 0, 80
bin_depth = 6
concept_hierarchy = []

In [35]:
# Number of Age values that will be distributed across the bins.
len(data['Age'])

891

In [36]:
# Sort the ages in ascending order; each value keeps its original row label as index.
Age = data['Age'].sort_values()

In [37]:
# Largest age in the data.
Age.max()

80.0

In [38]:
# Equal-frequency (equal-depth) partitioning: walk the sorted ages in
# consecutive chunks of bin_depth values and record, for each chunk,
# [name, bin_min, bin_max, bin_sum, bin_mean].
# - A final, shorter chunk holds the leftover values when len(Age) is not a
#   multiple of bin_depth; its mean divides by its actual size.
# - Levels are numbered by enumerate(), so the last level's name no longer
#   depends on a loop variable that is undefined (or stale) when there are
#   fewer than bin_depth values.
# - Positional slicing replaces the Age[Age.index[k]] label lookups, and the
#   accumulator is not called sum/min/max, so no builtin is shadowed.
# Relies on Age already being sorted ascending: the first and last element of
# each chunk are then its minimum and maximum.
sorted_ages = Age.values
for level, start in enumerate(range(0, len(sorted_ages), bin_depth)):
    chunk = sorted_ages[start:start + bin_depth]
    bin_sum = 0
    for age in chunk:  # plain left-to-right accumulation
        bin_sum += age
    bin_mean = bin_sum / len(chunk)
    concept_hierarchy.append(['Level_' + str(level), chunk[0], chunk[-1], bin_sum, bin_mean])

In [39]:
# Show the equal-frequency bins: [name, bin_min, bin_max, bin_sum, bin_mean].
concept_hierarchy

[['Level_0', 0.42, 0.83, 4.25, 0.7083333333333334],
 ['Level_1', 0.92, 1.0, 5.92, 0.9866666666666667],
 ['Level_2', 1.0, 2.0, 10.0, 1.6666666666666667],
 ['Level_3', 2.0, 2.0, 12.0, 2.0],
 ['Level_4', 3.0, 3.0, 18.0, 3.0],
 ['Level_5', 4.0, 4.0, 24.0, 4.0],
 ['Level_6', 4.0, 5.0, 26.0, 4.333333333333333],
 ['Level_7', 5.0, 7.0, 35.0, 5.833333333333333],
 ['Level_8', 7.0, 8.0, 46.0, 7.666666666666667],
 ['Level_9', 9.0, 9.0, 54.0, 9.0],
 ['Level_10', 9.0, 11.0, 60.0, 10.0],
 ['Level_11', 11.0, 14.0, 74.0, 12.333333333333334],
 ['Level_12', 14.0, 14.5, 84.5, 14.083333333333334],
 ['Level_13', 15.0, 16.0, 91.0, 15.166666666666666],
 ['Level_14', 16.0, 16.0, 96.0, 16.0],
 ['Level_15', 16.0, 16.0, 96.0, 16.0],
 ['Level_16', 16.0, 17.0, 98.0, 16.333333333333332],
 ['Level_17', 17.0, 17.0, 102.0, 17.0],
 ['Level_18', 17.0, 18.0, 103.0, 17.166666666666668],
 ['Level_19', 18.0, 18.0, 108.0, 18.0],
 ['Level_20', 18.0, 18.0, 108.0, 18.0],
 ['Level_21', 18.0, 18.0, 108.0, 18.0],
 ['Level_22', 18.0

Note that we divide the sorted Age values into bins of bin_depth=6 values each (the last bin holds the remainder). Each row holds level, bin_min, bin_max, bin_sum, bin_mean