# **Creating Training Data for Decision Trees**

In [1]:
import numpy as np
import pandas as pd

## **Preparing data**

In [20]:
# Parse the CSV (the first line is skipped), then cast every value to an
# integer; the float-to-int cast discards the fractional part.
original_data = np.genfromtxt(
    fname="cal_housing.csv",
    skip_header=1,
    delimiter=",",
).astype(int)

Shuffling the rows in place (no `decision` column is removed here; the labels are generated further below):

In [21]:
# Seed NumPy's global RNG so that the shuffle below, and every later
# np.random call in this notebook, gives the same result on a fresh
# "Restart & Run All". Change SEED to generate a different data set.
SEED = 42
np.random.seed(SEED)

# Shuffles the rows in place (only the first axis is permuted).
np.random.shuffle(original_data)

In [22]:
# Display the shuffled integer table; NumPy abbreviates the printed form of
# large arrays with "...".
original_data

array([[   2,   17,    4, ...,    5,   36, -120],
       [   4,   48,    5, ...,    3,   37, -122],
       [   6,   36,    6, ...,    3,   34, -118],
       ...,
       [   2,   52,    4, ...,    3,   34, -118],
       [   4,   35,    5, ...,    3,   34, -118],
       [   4,   48,    5, ...,    2,   34, -118]])

Converting textual values to numeric:

In [5]:
#helpers:
def convert_text_to_num(df):
    '''Converts categorical data to numerical in the passed pandas.DataFrame
       -----------
       parameters:
         - df: a pandas.DataFrame containing categorical data
               (only object-dtype columns are treated as categorical;
               the passed frame itself is not modified)
       returns:
         - the converted DataFrame containing only numerical data
         - a map that represents the executed conversion, in the form
           {column: {code: original value}}; it only has entries for
           the columns that were converted
    '''

    df_return = pd.DataFrame()
    return_map = {}
    for c in df.columns:
        column = df[c]
        #non-categorical columns are carried over unchanged:
        if column.dtype != object:
            df_return[c] = column
            continue
        #codes are assigned in order of first appearance in the column:
        code_to_value = dict(enumerate(pd.unique(column)))
        value_to_code = {val: code for code, val in code_to_value.items()}
        #Series.map yields a real integer column. Series.replace produced an
        #object-dtype result and depended on silent downcasting (deprecated
        #in pandas 2.2) to turn it into numbers.
        df_return[c] = column.map(value_to_code)
        return_map[c] = code_to_value #what have we done (inverse of the conversion)
    return df_return, return_map

def convert_num_to_text(df, return_map):
    '''Restores the categorical values of a numerically encoded DataFrame.
       (Inverse of the convert_text_to_num function)
       -----------
       parameters:
         - df: a pandas.DataFrame containing only numerical data
         - return_map: the map representing the executed conversion
                       ({column: {code: original value}})
       returns:
         - a new pandas.DataFrame in which the columns listed in
           return_map hold the original values again; all other
           columns are copied over as they are.
    '''
    restored = pd.DataFrame()
    for name in df.columns:
        values = df[name]
        if name not in return_map:
            #this column was never converted, keep it untouched
            restored[name] = values
            continue
        #swap every code for the value it stands for
        restored[name] = values.replace(return_map[name])
    return restored

In [6]:
# convert_text_to_num reads .columns and .dtypes, which a NumPy array does not
# have, so the array is wrapped in a DataFrame before the call.
converted_df, conv_map = convert_text_to_num(pd.DataFrame(original_data))
conv_map

{}

## **Creating fake decisions**

In [7]:
# conv_map is keyed by column name, so its keys are the converted columns.
keys = list(conv_map)
keys

[]

In [8]:
# Plain assignment: converted_df now refers to the same array object as
# original_data (no copy is made).
converted_df = original_data

In [25]:
# Keep drawing random threshold rules until the share of rows labelled True
# lies between 20% and 80% (the all-False start forces at least one draw for
# non-empty data).
n_rows = len(original_data)
decision = np.zeros(n_rows, dtype=bool)
while not (0.2 * len(decision) <= np.sum(decision) <= 0.8 * len(decision)):
    # A candidate rule uses 2, 3 or 4 distinct columns (upper bound 5 is exclusive).
    dec_num = np.random.randint(2, 5, 1)[0]
    dec_features = np.random.choice(np.arange(0, original_data.shape[1]), size=dec_num, replace=False)
    decision = np.ones(n_rows, dtype=bool)
    for feature in dec_features:
        column = original_data[:, feature]
        # The threshold is one of the values that actually occurs in the column.
        limit = np.random.choice(np.unique(column), size=1)[0]
        print(feature, limit)
        # A row stays True only while it exceeds the threshold of every chosen column.
        decision = decision & (column > limit)
    print("============")

6 35
3 10
5 7
3 4
6 40
0 8
2 8
5 9
0 4
1 13
7 -119
5 64
1 14
3 1
2 4
6 34
3 9
0 15
4 1563
5 41
0 12
4 1029
6 36
5 51
0 9
4 4828
7 -122
1 20
2 53
6 37
0 11
7 -123
4 1091
3 1
1 43
6 33
0 1
4 737
1 2
0 14
7 -122
1 10
4 2609
6 34
0 11
0 13
7 -114
1 27
4 1666
6 41
4 1255
7 -114
1 27
4 3946
6 39
2 48
5 2
4 2041
6 37
4 59
5 83
5 8
2 5
6 36
7 -118
1 29
2 11
6 38
6 37
2 48
4 955
5 2
6 34


In [26]:
# Fraction of rows labelled True (the mean of a boolean array is count / length).
np.mean(decision)

0.20067829457364342

In [35]:
# Append the labels as a final column (the booleans become 0/1 when stacked
# with the integer features), then export three views of the table:
#   evaluate.csv - every row, with the label
#   train.csv    - the first 150 rows, with the label
#   test.csv     - every row, without the label column
label_column = decision.reshape(-1, 1)
created_eval_data = np.hstack((original_data.astype(int), label_column))
for csv_path, rows in (
    ("../3_evaluation/evaluate.csv", created_eval_data),
    ("../3_evaluation/train.csv", created_eval_data[:150]),
    ("../3_evaluation/test.csv", created_eval_data[:, :-1]),
):
    np.savetxt(csv_path, rows, delimiter=",", fmt="%d")
#converted_df.to_csv("../3_evaluation/evaluate.csv", index=False)
#converted_df.sample(n=150).to_csv("../2_solution/train.csv", index=False)

In [13]:
# Derive the unlabeled test set from the exported evaluation file.
# evaluate.csv is written with np.savetxt, so it has no header row and no
# column named "decision": the label is the last column and is dropped by
# position. The result is written without a header, like the other exports.
evaluation_rows = np.loadtxt("../3_evaluation/evaluate.csv", delimiter=",", dtype=int, ndmin=2)
test = evaluation_rows[:, :-1]
np.savetxt("../2_solution/test.csv", test, delimiter=",", fmt="%d")