# Kaggle Forest Challenge - Modelling

In [6]:
# Import useful packages
import pandas as pd
import numpy as np
import os
import seaborn as sns
from matplotlib import pyplot as plt
import time
from datetime import datetime

# Import ML packages 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier	
from sklearn.ensemble import HistGradientBoostingClassifier				

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from lightgbm import LGBMClassifier

from catboost import CatBoostClassifier

# Import my functions 


In [4]:
os.chdir('/Users/camilleepitalon/Documents/DSB/11_machine_learning_2/Project')
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test-full.csv')

## Preprocessing

In [3]:
columns = list(df_test.columns)
num_columns = [col for col in columns if 'Soil_Type' not in col and 'Wilderness_Area' not in col and 'Id' not in col]
dummy_cols = [col for col in columns if col not in num_columns and 'Id' not in col]

In [4]:
X_train, X_test, y_train, y_test = split_trainset(df_train, test_size=0.2)
X, y = split_trainset(df_train, test_size=0)

In [59]:
X_train['Soil_Type15'].sum()

295

In [6]:
def climate_and_geo(df, climate=True, geographic=True, soil_dict=soil_dict):

    """ 
    Add climate zone and/or geographical zone to df
    """
    
    df_new = df.copy()

    for col in soil_dict.keys():
        df_new[col] = df_new[col] * int(soil_dict[col][0])

    if climate and geographic:
        df_new['ELU'] = df_new[list(soil_dict.keys())].sum(axis=1)
        df_new['ClimZone'] = df_new['ELU'].apply(lambda x : int(str(x)[0:1]))
        df_new['GeoZone'] = df_new['ELU'].apply(lambda x : int(str(x)[1:2]))
        df_new.drop(columns=['ELU'], inplace=True)
        return df_new
    
    elif climate:
        df_new['ELU'] = df_new[list(soil_dict.keys())].sum(axis=1)
        df_new['ClimZone'] = df_new['ELU'].apply(lambda x : int(str(x)[0:1]))
        df_new.drop(columns=['ELU'], inplace=True)
        return df_new

    else:
        df_new['ELU'] = df_new[list(soil_dict.keys())].sum(axis=1)
        df_new['GeoZone'] = df_new['ELU'].apply(lambda x : int(str(x)[1:2]))
        df_new.drop(columns=['ELU'], inplace=True)
        return df_new

In [7]:
X_new = climate_and_geo(X_train)
X_new.iloc[:,-80:]

KeyError: 0

In [None]:
print(family_dict)

{'F_Cathedral': 'Cathedral', 'F_Ratake': 'Ratake', 'F_Vanet': 'Vanet', 'F_Gothic': 'Gothic', 'F_Troutville': 'Troutville', 'F_Legault': 'Legault', 'F_Catamount': 'Catamount', 'F_Bullwark': 'Bullwark', 'F_Gateview': 'Gateview', 'F_Rogert': 'Rogert', 'F_Leighcan': 'Leighcan', 'F_Como': 'Como', 'F_Bross': 'Bross', 'F_Moran': 'Moran'}


In [116]:
def zone_info(df, climate=True, geographic=True, family=True, rock=True, stony=True, drop_initial_rows=True):

    """ 
    Add climate zone and/or geographical zone to df
    """

    df_temp = df.copy()
    
    if drop_initial_rows:
        soil_columns = [col for col in list(df.columns) if 'Soil_Type' in col]
        df_new = df.drop(columns=soil_columns)
    
    else: 
        df_new = df.copy()

    for col in elu_dict.keys():
        df_temp[col] = df_temp[col] * elu_dict[col]
        df_temp['ELU'] = df_temp[list(soil_dict.keys())].sum(axis=1)
    
    if climate:
        df_temp['ClimZone'] = df_temp['ELU'].apply(lambda x : int(str(x)[0:1]))
        df_new = df_new.join(pd.get_dummies(df_temp['ClimZone'], prefix='ClimZone'))

    if geographic:
        df_temp['GeoZone'] = df_temp['ELU'].apply(lambda x : int(str(x)[1:2]))
        df_new = df_new.join(pd.get_dummies(df_temp['GeoZone'], prefix='GeoZone'))

    if family or rock_dict or stony_dict:
        df_temp['ZoneDesc'] = df_temp['ELU'].apply(lambda x: desc_dict[x])

    if family:
        for fam in family_dict.keys():
            df_new[fam] = df_temp['ZoneDesc'].apply(lambda x: np.where(family_dict[fam] in x, 1, 0))
    
    if rock:
        for rock in rock_dict.keys():
            df_new[rock] = df_temp['ZoneDesc'].apply(lambda x: np.where(rock_dict[rock] in x, 1, 0))

    if stony:
        for stone in stony_dict.keys():
            df_new[stone] = df_temp['ZoneDesc'].apply(lambda x: np.where(stony_dict[stone] in x, 1, 0))

    return df_new

In [23]:
for fam in family_dict.keys():
    print(family_dict[fam])

Cathedral
Ratake
Vanet
Gothic
Troutville
Legault
Catamount
Bullwark
Gateview
Rogert
Leighcan
Como
Bross
Moran


In [98]:
X_new = zone_info(X, drop_row=False)
""" 
for fam in family_dict.keys():
    X_new[fam] = X_new["ZoneDesc"].apply(lambda x: np.where(family_dict[fam] in x, 1, 0))
 """

X_new.shape

(15120, 98)

In [80]:
X_new.join(pd.get_dummies(X_new['ClimZone'], prefix='ClimZone'))

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Wilderness_Area4,ClimZone,GeoZone,ClimZone_2,ClimZone_3,ClimZone_4,ClimZone_5,ClimZone_6,ClimZone_7,ClimZone_8
0,242642,2881,130,22,210,54,1020,250,221,88,...,0,7,7,0,0,0,0,0,1,0
1,309891,3005,351,14,242,-16,1371,194,215,159,...,0,7,7,0,0,0,0,0,1,0
2,287847,3226,63,14,618,2,1092,232,210,107,...,0,7,7,0,0,0,0,0,1,0
3,516307,3298,317,8,661,60,752,198,233,174,...,0,7,2,0,0,0,0,0,1,0
4,124860,3080,35,6,175,26,3705,219,227,144,...,0,7,7,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15115,475155,3328,321,13,323,12,5109,186,227,180,...,0,8,7,0,0,0,0,0,0,1
15116,514378,3455,37,5,841,92,939,220,229,146,...,0,8,7,0,0,0,0,0,0,1
15117,368425,3279,90,14,404,113,1513,240,218,105,...,0,7,7,0,0,0,0,0,1,0
15118,537844,3589,357,9,418,52,1868,205,223,155,...,0,8,7,0,0,0,0,0,0,1


In [99]:
for col in list(X_new.columns):
    if X_new[col].sum() < 1:
        print(col, X_new[col].sum())

Soil_Type15 0


In [37]:
list(soil_dict.values())[:]

[{'2702': 'Cathedral family - Rock outcrop complex, extremely stony'},
 {'2703': 'Vanet - Ratake families complex, very stony'},
 {'2704': 'Haploborolis - Rock outcrop complex, rubbly'},
 {'2705': 'Ratake family - Rock outcrop complex, rubbly'},
 {'2706': 'Vanet family - Rock outcrop complex complex, rubbly'},
 {'2717': 'Vanet - Wetmore families - Rock outcrop complex, stony'},
 {'3501': 'Gothic family'},
 {'3502': 'Supervisor - Limber families complex'},
 {'4201': 'Troutville family, very stony'},
 {'4703': 'Bullwark - Catamount families - Rock outcrop complex, rubbly'},
 {'4704': 'Bullwark - Catamount families - Rock land complex, rubbly'},
 {'4744': 'Legault family - Rock land complex, stony'},
 {'4758': 'Catamount family - Rock land - Bullwark family complex, rubbly'},
 {'5101': 'Pachic Argiborolis - Aquolis complex'},
 {'5151': 'unspecified in the USFS Soil and ELU Survey'},
 {'6101': 'Cryaquolis - Cryoborolis complex'},
 {'6102': 'Gateview family - Cryaquolis complex'},
 {'6731':

In [11]:
from my_dictionaries import elu_dict
X_new = zone_info(X_train)
X_new

AttributeError: 'int' object has no attribute 'keys'

In [105]:
# Test tuned LGBMClassifier on df_train + climate and geo


X_test1 = zone_info(df_train)
print(X_test1.shape)
model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)

accuracy2b, matrix2b = local_metrics(
    X_test1, model, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy2b)
matrix2b

(15120, 99)




0.8974867724867724


Unnamed: 0,1,2,3,4,5,6,7
1,349,53,0,0,11,1,23
2,49,343,8,0,28,9,1
3,0,2,372,14,9,22,0
4,0,0,10,393,0,3,0
5,1,15,5,0,420,2,0
6,0,5,20,9,2,402,0
7,8,0,0,0,0,0,435


In [107]:
# Test tuned LGBMClassifier on df_train + climate and geo


X_test2 = zone_info(df_train, drop_initial_rows=True)
print(X_test2.shape)
model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)

accuracy2b, matrix2b = local_metrics(
    X_test2, model, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy2b)
matrix2b

(15120, 59)




0.8991402116402116


Unnamed: 0,1,2,3,4,5,6,7
1,347,56,0,0,10,1,23
2,49,344,8,0,27,9,1
3,0,2,372,13,9,23,0
4,0,0,9,395,0,2,0
5,1,16,5,0,420,1,0
6,0,4,17,9,2,406,0
7,8,0,0,0,0,0,435


In [108]:
# Test tuned LGBMClassifier on df_train + climate and geo


X_test3 = zone_info(df_train, climate=True, geographic=True, family=False, rock=False, stony=False, drop_initial_rows=True)
print(X_test3.shape)
model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)

accuracy2b, matrix2b = local_metrics(
    X_test3, model, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy2b)
matrix2b

(15120, 27)




0.8971560846560847


Unnamed: 0,1,2,3,4,5,6,7
1,345,56,0,0,10,3,23
2,43,349,7,0,28,9,2
3,0,3,372,12,6,26,0
4,0,0,9,394,0,3,0
5,0,16,5,0,420,2,0
6,0,5,24,8,2,399,0
7,9,0,0,0,0,0,434


In [109]:
# Test tuned LGBMClassifier on df_train + climate and geo


X_test4 = zone_info(df_train, climate=False, geographic=False, family=True, rock=True, stony=True, drop_initial_rows=True)
print(X_test3.shape)
model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)

accuracy2b, matrix2b = local_metrics(
    X_test4, model, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy2b)
matrix2b

(15120, 27)




0.9021164021164021


Unnamed: 0,1,2,3,4,5,6,7
1,349,50,0,0,12,2,24
2,45,349,7,0,26,10,1
3,0,2,374,14,8,21,0
4,0,0,7,397,0,2,0
5,1,15,4,0,422,1,0
6,0,5,20,9,2,402,0
7,8,0,0,0,0,0,435


In [110]:
# Test tuned LGBMClassifier on df_train + climate and geo


X_test5 = zone_info(df_train, climate=False, geographic=False, family=True, rock=False, stony=False, drop_initial_rows=True)
print(X_test3.shape)
model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)

accuracy2b, matrix2b = local_metrics(
    X_test5, model, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy2b)
matrix2b

(15120, 27)




0.8978174603174603


Unnamed: 0,1,2,3,4,5,6,7
1,352,48,0,0,13,1,23
2,47,346,7,0,28,9,1
3,0,2,367,13,8,29,0
4,0,0,7,397,0,2,0
5,0,16,5,0,420,2,0
6,0,5,19,9,4,401,0
7,11,0,0,0,0,0,432


In [111]:
# Test tuned LGBMClassifier on df_train + climate and geo


X_test5 = zone_info(df_train, climate=False, geographic=False, family=False, rock=True, stony=False, drop_initial_rows=True)
print(X_test5.shape)
model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)

accuracy2b, matrix2b = local_metrics(
    X_test5, model, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy2b)
matrix2b

(15120, 30)




0.9004629629629629


Unnamed: 0,1,2,3,4,5,6,7
1,352,51,0,0,12,2,20
2,46,349,9,0,24,8,2
3,0,2,368,11,8,30,0
4,0,0,7,395,0,4,0
5,1,15,1,0,423,3,0
6,0,4,23,8,2,401,0
7,8,0,0,0,0,0,435


In [112]:
# Test tuned LGBMClassifier on df_train + climate and geo


X_test5 = zone_info(df_train, climate=False, geographic=False, family=False, rock=False, stony=True, drop_initial_rows=True)
print(X_test5.shape)
model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)

accuracy2b, matrix2b = local_metrics(
    X_test5, model, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy2b)
matrix2b

(15120, 20)




0.8968253968253969


Unnamed: 0,1,2,3,4,5,6,7
1,346,53,0,0,12,2,24
2,42,347,9,0,30,8,2
3,0,2,371,13,6,27,0
4,0,0,7,396,0,3,0
5,0,14,3,0,425,1,0
6,0,5,26,10,4,393,0
7,9,0,0,0,0,0,434


In [113]:
# Test tuned LGBMClassifier on df_train + climate and geo


X_test5 = zone_info(df_train, climate=True, geographic=False, family=False, rock=False, stony=False, drop_initial_rows=True)
print(X_test5.shape)
model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)

accuracy2b, matrix2b = local_metrics(
    X_test5, model, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy2b)
matrix2b

(15120, 23)




0.8984788359788359


Unnamed: 0,1,2,3,4,5,6,7
1,355,45,0,0,12,3,22
2,48,341,8,0,30,9,2
3,0,3,372,12,7,25,0
4,0,0,8,395,0,3,0
5,1,16,4,0,420,2,0
6,0,5,23,8,2,400,0
7,9,0,0,0,0,0,434


In [114]:
# Test tuned LGBMClassifier on df_train + climate and geo


X_test5 = zone_info(df_train, climate=False, geographic=True, family=False, rock=False, stony=False, drop_initial_rows=True)
print(X_test5.shape)
model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)

accuracy2b, matrix2b = local_metrics(
    X_test5, model, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy2b)
matrix2b

(15120, 20)




0.8948412698412699


Unnamed: 0,1,2,3,4,5,6,7
1,347,52,0,0,12,3,23
2,47,345,8,0,28,8,2
3,0,4,368,12,7,28,0
4,0,0,8,395,0,3,0
5,0,15,5,0,421,2,0
6,0,6,22,9,4,397,0
7,9,1,0,0,0,0,433


In [115]:
# Test tuned LGBMClassifier on df_train + climate and geo


X_test5 = zone_info(df_train, climate=True, geographic=False, family=False, rock=True, stony=False, drop_initial_rows=True)
print(X_test5.shape)
model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)

accuracy2b, matrix2b = local_metrics(
    X_test5, model, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy2b)
matrix2b

(15120, 37)




0.8984788359788359


Unnamed: 0,1,2,3,4,5,6,7
1,343,58,0,0,11,2,23
2,45,350,7,0,27,8,1
3,0,2,375,10,9,23,0
4,0,0,9,394,0,3,0
5,1,15,5,0,420,2,0
6,0,4,23,8,2,401,0
7,9,0,0,0,0,0,434


In [9]:
def main(train_df, test_df):
    # this is public leaderboard ratio
    start = datetime.now()
    type_ratio = np.array([0.37053, 0.49681, 0.05936, 0.00103, 0.01295, 0.02687, 0.03242])
    
    total_df = pd.concat([train_df.iloc[:, :-1], test_df])
    
    # Aspect
    total_df["Aspect_Sin"] = np.sin(np.pi*total_df["Aspect"]/180)
    total_df["Aspect_Cos"] = np.cos(np.pi*total_df["Aspect"]/180)
    print("Aspect", (datetime.now() - start).seconds)
    
    # Hillshade
    hillshade_col = ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]
    for col1, col2 in combinations(hillshade_col, 2):
        total_df[col1 + "_add_" + col2] = total_df[col2] + total_df[col1]
        total_df[col1 + "_dif_" + col2] = total_df[col2] - total_df[col1]
        total_df[col1 + "_div_" + col2] = (total_df[col2]+0.01) / (total_df[col1]+0.01)
        total_df[col1 + "_abs_" + col2] = np.abs(total_df[col2] - total_df[col1])
    
    total_df["Hillshade_mean"] = total_df[hillshade_col].mean(axis=1)
    total_df["Hillshade_std"] = total_df[hillshade_col].std(axis=1)
    total_df["Hillshade_max"] = total_df[hillshade_col].max(axis=1)
    total_df["Hillshade_min"] = total_df[hillshade_col].min(axis=1)
    print("Hillshade", (datetime.now() - start).seconds)
    
    # Hydrology ** I forgot to add arctan
    total_df["Degree_to_Hydrology"] = ((total_df["Vertical_Distance_To_Hydrology"] + 0.001) /
                                       (total_df["Horizontal_Distance_To_Hydrology"] + 0.01))
    
    # Holizontal
    horizontal_col = ["Horizontal_Distance_To_Hydrology",
                      "Horizontal_Distance_To_Roadways",
                      "Horizontal_Distance_To_Fire_Points"]
    
    
    for col1, col2 in combinations(hillshade_col, 2):
        total_df[col1 + "_add_" + col2] = total_df[col2] + total_df[col1]
        total_df[col1 + "_dif_" + col2] = total_df[col2] - total_df[col1]
        total_df[col1 + "_div_" + col2] = (total_df[col2]+0.01) / (total_df[col1]+0.01)
        total_df[col1 + "_abs_" + col2] = np.abs(total_df[col2] - total_df[col1])
    print("Holizontal", (datetime.now() - start).seconds)
    
    
    def categorical_post_mean(x):
        p = (x.values)*type_ratio
        p = p/p.sum()*x.sum() + 10*type_ratio
        return p/p.sum()
    
    # Wilder
    wilder = pd.DataFrame([(train_df.iloc[:, 11:15] * np.arange(1, 5)).sum(axis=1),
                          train_df.Cover_Type]).T
    wilder.columns = ["Wilder_Type", "Cover_Type"]
    wilder["one"] = 1
    piv = wilder.pivot_table(values="one",
                             index="Wilder_Type",
                             columns="Cover_Type",
                             aggfunc="sum").fillna(0)
    
    tmp = pd.DataFrame(piv.apply(categorical_post_mean, axis=1).tolist()).reset_index()
    tmp["index"] = piv.sum(axis=1).index
    tmp.columns = ["Wilder_Type"] + ["Wilder_prob_ctype_{}".format(i) for i in range(1, 8)]
    tmp["Wilder_Type_count"] = piv.sum(axis=1).values
    
    total_df["Wilder_Type"] = (total_df.filter(regex="Wilder") * np.arange(1, 5)).sum(axis=1)
    total_df = total_df.merge(tmp, on="Wilder_Type", how="left")
    
    for i in range(7):
        total_df.loc[:, "Wilder_prob_ctype_{}".format(i+1)] = total_df.loc[:, "Wilder_prob_ctype_{}".format(i+1)].fillna(type_ratio[i])
    total_df.loc[:, "Wilder_Type_count"] = total_df.loc[:, "Wilder_Type_count"].fillna(0)
    print("Wilder_type", (datetime.now() - start).seconds)
    
    
    # Soil type
    soil = pd.DataFrame([(train_df.iloc[:, -41:-1] * np.arange(1, 41)).sum(axis=1),
                          train_df.Cover_Type]).T
    soil.columns = ["Soil_Type", "Cover_Type"]
    soil["one"] = 1
    piv = soil.pivot_table(values="one",
                           index="Soil_Type",
                           columns="Cover_Type",
                           aggfunc="sum").fillna(0)
    
    tmp = pd.DataFrame(piv.apply(categorical_post_mean, axis=1).tolist()).reset_index()
    tmp["index"] = piv.sum(axis=1).index
    tmp.columns = ["Soil_Type"] + ["Soil_prob_ctype_{}".format(i) for i in range(1, 8)]
    tmp["Soil_Type_count"] = piv.sum(axis=1).values
    
    total_df["Soil_Type"] = (total_df.filter(regex="Soil") * np.arange(1, 41)).sum(axis=1)
    total_df = total_df.merge(tmp, on="Soil_Type", how="left")
    
    for i in range(7):
        total_df.loc[:, "Soil_prob_ctype_{}".format(i+1)] = total_df.loc[:, "Soil_prob_ctype_{}".format(i+1)].fillna(type_ratio[i])
    total_df.loc[:, "Soil_Type_count"] = total_df.loc[:, "Soil_Type_count"].fillna(0)
    print("Soil_type", (datetime.now() - start).seconds)
    
    icol = total_df.select_dtypes(np.int64).columns
    fcol = total_df.select_dtypes(np.float64).columns
    total_df.loc[:, icol] = total_df.loc[:, icol].astype(np.int32)
    total_df.loc[:, fcol] = total_df.loc[:, fcol].astype(np.float32)
    return total_df

total_df = main(df_train, df_test)
one_col = total_df.filter(regex="(Type\d+)|(Area\d+)").columns
total_df = total_df.drop(one_col, axis=1)

Aspect 0
Hillshade 1
Holizontal 1
Wilder_type 2
Soil_type 3


In [8]:
from itertools import combinations, chain
from datetime import datetime

In [10]:
total_df

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Wilder_Type_count,Soil_Type,Soil_prob_ctype_1,Soil_prob_ctype_2,Soil_prob_ctype_3,Soil_prob_ctype_4,Soil_prob_ctype_5,Soil_prob_ctype_6,Soil_prob_ctype_7,Soil_Type_count
0,242642,2881,130,22,210,54,1020,250,221,88,...,3568.0,30,0.304551,0.625727,0.000796,0.000014,0.061641,0.000360,0.006911,736.0
1,309891,3005,351,14,242,-16,1371,194,215,159,...,6302.0,24,0.485596,0.491459,0.002159,0.000037,0.003054,0.005741,0.011955,265.0
2,287847,3226,63,14,618,2,1092,232,210,107,...,3568.0,29,0.381510,0.603750,0.000450,0.000008,0.007517,0.000204,0.006561,1308.0
3,516307,3298,317,8,661,60,752,198,233,174,...,569.0,23,0.574680,0.404116,0.000789,0.000014,0.008334,0.000477,0.011590,742.0
4,124860,3080,35,6,175,26,3705,219,227,144,...,3568.0,24,0.485596,0.491459,0.002159,0.000037,0.003054,0.005741,0.011955,265.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
596127,581008,2396,153,20,85,17,108,240,237,118,...,6302.0,2,0.005817,0.135867,0.667654,0.003620,0.029771,0.156764,0.000509,627.0
596128,581009,2391,152,19,67,12,95,240,237,119,...,6302.0,2,0.005817,0.135867,0.667654,0.003620,0.029771,0.156764,0.000509,627.0
596129,581010,2386,159,17,60,7,90,236,241,130,...,6302.0,2,0.005817,0.135867,0.667654,0.003620,0.029771,0.156764,0.000509,627.0
596130,581011,2384,170,15,60,5,90,230,245,143,...,6302.0,2,0.005817,0.135867,0.667654,0.003620,0.029771,0.156764,0.000509,627.0


In [11]:
total_df.columns

Index(['Id', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Aspect_Sin',
       'Aspect_Cos', 'Hillshade_9am_add_Hillshade_Noon',
       'Hillshade_9am_dif_Hillshade_Noon', 'Hillshade_9am_div_Hillshade_Noon',
       'Hillshade_9am_abs_Hillshade_Noon', 'Hillshade_9am_add_Hillshade_3pm',
       'Hillshade_9am_dif_Hillshade_3pm', 'Hillshade_9am_div_Hillshade_3pm',
       'Hillshade_9am_abs_Hillshade_3pm', 'Hillshade_Noon_add_Hillshade_3pm',
       'Hillshade_Noon_dif_Hillshade_3pm', 'Hillshade_Noon_div_Hillshade_3pm',
       'Hillshade_Noon_abs_Hillshade_3pm', 'Hillshade_mean', 'Hillshade_std',
       'Hillshade_max', 'Hillshade_min', 'Degree_to_Hydrology', 'Wilder_Type',
       'Wilder_prob_ctype_1', 'Wilder_prob_ctype_2', 'Wilder_prob_ctype_3',
       'Wilder_prob_ctype_4', 'Wilder_prob_c

In [18]:
soil = pd.DataFrame([(df_train.iloc[:, -41:-1] * np.arange(1, 41)).sum(axis=1),
                          df_train.Cover_Type]).T
soil.columns = ["Soil_Type", "Cover_Type"]
soil["one"] = 1

piv = soil.pivot_table(values="one",
                           index="Soil_Type",
                           columns="Cover_Type",
                           aggfunc="sum").fillna(0)

In [19]:
soil

Unnamed: 0,Soil_Type,Cover_Type,one
0,30,1,1
1,24,1,1
2,29,1,1
3,23,1,1
4,24,1,1
...,...,...,...
15115,38,7,1
15116,40,7,1
15117,29,7,1
15118,40,7,1


In [20]:
piv

Cover_Type,1,2,3,4,5,6,7
Soil_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.0,0.0,110.0,135.0,0.0,94.0,0.0
2,0.0,7.0,305.0,95.0,62.0,158.0,0.0
3,0.0,11.0,169.0,803.0,0.0,23.0,0.0
4,1.0,31.0,448.0,131.0,141.0,76.0,11.0
5,0.0,0.0,64.0,36.0,0.0,81.0,0.0
6,0.0,10.0,250.0,262.0,0.0,157.0,0.0
7,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,2.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,3.0,0.0,0.0,0.0,0.0,0.0
10,7.0,70.0,680.0,159.0,55.0,1125.0,0.0


In [27]:
tmp = pd.DataFrame(piv.apply(categorical_post_mean, axis=1).tolist()).reset_index()
tmp["index"] = piv.sum(axis=1).index
tmp.columns = ["Soil_Type"] + ["Soil_prob_ctype_{}".format(i) for i in range(1, 8)]
tmp["Soil_Type_count"] = piv.sum(axis=1).values

In [24]:
def categorical_post_mean(x):
    p = (x.values)*type_ratio
    p = p/p.sum()*x.sum() + 10*type_ratio
    return p/p.sum()

In [26]:
type_ratio = np.array([0.37053, 0.49681, 0.05936, 0.00103, 0.01295, 0.02687, 0.03242])

In [28]:
tmp

Unnamed: 0,Soil_Type,Soil_prob_ctype_1,Soil_prob_ctype_2,Soil_prob_ctype_3,Soil_prob_ctype_4,Soil_prob_ctype_5,Soil_prob_ctype_6,Soil_prob_ctype_7,Soil_Type_count
0,1,0.010617,0.014235,0.691522,0.014719,0.000371,0.267606,0.000929,339.0
1,2,0.005817,0.135867,0.667653,0.00362,0.029771,0.156764,0.000509,627.0
2,3,0.003647,0.324284,0.58689,0.048349,0.000127,0.036384,0.000319,1006.0
3,4,0.012201,0.331584,0.563146,0.002866,0.038771,0.043507,0.007924,839.0
4,5,0.0194,0.026011,0.601875,0.005898,0.000678,0.344441,0.001697,181.0
5,6,0.005378,0.208721,0.602783,0.010961,0.000188,0.171499,0.000471,679.0
6,7,0.336855,0.542569,0.053965,0.000936,0.011773,0.024428,0.029474,1.0
7,8,0.475454,0.414019,0.049468,0.000858,0.010792,0.022392,0.027017,2.0
8,9,0.321559,0.583703,0.042401,0.000736,0.00925,0.019193,0.023158,4.0
9,10,0.025477,0.320363,0.369385,0.001502,0.006574,0.276545,0.000154,2096.0
