In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,BaggingRegressor,ExtraTreesRegressor
from sklearn.linear_model import LinearRegression,LogisticRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Handling Categorical and Numerical Values

In [None]:
# Random-forest scorer used to compare different preprocessing approaches
def score_dataset_rf(X_train, X_valid, y_train, y_valid,error_fn=mean_absolute_error):
    """Fit a fixed-seed random forest and return its error on the validation data.

    A ``RandomForestRegressor(n_estimators=100, random_state=100)`` is trained
    on ``(X_train, y_train)``; its predictions for ``X_valid`` are scored with
    ``error_fn(y_valid, predictions)`` and that value is returned.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=100).fit(X_train, y_train)
    return error_fn(y_valid, forest.predict(X_valid))

In [None]:
def score_dataset_gb(X_train, X_valid, y_train, y_valid,error_fn=mean_absolute_error):
    """Fit a fixed-seed gradient-boosting regressor and return its validation error.

    A ``GradientBoostingRegressor(n_estimators=100, random_state=100)`` is
    trained on ``(X_train, y_train)``; the return value is
    ``error_fn(y_valid, predictions)`` for its predictions on ``X_valid``.
    """
    booster = GradientBoostingRegressor(n_estimators=100, random_state=100)
    booster.fit(X_train, y_train)
    valid_predictions = booster.predict(X_valid)
    validation_error = error_fn(y_valid, valid_predictions)
    return validation_error

In [None]:
def split_num_and_cat(df,cat_keep_threshold=10):
    """Split a frame into object-dtype columns and all remaining columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Columns whose dtype equals ``"object"`` are treated as
        categorical; every other column is treated as numeric.
    cat_keep_threshold : int, default 10
        An object column counts as low-cardinality when its number of distinct
        non-null values (``nunique()``) is strictly below this threshold.

    Returns
    -------
    tuple
        ``(cat_df, num_df, object_cols, low_cardinality_cols,
        high_cardinality_cols)``. All three name lists follow the column order
        of ``df``.
    """
    object_cols = [col for col in df.columns if df[col].dtype == "object"]
    low_cardinality_cols = [col for col in object_cols if df[col].nunique() < cat_keep_threshold]
    # Filter instead of using a set difference: set iteration order for strings
    # varies between interpreter runs, which made this list non-reproducible.
    low_cardinality_lookup = set(low_cardinality_cols)
    high_cardinality_cols = [col for col in object_cols if col not in low_cardinality_lookup]
    # Explicit copy so callers can assign into cat_df without touching df
    cat_df = df[object_cols].copy()
    num_df = df.drop(columns=object_cols)
    return cat_df,num_df,object_cols,low_cardinality_cols,high_cardinality_cols

In [None]:
# Load the Iowa housing train and test CSVs, using the 'Id' column as the row index
X = pd.read_csv('../datasets/iowa_housing/train.csv', index_col='Id') 
X_test = pd.read_csv('../datasets/iowa_housing/test.csv', index_col='Id')

In [None]:
# Preview the first rows of the training frame
X.head()

In [None]:
# Remember the row ids of both files as they were read
train_id, test_id = X.index, X_test.index
# Discard rows without a target value, then split the target off the predictors
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

In [None]:
# Stack train and test predictors so the following preprocessing is applied to both at once
combined = pd.concat([X,X_test])

In [None]:
# Separate object-dtype columns from the rest and bucket them by cardinality
cat_df,num_df,object_cols,low_cardinality_cols,high_cardinality_cols = split_num_and_cat(combined)

In [None]:
# Names of all object-dtype columns
object_cols

In [None]:
# Names of the remaining (non-object) columns
num_df.columns

In [None]:
# Column counts: (non-object, object)
len(num_df.columns),len(object_cols)

In [None]:
# Object columns with fewer distinct values than the threshold (default 10)
low_cardinality_cols

In [None]:
# Object columns that are not in the low-cardinality list
high_cardinality_cols

In [None]:
# Value frequencies for the 'Exterior2nd' column
combined['Exterior2nd'].value_counts()

In [None]:
my_imputer = SimpleImputer(strategy='median')
imputed_num_df = pd.DataFrame(my_imputer.fit_transform(num_df))
imputed_num_df.columns = num_df.columns

In [None]:
num_df = imputed_num_df.copy()

In [None]:
my_imputer_cat = SimpleImputer(strategy='most_frequent')
imputed_cat_df = pd.DataFrame(my_imputer_cat.fit_transform(cat_df))
imputed_cat_df.columns = cat_df.columns

In [None]:
cat_df = imputed_cat_df.copy()

In [None]:
# Integer-encode every object column. A fresh encoder is fitted per column and
# stored in `label_encoders`, so each column's category -> code mapping stays
# available afterwards (refitting a single shared encoder would keep only the
# classes of the last column).
label_encoders = {}
for col in object_cols:
    label_encoder = LabelEncoder()
    cat_df[col] = label_encoder.fit_transform(cat_df[col])
    label_encoders[col] = label_encoder

In [None]:
# Number of non-object columns after imputation
len(num_df.columns)

In [None]:
# Number of object columns after encoding
len(cat_df.columns)

In [None]:
# Preview the integer-encoded frame
cat_df.head()

In [None]:
# Join the encoded object columns and the imputed non-object columns side by side
combined = pd.concat([cat_df,num_df],axis=1)

In [None]:
# Total number of rows in the recombined frame
len(combined)

In [None]:
# Undo the earlier stacking: the first len(X) rows are training data, the rest are test data
n_train = len(X)
X, X_test = combined.iloc[:n_train], combined.iloc[n_train:]
X.shape, X_test.shape

In [None]:
# Hold out 20% of the training rows for validation; the fixed random_state makes the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)
X_train.shape,X_valid.shape

In [None]:
# Validation error (MAE by default) of the random-forest scorer on the manually preprocessed data
print(score_dataset_rf(X_train, X_valid, y_train, y_valid))

In [None]:
# Validation error (MAE by default) of the gradient-boosting scorer on the same split
print(score_dataset_gb(X_train, X_valid, y_train, y_valid))

In [None]:
import preprocess

In [None]:
# Re-read the raw files so the helper-module run starts from untouched data
X = pd.read_csv('../datasets/iowa_housing/train.csv', index_col='Id') 
X_test = pd.read_csv('../datasets/iowa_housing/test.csv', index_col='Id')

In [None]:
# Run the project-local preprocess helper, passing 'SalePrice' (presumably the target column name).
# NOTE(review): preprocess_df is defined outside this notebook; presumably it mirrors
# the manual imputation/encoding cells above — confirm against the module.
pre_processed = preprocess.preprocess_df(X,'SalePrice',X_test)

In [None]:
# Row counts of the helper's X and X_test attributes
len(pre_processed.X),len(pre_processed.X_test)

In [None]:
# Ask the helper for its train/hold-out split.
# NOTE(review): split ratio and seed are decided inside split_df — confirm in the module.
splits = pre_processed.split_df()

In [None]:
# Show which entries the split result exposes
splits.keys()

In [None]:
# The helper's 'X_test'/'y_test' entries are used as the validation set in this notebook
X_train, X_valid, y_train, y_valid = [
    splits[key] for key in ('X_train', 'X_test', 'y_train', 'y_test')
]

In [None]:
# Random-forest validation error on the helper-preprocessed data (default options)
print(score_dataset_rf(X_train, X_valid, y_train, y_valid))

In [None]:
# Gradient-boosting validation error on the same split
print(score_dataset_gb(X_train, X_valid, y_train, y_valid))

In [None]:
# Re-read the raw files before trying the next preprocessing variant
X = pd.read_csv('../datasets/iowa_housing/train.csv', index_col='Id') 
X_test = pd.read_csv('../datasets/iowa_housing/test.csv', index_col='Id')

In [None]:
# Same helper call with one_hot=True.
# NOTE(review): presumably switches categorical handling to one-hot encoding — confirm in the module.
pre_processed = preprocess.preprocess_df(X,'SalePrice',X_test,one_hot=True)

In [None]:
# Number of columns in the helper's combined frame for this variant
len(pre_processed.combined.columns)

In [None]:
# Preview the processed training frame
pre_processed.X.head() 

In [None]:
# Preview the processed test frame
pre_processed.X_test.head()

In [None]:
# Split the one_hot=True variant; the helper's 'X_test'/'y_test' entries serve as validation data
splits = pre_processed.split_df()
X_train, X_valid, y_train, y_valid = [
    splits[key] for key in ('X_train', 'X_test', 'y_train', 'y_test')
]

In [None]:
# Print the random-forest error first, then the gradient-boosting error
for scorer in (score_dataset_rf, score_dataset_gb):
    print(scorer(X_train, X_valid, y_train, y_valid))

In [None]:
# Fresh read of the raw files, then the helper with one_hot=False and RobustScaler passed as `scaler`.
# NOTE(review): the scaler class is handed over uninstantiated; how and to which
# columns it is applied is decided inside preprocess_df — confirm in the module.
X = pd.read_csv('../datasets/iowa_housing/train.csv', index_col='Id') 
X_test = pd.read_csv('../datasets/iowa_housing/test.csv', index_col='Id')
pre_processed = preprocess.preprocess_df(X,'SalePrice',X_test,one_hot=False,scaler=RobustScaler)

In [None]:
# Split the scaled variant; the helper's 'X_test'/'y_test' entries serve as validation data
splits = pre_processed.split_df()
split_keys = ('X_train', 'X_test', 'y_train', 'y_test')
X_train, X_valid, y_train, y_valid = [splits[key] for key in split_keys]

In [None]:
# Print the random-forest error first, then the gradient-boosting error
for scorer in (score_dataset_rf, score_dataset_gb):
    print(scorer(X_train, X_valid, y_train, y_valid))

In [None]:
# Full run for the one_hot=True + RobustScaler variant: read, preprocess, split, score
X = pd.read_csv('../datasets/iowa_housing/train.csv', index_col='Id')
X_test = pd.read_csv('../datasets/iowa_housing/test.csv', index_col='Id')
pre_processed = preprocess.preprocess_df(
    X, 'SalePrice', X_test, one_hot=True, scaler=RobustScaler
)
splits = pre_processed.split_df()
# The helper's 'X_test'/'y_test' entries serve as validation data
X_train, X_valid, y_train, y_valid = [
    splits[key] for key in ('X_train', 'X_test', 'y_train', 'y_test')
]
# Random forest first, then gradient boosting
for scorer in (score_dataset_rf, score_dataset_gb):
    print(scorer(X_train, X_valid, y_train, y_valid))

In [None]:
def get_regression_scores(X_train,X_test,y_train,y_test,error_fn):
    """Fit a fixed set of regressors, print each one's error and return the best.

    Every candidate is trained on ``(X_train, y_train)`` and scored with
    ``error_fn(y_test, predictions)`` using its predictions for ``X_test``.
    One line per candidate is printed in the form ``"<name> error: <value>"``
    with two decimals.

    Returns the fitted estimator with the strictly lowest error (the earliest
    candidate wins ties), or ``None`` if no error compares below infinity.
    """
    # Insertion order fixes both the printing order and tie-breaking
    candidates = {
        'RF': RandomForestRegressor(n_estimators=100, random_state=100),
        'GB': GradientBoostingRegressor(n_estimators=100, random_state=100),
        'ET': ExtraTreesRegressor(n_estimators=100, random_state=100),
        'LR': LinearRegression(),
        'Lasso': Lasso(max_iter=10000,random_state=100),
        'Ridge': Ridge(random_state=100),
        'Elastic': ElasticNet(max_iter=10000,random_state=100),
    }

    winner, lowest_error = None, np.inf
    for label, estimator in candidates.items():
        estimator.fit(X_train, y_train)
        current_error = error_fn(y_test, estimator.predict(X_test))
        print("{} error: {:.2f}".format(label, current_error))
        if current_error < lowest_error:
            winner, lowest_error = estimator, current_error

    return winner
    

In [None]:
# Compare all candidate regressors by MAE on the current split and keep the best fitted one
model = get_regression_scores(X_train,X_valid,y_train,y_valid,mean_absolute_error)

In [None]:
# Rebuild the one_hot=False + RobustScaler variant from freshly read files
X = pd.read_csv('../datasets/iowa_housing/train.csv', index_col='Id')
X_test = pd.read_csv('../datasets/iowa_housing/test.csv', index_col='Id')
pre_processed = preprocess.preprocess_df(
    X, 'SalePrice', X_test, one_hot=False, scaler=RobustScaler
)
splits = pre_processed.split_df()
# The helper's 'X_test'/'y_test' entries serve as validation data
X_train, X_valid, y_train, y_valid = [
    splits[key] for key in ('X_train', 'X_test', 'y_train', 'y_test')
]


In [None]:
# Compare all candidate regressors by MAE on the current split and keep the best fitted one
model = get_regression_scores(X_train,X_valid,y_train,y_valid,mean_absolute_error)

In [None]:
# Re-score the selected model on the validation data.
# NOTE: this is the same data the model was selected on, so the value is an
# optimistic estimate rather than an independent test score.
preds = model.predict(X_valid)
mean_absolute_error(y_valid,preds)