In [None]:
%matplotlib inline
import pandas as pd
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,BaggingRegressor,ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split,cross_val_score,KFold,StratifiedKFold,GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder,Imputer,OneHotEncoder,MinMaxScaler, RobustScaler, StandardScaler,FunctionTransformer
from sklearn.impute import SimpleImputer
from scipy.stats import skew,randint
import numpy as np
from time import time
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,classification_report,roc_curve,roc_auc_score
%load_ext autoreload
%autoreload 2

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
# Later cells place two bare expressions in one cell and rely on both being shown.
InteractiveShell.ast_node_interactivity = "all"

## Pipelines

In [None]:
# Read the Iowa housing train/test CSVs, indexed by the 'Id' column.
X = pd.read_csv('../datasets/iowa_housing/train.csv', index_col='Id')
X_test = pd.read_csv('../datasets/iowa_housing/test.csv', index_col='Id')
# pop() removes 'SalePrice' from X in place and returns it as the target.
y = X.pop('SalePrice')

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
def split_num_and_cat(df):
    """Partition the columns of ``df`` into categorical and numeric column names.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    (cat_cols, num_cols) : tuple of two lists of column names, each in the
        frame's column order.
        cat_cols - columns whose dtype is ``object``.
        num_cols - columns with any numeric dtype (bool is not included).
    """
    cat_cols = [col for col in df.columns if df[col].dtype == "object"]
    # select_dtypes("number") covers every integer/float width, so int32/float32
    # columns are no longer silently left out of both lists.
    num_cols = df.select_dtypes(include="number").columns.tolist()
    return cat_cols, num_cols
        

In [None]:
# Column-name lists (categorical, numeric) derived from the training features.
cat_cols,num_cols = split_num_and_cat(X)

In [None]:
# Split the categorical columns by cardinality: fewer than 10 distinct values is "low".
low_count_cols = [col for col in cat_cols if X[col].nunique() < 10]
# Filter cat_cols instead of taking a set difference so the result keeps the
# frame's column order and is identical from one kernel run to the next.
high_count_cols = [col for col in cat_cols if col not in low_count_cols]

In [None]:
# Number of categorical vs. numeric columns.
len(cat_cols),len(num_cols)

In [None]:
# Number of low- vs. high-cardinality categorical columns.
len(low_count_cols),len(high_count_cols)

In [None]:
def get_missing_values_percentage(df):
    """Return the percentage (0-100) of cells in ``df`` that are missing.

    Parameters
    ----------
    df : pandas.DataFrame (a Series also works)

    Returns
    -------
    float : 100 * (number of null cells) / (total number of cells);
        0.0 when ``df`` has no cells at all.
    """
    # .size is rows * columns; it replaces np.product(df.shape), which no longer
    # exists in NumPy 2.x.
    total_values = df.size
    if total_values == 0:
        # Nothing to inspect: report 0% instead of dividing by zero.
        return 0.0
    # First sum gives per-column counts, second sum collapses them to one total.
    total_missing = df.isnull().sum().sum()
    return (total_missing / total_values) * 100

In [None]:
 class myImputer(BaseEstimator, TransformerMixin):
     """DataFrame imputer that learns its fill values in fit() and reuses them in transform().

     Zeros are converted to NaN first, so they are imputed like missing values.
     NOTE(review): this also rewrites legitimate zeros - confirm that is intended.

     strategy:
         'median'        - fill gaps with the per-column median seen in fit().
         'most_frequent' - fill gaps with the per-column mode seen in fit(), then
                           replace every value with its integer category code
                           (values that were not seen in fit() get code -1).
         Any other value leaves the zero-to-NaN frame unfilled.

     Because the statistics and category lists come from fit(), a validation or
     test split is filled and encoded consistently with the training data.
     """
     def __init__(self, strategy='median'):
         print('constructor with strategy {}'.format(strategy))
         self.strategy = strategy

     def fit(self, X, y=None):
         """Learn per-column fill values (and category lists) from DataFrame ``X``."""
         print('fit called')
         self.y = y
         X = X.replace(0, np.nan)
         if self.strategy == 'median':
             self.fill_values_ = X.median()
         elif self.strategy == 'most_frequent':
             modes = X.mode()
             # mode() returns a frame (one row per tied mode); the first row holds
             # one mode per column and aligns with fillna() by column name.
             self.fill_values_ = modes.iloc[0] if len(modes) else pd.Series(dtype='object')
             filled = X.fillna(self.fill_values_)
             # Remember the category list per column so codes mean the same thing
             # on every later transform() call.
             self.categories_ = {col: filled[col].astype('category').cat.categories
                                 for col in filled.columns}
         return self

     def transform(self, X):
         """Return a filled (and, for 'most_frequent', integer-encoded) copy of ``X``."""
         if self.strategy in ('median', 'most_frequent') and not hasattr(self, 'fill_values_'):
             raise RuntimeError('myImputer.transform called before fit')
         X = X.replace(0, np.nan)  # replace() returns a new frame; the caller's data is untouched
         print('transform called')
         if self.strategy == 'median':
             X = X.fillna(self.fill_values_)
         elif self.strategy == 'most_frequent':
             # Fill first, encode second: encoding first would turn NaN into code -1
             # and leave nothing for fillna() to do.
             X = X.fillna(self.fill_values_)
             for col in X.columns:
                 X[col] = pd.Categorical(X[col], categories=self.categories_[col]).codes

         return X

In [None]:
class iowaFeatureEngineering(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends derived columns to a housing DataFrame.

    transform() needs these input columns: TotalBsmtSF, 1stFlrSF, 2ndFlrSF,
    OverallQual, OverallCond, GrLivArea, GarageArea, LotArea.
    It adds: TotalSF, OverallGrade, TotalLivArea, and the square, cube and
    square root of GrLivArea and GarageArea.
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn. ``y`` is optional so fit_transform(X) works without a target."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` with the derived columns appended."""
        # Work on a copy so the caller's frame does not silently grow new columns.
        df = df.copy()
        df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
        df["OverallGrade"] = df["OverallQual"] * df["OverallCond"]
        df['TotalLivArea'] = df['GrLivArea'] + df['GarageArea'] + df['LotArea']

        # Polynomial / root terms; the loop keeps the column order
        # <base>-2, <base>-3, <base>-Sq for each base column.
        for base in ("GrLivArea", "GarageArea"):
            df[base + "-2"] = df[base] ** 2
            df[base + "-3"] = df[base] ** 3
            df[base + "-Sq"] = np.sqrt(df[base])
        return df
   

In [None]:
# Overall percentage of missing cells in X.
get_missing_values_percentage(X)

In [None]:
#imp1 = myImputer(strategy='median')
#df1 = X[num_cols]
#imp1 = imp1.fit(df1)
#df1 = imp1.transform(df1)
#get_missing_values_percentage(df1)

In [None]:
#imp2 = myImputer(strategy='most_frequent')
#df2 = X[num_cols]
#imp2 = imp1.fit(df2)
#df2 = imp1.transform(df2)
#get_missing_values_percentage(df2)

In [None]:
#X = pd.concat([df1,df2],axis=1).reset_index()
#get_missing_values_percentage(X)

In [None]:
#imp1 = myImputer(strategy="median")
#imp2 = myImputer(strategy="most_frequent")
#n1 = imp1.fit_transform(X[num_cols])
#n2 = imp2.fit_transform(X[cat_cols])
#n1.shape,n2.shape
#X = pd.concat([n1,n2],axis=1).reset_index()
#get_missing_values_percentage(X)

In [None]:
# Numeric branch: median imputation, engineered features, then robust scaling.
num_pipeline = Pipeline(steps=[
    ('num_imputer', myImputer(strategy="median")),
    ('add_features', iowaFeatureEngineering()),
    ('scaler', RobustScaler()),
])
# Categorical branch: 'most_frequent' imputation, which also integer-encodes the
# columns. One-hot encoding is currently disabled.
cat_pipeline = Pipeline(steps=[
    ("cat_imputer", myImputer(strategy="most_frequent")),
    # ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column list to its branch; the outputs are concatenated, numeric first.
preprocessors = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols),
])


In [None]:
#fe = iowaFeatureEngineering()
#fe.fit(X,y)
#X = fe.transform(X)
#X.head()


In [None]:
#X = preprocessors.fit_transform(X)

In [None]:
# Reload the raw Iowa data so this section starts from unmodified frames.
X = pd.read_csv('../datasets/iowa_housing/train.csv', index_col='Id')
X_test = pd.read_csv('../datasets/iowa_housing/test.csv', index_col='Id')
# pop() removes 'SalePrice' from X in place and returns it as the target.
y = X.pop('SalePrice')
# Fixed-seed 80/20 train/validation split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=100)

In [None]:
# Fit the preprocessing on the training split only, then apply it to the validation split.
# Note: this rebinds X_train / X_valid to the transformed output.
X_train = preprocessors.fit_transform(X_train,y_train)
X_valid = preprocessors.transform(X_valid)
# Percentage of missing cells left in each split after preprocessing.
get_missing_values_percentage(pd.DataFrame(X_train))
get_missing_values_percentage(pd.DataFrame(X_valid))

In [None]:
# Split the raw X again: the preceding cell rebound X_train / X_valid to preprocessed
# output, while final_pipeline applies `preprocessors` itself and needs raw frames.
# Same arguments and seed as the earlier split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=100)

In [None]:
# End-to-end model: preprocessing followed by a 100-tree gradient-boosting regressor.
final_pipeline = Pipeline(steps=[
    ("preprocess", preprocessors),
    ("model", GradientBoostingRegressor(n_estimators=100, random_state=100)),
])

In [None]:
# Fit preprocessing + regressor on the training split; fit() returns the pipeline itself.
model = final_pipeline.fit(X_train,y_train)
# Predict on the validation split and report mean absolute error.
preds = model.predict(X_valid)
print('MAE:', mean_absolute_error(y_valid, preds))

In [None]:
# Row/column counts of the train and validation feature splits.
X_train.shape,X_valid.shape

In [None]:
# Reload the raw Iowa data before the hyper-parameter search.
X = pd.read_csv('../datasets/iowa_housing/train.csv', index_col='Id')
X_test = pd.read_csv('../datasets/iowa_housing/test.csv', index_col='Id')
# pop() removes 'SalePrice' from X in place and returns it as the target.
y = X.pop('SalePrice')
# Fixed-seed 80/20 train/validation split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=100)

In [None]:
# Training-split shape before preprocessing.
X_train.shape

In [None]:
# Preprocess up front this time: the search below is run on the bare regressor,
# so it needs already-transformed features.
# Note: this rebinds X_train / X_valid to the transformed output.
X_train = preprocessors.fit_transform(X_train,y_train)
X_valid = preprocessors.transform(X_valid)
# Percentage of missing cells left in each split after preprocessing.
get_missing_values_percentage(pd.DataFrame(X_train))
get_missing_values_percentage(pd.DataFrame(X_valid))

In [None]:
# Training-split shape after preprocessing (its column count bounds max_features below).
X_train.shape

In [None]:
# Search space for the gradient-boosting regressor.
params = {'n_estimators':[100,200,300,400,500,1500],
           # An integer max_features must be at least 1. randint's upper bound is
           # exclusive, so this samples 1..88; the previous lower bound of 0 could
           # produce an invalid candidate.
           "max_features": randint(1,89),
           "min_samples_split": randint(2, 11),
           "min_samples_leaf": randint(1, 11),
           "subsample":[0.6,0.7,0.75,0.8,0.9]
         }

kfold = KFold(n_splits=5, shuffle=True, random_state=0)
start = time()
# random_state on the search itself makes the 20 sampled candidates reproducible.
randomSearch_gb = RandomizedSearchCV(GradientBoostingRegressor(warm_start=True,random_state=100),
                                     param_distributions=params,n_iter=20,
                                     cv=kfold,n_jobs=6,random_state=100)
randomSearch_gb.fit(X_train,y_train)

print('training took {} minutes'.format((time() - start)/60.))

In [None]:
def score_dataset(model,X_train, X_valid, y_train, y_valid,error_fn=mean_absolute_error):
    """Score an already-fitted ``model`` on the validation split.

    ``X_train`` and ``y_train`` are accepted but not used.
    Returns ``error_fn(y_valid, model.predict(X_valid))``.
    """
    return error_fn(y_valid, model.predict(X_valid))

In [None]:
# Validation MAE of the best estimator found by the randomized search.
score_dataset(randomSearch_gb.best_estimator_,X_train, X_valid, y_train, y_valid,error_fn=mean_absolute_error)

In [None]:
# 4-fold cross-validation of the best estimator on the training split.
# No `scoring` argument is given, so the estimator's default scorer is used
# (R^2 for scikit-learn regressors) - these numbers are not MAE values.
scores = cross_val_score(randomSearch_gb.best_estimator_,X_train,y_train,cv=4)
print("Cross-validation scores: {}, mean score = {}".format(scores,scores.mean()))

## Credit Risk Assessment: A Classification Problem

In [None]:
def get_classification_results(model,X_train,y_train,X_test,y_test,target_names=None):
    """Fit ``model`` and print a summary of binary-classification metrics.

    Prints the train/test score from ``model.score``, the confusion matrix,
    F1, ROC-AUC and, when ``target_names`` is given, the classification report.

    Parameters
    ----------
    model : estimator with fit / predict / score
    X_train, y_train : data the model is fitted on
    X_test, y_test : held-out data the metrics are computed on
    target_names : optional list of display names for the classes

    Returns
    -------
    None
    """
    model = model.fit(X_train, y_train)
    print("Training set score: {:.3f}".format(model.score(X_train, y_train)))
    print("Test set score: {:.3f}".format(model.score(X_test, y_test)))
    preds = model.predict(X_test)
    confusion = confusion_matrix(y_test, preds)
    print("Confusion matrix:\n{}".format(confusion))
    print('F1 score = {:.3f}'.format(f1_score(y_test, preds)))
    # ROC-AUC ranks samples by a continuous score. Feeding it hard 0/1 labels
    # collapses the curve to a single threshold, so prefer probabilities or
    # decision values and fall back to the labels only when neither exists.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]  # probability of the positive class
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X_test)
    else:
        scores = preds
    print('ROC-AUC Score = {:.3f}'.format(roc_auc_score(y_test, scores)))
    if target_names is not None:
        print(classification_report(y_test, preds,target_names=target_names))
    

In [None]:
def binaryRandomSampler(X,target,sample_type='under',random_state=None):
    """Balance a binary (0/1) target by random under- or over-sampling.

    Parameters
    ----------
    X : pandas.DataFrame containing the ``target`` column
    target : name of the 0/1 label column
    sample_type : 'under' draws (without replacement) as many majority rows as
        there are minority rows; any other value over-samples the minority
        class with replacement up to the majority size.
    random_state : optional seed passed to ``DataFrame.sample`` for reproducible draws.

    Returns
    -------
    pandas.DataFrame with a fresh 0..n-1 index: the majority-class rows followed
    by the minority-class rows. Rows whose target is neither 0 nor 1 are not included.

    Raises
    ------
    ValueError if either class has no rows.
    """
    X_class_0 = X[X[target] == 0]
    X_class_1 = X[X[target] == 1]
    if len(X_class_0) == 0 or len(X_class_1) == 0:
        raise ValueError('binaryRandomSampler needs at least one row of each class (0 and 1)')

    # Size the classes from the subsets themselves: value_counts() is ordered by
    # frequency, not by label, so unpacking it mislabels the counts whenever
    # class 0 is the minority.
    if len(X_class_0) < len(X_class_1):
        X_lower_class, X_higher_class = X_class_0, X_class_1
    else:
        X_lower_class, X_higher_class = X_class_1, X_class_0

    if sample_type == 'under':
        X_higher_class = X_higher_class.sample(len(X_lower_class), random_state=random_state)
        header = 'Random under-sampling:'
    else:
        X_lower_class = X_lower_class.sample(len(X_higher_class), replace=True,
                                             random_state=random_state)
        header = 'Random over-sampling:'

    # drop=True discards the old index directly; it also works when the index is
    # named, where reset_index() would not create a column called 'index'.
    X = pd.concat([X_higher_class, X_lower_class], axis=0).reset_index(drop=True)

    print(header)
    print(X[target].value_counts())
    return X


In [None]:
# Load the credit-risk train/test CSVs, using the first CSV column as the index.
X = pd.read_csv('../datasets/give_me_credit/train.csv', low_memory=False,index_col=0)
X_test = pd.read_csv('../datasets/give_me_credit/test.csv', low_memory=False,index_col=0)

In [None]:
# Preview the first rows of the credit training data.
X.head()

In [None]:
# Drop every row that contains a missing value (modifies X in place),
# then confirm that no missing cells remain.
X.dropna(axis=0,inplace=True)
get_missing_values_percentage(X)

In [None]:
# Class balance of the target before any resampling.
X['SeriousDlqin2yrs'].value_counts()

In [None]:
# Balance the classes by random under-sampling (the helper's default),
# then take the target from the resampled frame.
X = binaryRandomSampler(X,'SeriousDlqin2yrs')
y = X['SeriousDlqin2yrs']

In [None]:
# Balance the classes by random over-sampling (sampling with replacement).
# NOTE(review): if the under-sampling cell above has already run, X is resampled
# a second time here - confirm whether both cells are meant to run.
# NOTE(review): resampling happens before the train/validation split further down,
# so duplicated rows can land in both splits and inflate validation metrics.
X = binaryRandomSampler(X,'SeriousDlqin2yrs',sample_type='over')
y = X['SeriousDlqin2yrs']

In [None]:
# Sanity check: features and target should have the same number of rows.
len(X),len(y)

In [None]:
# Remove the target column from both feature frames (in place); y was extracted earlier.
X.drop(['SeriousDlqin2yrs'],axis=1,inplace=True)
X_test.drop(['SeriousDlqin2yrs'],axis=1,inplace=True)

In [None]:
def convert_to_cat(df,cols):
    """Cast each column named in ``cols`` to string, modifying ``df`` in place.

    Returns None.
    """
    for name in cols:
        as_text = df[name].astype('str')
        df[name] = as_text

In [None]:
# Cast the two past-due count columns of X to strings (in place).
# NOTE(review): only X is converted here, not X_test.
past_due_cols = ['NumberOfTime30-59DaysPastDueNotWorse',
                 'NumberOfTime60-89DaysPastDueNotWorse']
convert_to_cat(X, past_due_cols)

In [None]:
# Stratified 80/20 split so both parts keep the same class ratio; fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, stratify=y, train_size=0.8, random_state=100)

In [None]:
# Shapes of the training features and training target.
X_train.shape,y_train.shape

In [None]:
# Robust scaling followed by a 100-tree random forest classifier.
credit_pl_rf = Pipeline(steps=[
    ("scaler", RobustScaler()),
    ("model", RandomForestClassifier(n_estimators=100, n_jobs=6, random_state=100)),
])

In [None]:
# Robust scaling followed by a 100-tree gradient-boosting classifier.
credit_pl_gb = Pipeline(steps=[
    ("scaler", RobustScaler()),
    ("model", GradientBoostingClassifier(n_estimators=100, random_state=100)),
])

In [None]:
# Standalone gradient-boosting classifier (no scaler, no random_state).
# NOTE(review): `gb` is not referenced by any later cell shown here.
gb = GradientBoostingClassifier(n_estimators=100,warm_start=True)

In [None]:
# Standalone random forest classifier (no scaler).
# NOTE(review): this rebinds `model`, and the new value is not referenced by any later cell shown here.
model = RandomForestClassifier(n_estimators=100,n_jobs=6, random_state=100)

In [None]:
# Fit the gradient-boosting pipeline on the training split and print its metrics on the validation split.
get_classification_results(credit_pl_gb,X_train,y_train,X_valid,y_valid,target_names=["Low Risk", "High Risk"])

In [None]:
# Fit the random-forest pipeline on the training split and print its metrics on the validation split.
get_classification_results(credit_pl_rf,X_train,y_train,X_valid,y_valid,target_names=["Low Risk", "High Risk"])