In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [12]:
train_clean = pd.read_csv('hw4-trainingset-wsa2113.csv')
train = train_clean.copy()

In [13]:
test_clean = pd.read_csv('hw4-testset-wsa2113.csv')
test = test_clean.copy()

In [14]:
cb = pd.read_csv('CodeBook-SELECT.csv')

In [15]:
print('train shape:',train.shape)
print('test shape:',test.shape)
print('code book shape:',cb.shape)

train shape: (20000, 380)
test shape: (24500, 380)
code book shape: (379, 2)


## TODO:  
* create baseline
    * drop high null variables and rows
* Data cleaning
    * drop duplicate columns
        * cntryid, cntryid_e
        * drop cntryid, as it is less specific
    * drop cols w/ high null %
    * drop rows w/ high null %
    * add is_null column for each category where value was null
    * find cols with mixed types and correct
    * convert numerical columns to int or float from objects
    * categorical variables
        * OHC categorical variables
        * make sure dtypes match up across train and test
    * missing data
        * experiment imputing with mean and median
            * check distribution of data; if there are big outliers, use median, if not, use mean
        * check all cols which have missing data and see if that data should be captured in a new feature
    * feature scaling
        * **NOTE: split and scale at the same time**
        * normalization --> min/max scaler
        * standardization (z-score normalization)
        * api
            * column transformer
                * numeric transformer
                    * imputer
                    * scaler
                * categorical transformer
                    * imputer
                    * OHE
    * dimensionality reduction
        * PCA
    * normalizing
        * yeo-johnson power transformer

# Baseline
* Remove columns
    * Drop high % null cols and rows
        * Threshold = 75%
    * Drop columns unrelated to performance
    * Drop target


* Split data into train, val, and test sets

In [16]:
from sklearn.model_selection import train_test_split

train = train\
    .dropna(thresh = len(train)*0.25, axis = 1)\
    .dropna(thresh = len(train.columns)*0.25, axis = 0)

X_train = train.drop(['uni','row','job_performance'], axis = 1)
y_train = train['job_performance']
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state = 4771)
X_test = test[X_train.columns]
y_test = test['job_performance']
print(X_train.shape)
print(y_train.shape)
# print(X_val.shape)
# print(y_val.shape)
print(X_test.shape)
print(y_test.shape)

(19994, 273)
(19994,)
(24500, 273)
(24500,)


* Identify categorical and numerical columns

In [17]:
categorical_obj_cols = X_train.dtypes == object
cat_cols = categorical_obj_cols[categorical_obj_cols==True].index.values
num_cols = categorical_obj_cols[categorical_obj_cols==False].index.values

Dropped 6 rows and 105 columns.

* Fix columns with multiple data types

In [19]:
for col in cat_cols:
    X_train[col] = X_train[col].astype('str')
#     X_val[col] = X_val[col].astype('str')
    
for col in num_cols:
    X_train[col] = X_train[col].astype('float')
#     X_val[col] = X_val[col].astype('float')

### Build pipeline
    * impute nulls
    * scale numerical columns
    * OHE categorical columns
    * Try with
        * Lasso
        * Ridge
        * RandomForestRegressor
        * SVM

In [27]:
def try_model(model):
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.model_selection import cross_val_score

    num_transform = Pipeline(
        [
            ('num_imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]
    )

    cat_transform = Pipeline(
        [
            ('cat_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]
    )

    preprocessor = ColumnTransformer(
        [
            ('num', num_transform, ~categorical_obj_cols),
            ('cat', cat_transform, categorical_obj_cols)
        ]
    )
    
    clf = Pipeline(
        [
            ('preprocessor', preprocessor),
            ('model', model)
        ]
    )
    scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='neg_mean_squared_error')
    print('Average MSE = {:.2f} +/- {:.2f}'.format(-np.mean(scores), np.std(scores)))

In [28]:
from sklearn.linear_model import Lasso

try_model(Lasso())

Average MSE = 126657.46 +/- 9624.29


In [29]:
from sklearn.linear_model import Ridge

try_model(Ridge(alpha=1.0))

Average MSE = 130017.81 +/- 17253.86


In [30]:
from sklearn.ensemble import RandomForestRegressor

try_model(RandomForestRegressor(n_estimators=10, max_depth=20))

Average MSE = 117114.50 +/- 927.40


In [31]:
from sklearn.svm import SVR

try_model(SVR(kernel='rbf', C=100, gamma='scale', epsilon=.1))

Average MSE = 122613.98 +/- 1697.94


In [33]:
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score

num_transform = Pipeline(
    [
        ('num_imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

cat_transform = Pipeline(
    [
        ('cat_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer(
    [
        ('num', num_transform, ~categorical_obj_cols),
        ('cat', cat_transform, categorical_obj_cols)
    ]
)

X_train_pr = preprocessor.fit_transform(X_train)
X_test_pr = preprocessor.transform(X_test)

Xy_train_xgb = xgb.DMatrix(data=X_train_pr, label=y_train)

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

params = {"objective":"reg:squarederror",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=Xy_train_xgb, params=params, nfold=3, num_boost_round=50,
                    early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)

mse = cv_results["test-rmse-mean"].tail(1).values[0]**2
print('Average MSE = {:.2f}'.format(mse))

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


Average MSE = 120517.21


## Take 2  
* univariate statistics
* Incorporate PCA into pipeline
* Create new encoding for high cardinality categorical columns
* add missing value column for np.nan values
* Add feature interactions for numerical columns
* bootstrapping
* complex imputer
* Grid search best models from baseline
* Normalize data with yeo-johnson power transformer
* perutation importance for features? AML lec 12

#### Univariate statistics

#### PCA

In [None]:
from sklearn.decomposition import PCA

pca = PCA().fit(data_rescaled)
#Plotting the Cumulative Summation of the Explained Variance
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Pulsar Dataset Explained Variance')
plt.show()

In [10]:
# high cardinality cols
for col in cat_cols:
    if len(train[col].unique()) > 50:
        print(col)


lng_home
cnt_h
cnt_brth
reg_tl2
v272
v52
v184
v104
v135
v235
v1
v63
v87
v239
v224
v71
v105
isic2l
isic2c
isco2c
isco2l
