In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint

In [2]:
train_clean = pd.read_csv('hw4-trainingset-wsa2113.csv')
train = train_clean.copy()

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
test_clean = pd.read_csv('hw4-testset-wsa2113.csv')
test = test_clean.copy()

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
cb = pd.read_csv('CodeBook-SELECT.csv')

In [5]:
print('train shape:',train.shape)
print('test shape:',test.shape)
print('code book shape:',cb.shape)

train shape: (20000, 380)
test shape: (24500, 380)
code book shape: (379, 2)


## TODO:  
* create baseline
    * drop high null variables and rows
* Data cleaning
    * drop duplicate columns
        * cntryid, cntryid_e
        * drop cntryid, as it is less specific
    * drop cols w/ high null %
    * drop rows w/ high null %
    * add is_null column for each category where value was null
    * find cols with mixed types and correct
    * convert numerical columns to int or float from objects
    * categorical variables
        * OHC categorical variables
        * make sure dtypes match up across train and test
    * missing data
        * experiment imputing with mean and median
            * check distribution of data; if there are big outliers, use median, if not, use mean
        * check all cols which have missing data and see if that data should be captured in a new feature
    * feature scaling
        * **NOTE: split and scale at the same time**
        * normalization --> min/max scaler
        * standardization (z-score normalization)
        * api
            * column transformer
                * numeric transformer
                    * imputer
                    * scaler
                * categorical transformer
                    * imputer
                    * OHE
    * dimensionality reduction
        * PCA
    * normalizing
        * yeo-johnson power transformer

# Baseline
* Remove columns
    * Drop high % null cols and rows
        * Threshold = 75%
    * Drop columns unrelated to performance
    * Drop target


* Split data into train, val, and test sets

In [6]:
from sklearn.model_selection import train_test_split

train = train\
    .dropna(thresh = len(train)*0.25, axis = 1)\
    .dropna(thresh = len(train.columns)*0.25, axis = 0)

X_train = train.drop(['uni','row','job_performance'], axis = 1)
y_train = train['job_performance']
X_test = test[X_train.columns]
y_test = test['job_performance']

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(19994, 273)
(19994,)
(24500, 273)
(24500,)


* Identify categorical and numerical columns

In [7]:
categorical_obj_cols = X_train.dtypes == object
cat_cols = categorical_obj_cols[categorical_obj_cols==True].index.values
num_cols = categorical_obj_cols[categorical_obj_cols==False].index.values

Dropped 6 rows and 105 columns.

* Fix columns with multiple data types

In [8]:
for col in cat_cols:
    X_train[col] = X_train[col].astype('str')
    
for col in num_cols:
    X_train[col] = X_train[col].astype('float')

### Build pipeline
    * impute nulls
    * scale numerical columns
    * OHE categorical columns
    * Try with
        * Lasso
        * Ridge
        * RandomForestRegressor
        * SVM

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score

def try_model(model, X, y):
    scores = cross_val_score(model, X, y, cv=3, scoring='neg_mean_squared_error')
    print('Average MSE = {:.2f} +/- {:.2f}'.format(-np.mean(scores), np.std(scores)))
    
    
num_transform = Pipeline(
    [
        ('num_imputer', SimpleImputer(strategy='median')),
        ('num_scaler', StandardScaler())
    ]
)

cat_transform = Pipeline(
    [
        ('cat_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ('cat_scaler', StandardScaler(with_mean=False))
    ]
)

preprocessor = ColumnTransformer(
    [
        ('num', num_transform, ~categorical_obj_cols),
        ('cat', cat_transform, categorical_obj_cols)
    ]
)
    
X_train_pr = preprocessor.fit_transform(X_train)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

pf = 

#### Lasso

In [10]:
from sklearn.linear_model import LassoCV

alphas = np.logspace(-4, -0.5, 10)
lasso = LassoCV(alphas=alphas, cv=3, max_iter=1000, n_jobs=-1, random_state=0)
lasso.fit(X_train_pr, y_train)



LassoCV(alphas=array([1.00000e-04, 2.44844e-04, 5.99484e-04, 1.46780e-03, 3.59381e-03,
       8.79923e-03, 2.15443e-02, 5.27500e-02, 1.29155e-01, 3.16228e-01]),
    copy_X=True, cv=3, eps=0.001, fit_intercept=True, max_iter=1000,
    n_alphas=100, n_jobs=-1, normalize=False, positive=False,
    precompute='auto', random_state=0, selection='cyclic', tol=0.0001,
    verbose=False)

In [23]:
lasso_alpha = lasso.alpha_
min(np.mean(lasso.mse_path_, axis=1))

122903.10071882904

#### Ridge

In [25]:
from sklearn.linear_model import RidgeCV

ridge = RidgeCV(alphas=alphas, store_cv_values=True)
ridge.fit(X_train_pr, y_train)
cv_results = ridge.cv_values_

In [36]:
np.mean(cv_results, axis=0)

array([134072.9879197 , 134070.12326468, 134063.71372956, 134051.03180588,
       134032.49684864, 134023.79114665, 134066.07291059, 134214.30534895,
       134489.98222743, 134736.66943605])

#### RF Regressor  
* Guidance on gridsearching RandomForestRegressor : https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [48]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap
              }

rfr = RandomForestRegressor()

rscv = RandomizedSearchCV(
    rfr,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
    random_state=0
)

rscv.fit(X_train_pr, y_train)

In [None]:
rscv.cv_results_

In [None]:
import xgboost as xgb

Xy_train_xgb = xgb.DMatrix(data=X_train_pr, label=y_train)

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

params = {"objective":"reg:squarederror",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=Xy_train_xgb, params=params, nfold=3, num_boost_round=50,
                    early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)

mse = cv_results["test-rmse-mean"].tail(1).values[0]**2
print('Average MSE = {:.2f}'.format(mse))

## Take 2  
* univariate statistics
* Incorporate PCA into pipeline
* Create new encoding for high cardinality categorical columns
* add missing value column for np.nan values
* Add feature interactions for numerical columns
* bootstrapping
* complex imputer
* Grid search best models from baseline
* Normalize data with yeo-johnson power transformer
* perutation importance for features? AML lec 12
* try tensor flow?
* sklearn missing indicator
* run on databricks

#### Univariate statistics

#### Truncated SVD  
* I want to try to lower the dimensionality of my data, both to increase speed of training, and hopefully to increase accuracy

In [67]:
X_train_pr.shape

(19994, 3081)

In [71]:
from sklearn.decomposition import TruncatedSVD


tsvd = TruncatedSVD(n_components=X_train_pr.shape[1]-1)
X_tsvd = tsvd.fit(X_train_pr)

In [73]:
tsvd_var_ratios = tsvd.explained_variance_ratio_

def select_n_components(var_ratio, goal_var: float) -> int:
    # Set initial variance explained so far
    total_variance = 0.0
    
    # Set initial number of features
    n_components = 0
    
    # For the explained variance of each feature:
    for explained_variance in var_ratio:
        
        # Add the explained variance to the total
        total_variance += explained_variance
        
        # Add one to the number of components
        n_components += 1
        
        # If we reach our goal level of explained variance
        if total_variance >= goal_var:
            # End the loop
            break
            
    # Return the number of components
    return n_components

select_n_components(tsvd_var_ratios, 0.99)

2116

In [74]:
tsvd = TruncatedSVD(n_components=2116, random_state=4771)
X_tsvd = tsvd.fit_transform(X_train_pr)

In [75]:
X_tsvd.shape

(19994, 2116)

In [76]:
from sklearn.linear_model import Lasso

try_model(Lasso(), X_tsvd, y_train)

Average MSE = 123329.53 +/- 1731.29


In [77]:
from sklearn.ensemble import RandomForestRegressor

try_model(RandomForestRegressor(n_estimators=10, max_depth=20), X_tsvd, y_train)

Average MSE = 135859.02 +/- 1023.35


In [78]:
import xgboost as xgb

Xy_train_xgb = xgb.DMatrix(data=X_tsvd, label=y_train)

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

params = {"objective":"reg:squarederror",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=Xy_train_xgb, params=params, nfold=3, num_boost_round=50,
                    early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)

mse = cv_results["test-rmse-mean"].tail(1).values[0]**2
print('Average MSE = {:.2f}'.format(mse))

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


Average MSE = 133011.93


In [47]:
from sklearn.decomposition import PCA

data_rescaled = StandardScaler().fit_transform(X_train)
pca = PCA().fit(data_rescaled)
#Plotting the Cumulative Summation of the Explained Variance
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Pulsar Dataset Explained Variance')
plt.show()

ValueError: could not convert string to float: 'France'

In [45]:
PCA?

[0;31mInit signature:[0m
[0mPCA[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m[[0m[0;34m'n_components=None'[0m[0;34m,[0m [0;34m'copy=True'[0m[0;34m,[0m [0;34m'whiten=False'[0m[0;34m,[0m [0;34m"svd_solver='auto'"[0m[0;34m,[0m [0;34m'tol=0.0'[0m[0;34m,[0m [0;34m"iterated_power='auto'"[0m[0;34m,[0m [0;34m'random_state=None'[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Principal component analysis (PCA)

Linear dimensionality reduction using Singular Value Decomposition of the
data to project it to a lower dimensional space.

It uses the LAPACK implementation of the full SVD or a randomized truncated
SVD by the method of Halko et al. 2009, depending on the shape of the input
data and the number of components to extract.

It can also use the scipy.sparse.linalg ARPACK implementation of the
truncated SVD.

Notice that this class does not support sparse input. See
:class:`TruncatedSVD` for a

In [10]:
# high cardinality cols
for col in cat_cols:
    if len(train[col].unique()) > 50:
        print(col)


lng_home
cnt_h
cnt_brth
reg_tl2
v272
v52
v184
v104
v135
v235
v1
v63
v87
v239
v224
v71
v105
isic2l
isic2c
isco2c
isco2l
