# Ridge Classifier Regression

In [17]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from secom import data

%matplotlib inline

# Relative path to the data directory handed to the loaders.
DATA_PATH: str = "../../data"
# Seed intended for any step that relies on random number generation.
RANDOM_STATE: int = 0

# Load Data and merge dataframes


In [18]:
# Load the three raw sources. Each variable is named after the loader that
# produces it, so it is clear which frame holds labels and which holds features.
vendor_data = data.load_vendor_json(DATA_PATH, feature_engineer=True)
secom_features = data.load_secom_features(DATA_PATH)
secom_labels = data.load_secom_labels(DATA_PATH, feature_engineer=True, human_labels=False)

# Merge into a single frame: label frame first, then the feature frame, then
# the vendor data.
df = data.combine_data_sources(secom_labels, secom_features, vendor_data)

In [19]:
# Dimensions of the combined frame as (rows, columns).
df.shape

(1567, 636)

In [20]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,s_label_target,s_label_datetime_ordinal_eng,s_label_month__1,s_label_month__2,s_label_month__3,s_label_month__4,s_label_month__5,s_label_month__6,s_label_month__7,s_label_month__8,...,json_sop_vendor_bbb,json_sop_vendor_ccc,json_sop_vendor_ddd,json_sop_vendor_eee,json_sop_vendor_fff,json_sop_vendor_ggg,json_sop_vendor_hhh,json_sop_vendor_iii,json_sop_vendor_jjj,json_sop_vendor_kkk
0,1,733242,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
1,1,733242,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,733242,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,1,733242,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,1,733242,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [21]:
# Separate the feature matrix (everything except the target column) from the
# target vector.
X = df.drop(columns=["s_label_target"])
Y = df["s_label_target"]

In [22]:
# Preview the feature matrix; the target column should no longer be present.
X.head()

Unnamed: 0,s_label_datetime_ordinal_eng,s_label_month__1,s_label_month__2,s_label_month__3,s_label_month__4,s_label_month__5,s_label_month__6,s_label_month__7,s_label_month__8,s_label_month__9,...,json_sop_vendor_bbb,json_sop_vendor_ccc,json_sop_vendor_ddd,json_sop_vendor_eee,json_sop_vendor_fff,json_sop_vendor_ggg,json_sop_vendor_hhh,json_sop_vendor_iii,json_sop_vendor_jjj,json_sop_vendor_kkk
0,733242,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,733242,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,733242,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,733242,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,733242,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [23]:
# Preview the target vector.
Y.head()

0    1
1    1
2    0
3    1
4    1
Name: s_label_target, dtype: int64

## Fit pipeline
### Impute Missing Values
Use median imputation because it's straightforward.

### RidgeClassifier
Using RidgeClassifier since linear models are easy to interpret and fast to fit, and ridge (L2) regularization shrinks the coefficients toward zero.

In [24]:
# Median-impute missing values inside the pipeline so that, during cross
# validation, the imputer is fit on each training fold only.
imputer = SimpleImputer(strategy="median")
# NOTE: `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2; on
# newer versions drop it and add a scaling step to the pipeline instead.
clf = linear_model.RidgeClassifier(class_weight="balanced", normalize=True)
ridge_regression = Pipeline([('imputer', imputer), ('ridge_classifier', clf)])

# Perform Cross Validation
# Seed the shuffle so the fold assignment, and therefore the scores, are
# reproducible across kernel restarts.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
# NOTE(review): f1_score reports F1 for the class labelled 1 by default --
# confirm that is the class of interest.
cross_val_score(ridge_regression, X, Y, cv=cv, scoring=make_scorer(f1_score))

array([0.9483066 , 0.94117647, 0.93023256, 0.96126761, 0.95971979])

# Fit on all data

In [25]:
# Refit the whole pipeline on every available sample.
ridge_regression.fit(X, Y)

Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)), ('ridge_classifier', RidgeClassifier(alpha=1.0, class_weight='balanced', copy_X=True,
        fit_intercept=True, max_iter=None, normalize=True,
        random_state=None, solver='auto', tol=0.001))])

In [26]:
# Pull the classifier step out of the fitted pipeline and read its coefficients.
fitted_classifier = ridge_regression.named_steps["ridge_classifier"]
coefficients = fitted_classifier.coef_

In [27]:
# Label the first row of the coefficient array with the feature names, then
# show the weights ordered from most negative to most positive.
coef_weights = pd.Series(data=coefficients[0], index=X.columns)
coef_weights.sort_values(ascending=True)

s_data_95    -101.770678
s_data_358    -71.604245
s_data_100    -21.299861
s_data_357    -17.908472
s_data_544     -9.262575
s_data_131     -9.048956
s_data_103     -7.212395
s_data_365     -6.668734
s_data_219     -6.084571
s_data_114     -4.999415
s_data_582     -4.896260
s_data_239     -3.905104
s_data_56      -3.696351
s_data_542     -2.631987
s_data_238     -2.439884
s_data_588     -2.380174
s_data_349     -2.261532
s_data_387     -2.174967
s_data_93      -2.084466
s_data_579     -1.452633
s_data_587     -1.309901
s_data_348     -1.282890
s_data_580     -1.142692
s_data_227     -1.139098
s_data_81      -0.944942
s_data_211     -0.882871
s_data_75      -0.872083
s_data_385     -0.861418
s_data_366     -0.795222
s_data_57      -0.706148
                 ...    
s_data_157      0.730131
s_data_353      0.765000
s_data_11       0.771635
s_data_116      0.791781
s_data_352      0.913589
s_data_119      0.922499
s_data_109      0.949652
s_data_105      1.034669
s_data_146      1.038410


In [28]:
# Look up the fitted weight of a single feature by its column name.
coef_weights.loc["json_sil_vendor_eee"]

-0.5259685087527045