# Ridge Classifier Regression

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from secom import data

%matplotlib inline

DATA_PATH = "../../data"
RANDOM_STATE = 0

# Load Data and merge dataframes

In [2]:
vendor_data  = data.load_vendor_json(DATA_PATH, feature_engineer=True)
secom_labels = data.load_secom_features(DATA_PATH)
secom_data = data.load_secom_labels(DATA_PATH, feature_engineer=True, human_labels=False)

df = data.combine_data_sources(secom_data, secom_labels, vendor_data)

In [3]:
df.shape

(1567, 636)

In [4]:
df.head()

Unnamed: 0,s_label_target,s_label_datetime_ordinal_eng,s_label_month__1,s_label_month__2,s_label_month__3,s_label_month__4,s_label_month__5,s_label_month__6,s_label_month__7,s_label_month__8,...,json_sop_vendor_bbb,json_sop_vendor_ccc,json_sop_vendor_ddd,json_sop_vendor_eee,json_sop_vendor_fff,json_sop_vendor_ggg,json_sop_vendor_hhh,json_sop_vendor_iii,json_sop_vendor_jjj,json_sop_vendor_kkk
0,1,733242,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
1,1,733242,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,733242,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,1,733242,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,1,733242,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
Y = df["s_label_target"]
X = df.drop("s_label_target", axis=1)

In [6]:
X.head()

Unnamed: 0,s_label_datetime_ordinal_eng,s_label_month__1,s_label_month__2,s_label_month__3,s_label_month__4,s_label_month__5,s_label_month__6,s_label_month__7,s_label_month__8,s_label_month__9,...,json_sop_vendor_bbb,json_sop_vendor_ccc,json_sop_vendor_ddd,json_sop_vendor_eee,json_sop_vendor_fff,json_sop_vendor_ggg,json_sop_vendor_hhh,json_sop_vendor_iii,json_sop_vendor_jjj,json_sop_vendor_kkk
0,733242,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,733242,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,733242,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,733242,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,733242,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
Y.head()

0    1
1    1
2    0
3    1
4    1
Name: s_label_target, dtype: int64

## Fit pipeline
### Impute Missing Values
Use median imputation because its straightforward

### RidgeClassifier
Using RidgeClassifer since linear models are easy to interpret, fit fast, and ridge regressions act like they have coefficient reduction

In [8]:
imputer = SimpleImputer(strategy="median")
clf = linear_model.RidgeClassifier(class_weight="balanced", normalize=True)
ridge_regression = Pipeline([('imputer', imputer), ('ridge_classifier', clf)])

# Perform Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True)
cross_val_score(ridge_regression, X, Y, cv=cv, scoring=make_scorer(f1_score))

array([0.94956522, 0.95774648, 0.94867257, 0.94285714, 0.93971631])

# Fit on all data

In [11]:
ridge_regression.fit(X,Y)
pred = ridge_regression.predict(X)

## Evaluate results

In [12]:
confusion_matrix(Y, pred)

array([[ 102,    2],
       [ 109, 1354]])

In [13]:
coefficients = ridge_regression.named_steps["ridge_classifier"].coef_

In [14]:
coef_weights= pd.Series(coefficients[0], index=X.columns)
abs(coef_weights).sort_values(ascending=False)

s_data_95     101.770678
s_data_358     71.604245
s_data_94      46.956075
s_data_278     32.078419
s_data_101     31.979985
s_data_100     21.299861
s_data_357     17.908472
s_data_104     14.515590
s_data_85      13.051465
s_data_143     12.464130
s_data_220     11.980681
s_data_544      9.262575
s_data_131      9.048956
s_data_543      8.666410
s_data_103      7.212395
s_data_365      6.668734
s_data_219      6.084571
s_data_368      6.006289
s_data_367      5.796082
s_data_114      4.999415
s_data_582      4.896260
s_data_281      4.139400
s_data_239      3.905104
s_data_56       3.696351
s_data_84       2.969101
s_data_280      2.954288
s_data_542      2.631987
s_data_377      2.500739
s_data_238      2.439884
s_data_588      2.380174
                 ...    
s_data_462      0.000000
s_data_463      0.000000
s_data_464      0.000000
s_data_465      0.000000
s_data_466      0.000000
s_data_243      0.000000
s_data_97       0.000000
s_data_284      0.000000
s_data_276      0.000000


In [None]:
coef_weights["json_sil_vendor_eee"]