In [1]:
'''
Use an Elastic Net to pre-filter the feature columns, then fit a zero-inflated Poisson model for the count target Y (0-100).
Reference: https://timeseriesreasoning.com/contents/zero-inflated-poisson-regression-model/
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import warnings

from tqdm import tqdm
from sklearn.linear_model import ElasticNet, Lasso, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:

# --- Notebook-wide settings ---
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides any optimizer or deprecation warnings raised
# further down — confirm that is intended.
warnings.filterwarnings("ignore")
plotting = True

# --- Load the raw table ---
# The first CSV column is discarded (presumably a saved row index — TODO confirm).
data = pd.read_csv('data/data_0802.csv', encoding='utf-8')
data = data.drop(columns=data.columns[0])

# Every column except the target is treated as a predictor.
target_column = '已填寫問卷數量'
X_columns = [name for name in data.columns if name != target_column]

# --- Target: shift the raw count down by one and force plain Python ints ---
Y_data = [int(count - 1) for count in data[target_column].tolist()]

# --- Predictors: rescale each column with MinMaxScaler, keeping the column labels ---
X_data = data[X_columns]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X_data)
X_data_scaled = pd.DataFrame(scaled_values, columns=X_data.columns)

# --- Elastic Net as a feature filter ---
# NOTE(review): the scaler and the Elastic Net are both fitted on all rows,
# before the train/test split below, so the held-out rows influence the
# scaling and the feature selection — confirm this is acceptable.
enet = ElasticNet(alpha=0.01, l1_ratio=0.1, random_state=42).fit(X_data_scaled, Y_data)

# Keep only the columns whose fitted coefficient is non-zero (original column order).
selected_features = [
    name for name, coef in zip(X_data_scaled.columns, enet.coef_) if coef != 0
]
X_enet = X_data_scaled.loc[:, selected_features]

# --- Hold out 20% of the rows for testing (fixed seed) ---
X_train, X_test, y_train, y_test = train_test_split(
    X_enet,
    Y_data,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Refit a zero-inflated Poisson (ZIP) model on each top-ranked feature subset
# and record, per subset, the fit summary and the terms significant at the
# chosen level.
significance_level = 0.05  # p-value cut-off used to call a term significant

top_perf = pd.read_csv('data/seq_sel_top_10.csv', encoding='utf-8')
summary_list, feat_sel_list = [], []

# The second CSV column holds each subset as text: one wrapping character on
# each side around comma-separated integer column positions.
# `.iloc[:, 1]` selects it explicitly by position; integer-key lookup on a
# label-indexed row Series (`row[1]`) is deprecated in pandas.
for raw_indices in top_perf.iloc[:, 1]:
    # int() tolerates surrounding whitespace, so splitting on ',' accepts both
    # "1, 2" and "1,2"; empty tokens (e.g. from a trailing comma) are skipped.
    sel_feat = [int(token) for token in raw_indices[1:-1].split(',') if token.strip()]

    # Positions index into the columns of X_train, so they are only valid for
    # the column order produced by the feature-selection step.
    X_subset = X_train.iloc[:, sel_feat]

    # ZIP training: the same columns drive both the count part (exog) and the
    # logit zero-inflation part (exog_infl).
    # NOTE(review): exog is passed as-is, without sm.add_constant — confirm
    # that omitting the intercept is intended.
    zip_training_results = sm.ZeroInflatedPoisson(
        endog=y_train,
        exog=X_subset,
        exog_infl=X_subset,
        inflation='logit',
    ).fit(maxiter=1000)  # raised iteration cap to give the optimizer room to converge

    # Keep the summary; previously it was computed and then discarded, leaving
    # summary_list empty.
    smy = zip_training_results.summary()
    summary_list.append(smy)

    # Evaluate by p-values: names and p-values are paired positionally, and
    # the names include the 'inflate_'-prefixed zero-inflation terms.
    p_value = zip_training_results.pvalues.tolist()
    exog_names = zip_training_results.model.exog_names

    feats = [e_name for e_name, pv in zip(exog_names, p_value) if pv < significance_level]
    feat_sel_list.append(feats)

Optimization terminated successfully.
         Current function value: 6.242181
         Iterations: 149
         Function evaluations: 151
         Gradient evaluations: 151
Optimization terminated successfully.
         Current function value: 6.034766
         Iterations: 174
         Function evaluations: 176
         Gradient evaluations: 176
Optimization terminated successfully.
         Current function value: 6.132376
         Iterations: 79
         Function evaluations: 81
         Gradient evaluations: 81
Optimization terminated successfully.
         Current function value: 6.049704
         Iterations: 110
         Function evaluations: 112
         Gradient evaluations: 112
Optimization terminated successfully.
         Current function value: 6.005973
         Iterations: 137
         Function evaluations: 139
         Gradient evaluations: 139
Optimization terminated successfully.
         Current function value: 6.034766
         Iterations: 174
         Function evalu

In [8]:
# Show the significant terms kept for each evaluated subset, numbered from 0
# in the order the subsets were fitted.
for position, significant_terms in enumerate(feat_sel_list):
    print(f"{position}. \n {significant_terms}\n")

0. 
 ['inflate_年齡區間.x', 'inflate_教育階段.x', 'inflate_性別.x', 'inflate_城市_2', 'inflate_城市_14', 'inflate_政治傾向_10', '年齡區間.x', '教育階段.x', '性別.x', 'X70', '城市_9', '職業.x_8', '城市_14', '政治傾向_1', '政治傾向_10', '城市_6', '城市_12']

1. 
 ['inflate_年齡區間.x', 'inflate_教育階段.x', 'inflate_性別.x', 'inflate_X35', 'inflate_X48', 'inflate_城市_14', 'inflate_X53', 'inflate_城市_8', 'inflate_政治傾向_10', 'inflate_籍貫_6', '年齡區間.x', '教育階段.x', '性別.x', '婚姻狀態', 'X32', 'X70', 'X54', 'X48', '城市_14', 'X4', '城市_5', 'X53', '城市_8', 'X56', '城市_16', '籍貫_6', '城市_10']

2. 
 ['inflate_年齡區間.x', 'inflate_教育階段.x', 'inflate_性別.x', 'inflate_X35', 'inflate_城市_2', 'inflate_X53', '年齡區間.x', '教育階段.x', '性別.x', '婚姻狀態', 'X35', 'X48', 'X70', 'X4', 'X53']

3. 
 ['inflate_年齡區間.x', 'inflate_性別.x', 'inflate_X35', 'inflate_X53', 'inflate_職業.x_1', 'inflate_城市_8', 'inflate_城市_14', 'inflate_X14', '年齡區間.x', '教育階段.x', '性別.x', '婚姻狀態', 'X32', 'X53', 'X70', '城市_8', '城市_14', 'X4', 'X14']

4. 
 ['inflate_年齡區間.x', 'inflate_教育階段.x', 'inflate_性別.x', 'inflate_X35', 'inflate_X