In [None]:
import sys; sys.path.append('../../')
import numpy as np
import pandas as pd
import xgboost as xgb
import statsmodels.api as sm
import matplotlib.pyplot as plt
import statsmodels.graphics.api as smg
from warnings import filterwarnings
from scipy import stats
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from analysis.constants import VOCAB_SIZE
from graphs.utils import plot_morris_method_graph
from analysis.utils import get_sa_problem
from SALib.sample.morris import sample
from SALib.analyze.morris import analyze
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# NOTE(review): with no category or module filter this silences every Python warning
# for the rest of the session, so any warning raised by the cells below is hidden.
filterwarnings('ignore')

# Data Initialization

In [None]:
# Input CSV locations, relative to the current working directory.
# Forward slashes are used because they resolve on both POSIX and Windows, whereas
# backslash-separated paths are only valid on Windows.
DATASET_PROPERTIES_PATH = '../../results/dataset_properties.csv'
IMPURITY_DIFF_PATH = '../../results/impurity_differences.csv'

In [None]:
# Read the dataset-properties CSV into a frame and show it as the cell output.
dataset_properties_df = pd.read_csv(filepath_or_buffer=DATASET_PROPERTIES_PATH)
dataset_properties_df

In [None]:
# Read the impurity-differences CSV into a frame and preview its first rows.
impurity_differences_df = pd.read_csv(filepath_or_buffer=IMPURITY_DIFF_PATH)
impurity_differences_df.head()

In [None]:
# Join the impurity differences with the dataset properties on the shared `name` key.
data_df = impurity_differences_df.merge(dataset_properties_df, on='name')

# Shuffle the rows with a fixed seed. The cross-validation calls further down use
# cv=10 without shuffling, so their folds depend on this row order; seeding the
# shuffle makes every Restart & Run All produce the same folds and scores.
data_df = data_df.sample(frac=1, random_state=42)

# Features: every column except the join key and the target.
X_df = data_df.drop(columns=['name', 'impurity_mean_diff'])
# Target, kept as a one-column DataFrame.
y_df = data_df[['impurity_mean_diff']]

# Data Plots

In [None]:
# Correlation between the three clustering-quality scores and a custom score defined
# as inter-cluster spread divided by intra-cluster spread.
score_columns = ['silhouette_score', 'calinski_harabasz_score', 'davies_bouldin_score']
custom_score = (dataset_properties_df['inter_cluster_spread']
                / dataset_properties_df['intra_cluster_spread'])

# .assign() returns a new frame, so nothing is written into a slice of
# dataset_properties_df (the previous column assignment raised SettingWithCopyWarning).
corr_scores_df = dataset_properties_df[score_columns].assign(custom_score=custom_score)

# Keep only rows with a finite custom score. A zero intra-cluster spread produces
# +/-inf, and 0/0 produces NaN; a NaN left in the column would make every
# correlation that involves custom_score NaN.
corr_scores_df = corr_scores_df[np.isfinite(corr_scores_df['custom_score'])]

corr_scores_matrix = np.corrcoef(corr_scores_df.T)
smg.plot_corr(corr_scores_matrix, corr_scores_df.columns)
plt.show()

In [None]:
# Correlation heat map over all feature columns plus the target (appended last).
features_and_target = [X_df, y_df]
corr_df = pd.concat(features_and_target, axis=1).reset_index(drop=True)

# rowvar=False treats each column as a variable, which is the same as
# correlating the rows of the transposed frame.
corr_matrix_base = np.corrcoef(corr_df.to_numpy(), rowvar=False)

smg.plot_corr(corr_matrix_base, xnames=corr_df.columns)
plt.show()

# Sensitivity Analysis

In [None]:
# Fit a gradient-boosted regressor on the full data set; `bst` is the model whose
# predictions are screened in the sensitivity analysis below.
num_round = 50
param = dict(max_depth=6, objective='reg:squarederror')

X_train_dmatrix = xgb.DMatrix(X_df, label=y_df)
bst = xgb.train(params=param, dtrain=X_train_dmatrix, num_boost_round=num_round)

In [None]:
# Morris elementary-effects screening of the fitted booster `bst`.
#
# The number of grid levels has to be the same for sampling and for analysis: the
# analysis derives the step size (delta) from num_levels, so analysing a 4-level
# sample as if it had 5 levels mis-scales every elementary effect. Both calls now
# read the same constant.
MORRIS_NUM_LEVELS = 4
MORRIS_NUM_TRAJECTORIES = 1000

problem = get_sa_problem(X_df)
inputs = sample(problem, MORRIS_NUM_TRAJECTORIES, num_levels=MORRIS_NUM_LEVELS)

# Evaluate the booster on the sampled points, reusing the training column names.
X_morris_df = pd.DataFrame(inputs, columns=X_df.columns)
X_morris_dmatrix = xgb.DMatrix(X_morris_df)
# Named separately from `results`, which the OLS section uses for the fitted model.
morris_predictions = bst.predict(X_morris_dmatrix)

sensitivity_indices = analyze(
    problem,
    inputs,
    morris_predictions,
    conf_level=0.95,
    num_levels=MORRIS_NUM_LEVELS,
)

plot_morris_method_graph(sensitivity_indices, 'Impurity Differences')
plt.show()

In [None]:
# 10-fold cross-validated negative MSE for a booster configured like the one trained
# for the sensitivity analysis (depth 6, 50 boosting rounds).
# The scikit-learn wrapper takes the number of rounds as `n_estimators`; `num_round`
# is not one of its parameters, so passing it did not limit the model to 50 rounds.
bst_model = xgb.XGBRegressor(max_depth=6, n_estimators=50)
bst_scores = cross_val_score(bst_model, X_df, y_df, cv=10, scoring='neg_mean_squared_error')
bst_scores.mean()

# Lasso

In [None]:
# Standardise the features, then fit a Lasso whose regularisation strength is tuned
# by randomized search.
pipe = make_pipeline(StandardScaler(), Lasso())

# alpha is drawn log-uniformly from [1e-4, 2].
param_dist = {'lasso__alpha': loguniform(1e-4, 2)}

# random_state fixes the sampled alpha candidates, so best_params_ (and the
# coefficient table built from the best estimator) is the same on every run.
random_search = RandomizedSearchCV(pipe, scoring = 'neg_mean_squared_error',
                                   param_distributions=param_dist,
                                   n_iter=1000, cv=10, verbose=True,
                                   random_state=42)

random_search.fit(X_df, y_df)
random_search.best_params_

In [None]:
# Tabulate the coefficients of the Lasso step of the best pipeline found by the
# search, one row per feature column of X_df.
lasso_step = random_search.best_estimator_.named_steps['lasso']

coef_df = pd.DataFrame({
    'coef_names': X_df.columns,
    'coef_values': lasso_step.coef_,
})
coef_df

# OLS

In [None]:
# Design matrix for the OLS model: the two base features, an explicit intercept
# column, the squared entropy term and the interaction of the two base features.
base_terms = ['provided_labels_count', 'token_count_avg_entropy_a1']
X_linear_df = X_df[base_terms].assign(intercept=1)

labels_count = X_linear_df['provided_labels_count']
avg_entropy = X_linear_df['token_count_avg_entropy_a1']
X_linear_df = X_linear_df.assign(**{
    'token_count_avg_entropy_a1^2': avg_entropy.pow(2),
    'provided_labels_count*token_count_avg_entropy_a1': labels_count.mul(avg_entropy),
})

# Fit by ordinary least squares and show the regression summary.
ols = sm.OLS(endog=y_df, exog=X_linear_df)
results = ols.fit()
results.summary()

# Validity Checks

In [None]:
# Variance inflation factors for the two base regressors.
# statsmodels' variance_inflation_factor regresses one column on the others exactly
# as given and does not add a constant itself, so the intercept column is included in
# the design matrix; without it the reported values are uncentered and inflated.
# The VIF of the intercept column itself is not reported.
vif_features = ['provided_labels_count', 'token_count_avg_entropy_a1']
X_linear_vif_df = X_linear_df[['intercept'] + vif_features]

vif_df = pd.DataFrame({
    'VIF Factor': [
        vif(X_linear_vif_df.values, X_linear_vif_df.columns.get_loc(feature))
        for feature in vif_features
    ],
    'features': vif_features,
})
vif_df

In [None]:
# Normality check of the OLS residuals: Q-Q plot against a standardized reference
# line, followed by the Shapiro-Wilk test.
sm.qqplot(results.resid, line='s')
plt.grid()
# Render the figure here; the previous bare plt.plot() call drew nothing.
plt.show()

# Index 1 of the Shapiro-Wilk result is the p-value. Taking it by index avoids
# assigning to `_`, which IPython uses for the last cell output.
shapiro_p_value = stats.shapiro(results.resid)[1]
print('Shapiro-Wilk test p-value: {}'.format(shapiro_p_value))

In [None]:
# Residuals against fitted values, to look for non-constant variance or leftover
# structure. The fitted values are used on the x-axis because OLS residuals are
# correlated with the observed response by construction, so a plot against the
# response shows a trend even when the model is adequate.
plt.scatter(results.fittedvalues, results.resid)
plt.xlabel('Fitted value')
plt.ylabel('Residual')
plt.grid()
plt.show()