In [200]:
# !pip install sentence_transformers
# !pip install git+https://github.com/huggingface/transformers.git
# !pip install xgboost

In [1]:
import os

# The project packages (`utils`, `models`) are imported by name further down, so the
# kernel has to run from the directory that contains them. Only step up one level
# when `utils` is not already visible: an unconditional os.chdir("..") climbs one
# more directory every time this cell is re-run, which breaks the imports below.
if not os.path.isdir("utils"):
    os.chdir("..")
print(os.listdir())

# NOTE(review): notebook_line_magic is a project helper; what it configures is not
# visible from this notebook (it printed "Line Magic Set" when run) - confirm in utils/utils.
from utils.utils import notebook_line_magic
notebook_line_magic()

['.DS_Store', 'utils', 'models', '__pycache__', 'README.md', '.gitignore', 'figures', 'app.py', '.git', 'data', 'notebooks']
Line Magic Set


### Data Exploration 
- The NLP dataset has vectors of inconsistent lengths.
- Couple of options (among many):
- Since each integer represents a token, each vector can be padded with an unused integer so that all vectors have a consistent length -> then do PCA if needed to reduce dimensions.
- Use a pretrained encoder to generate embeddings.


## data processing
- manually add padding and use sentence transformer to generate embeddings
- use auto tokenize with padding
- manually add padding and then do tf-idf

## modeling ideas
- create a dataloader with a defined batch size
- use the dataloader to create embeddings and then merge with the UniqueID
- combine embeddings with other features
- build an XGBoost model and a custom torch fully connected network
- you can also get fancy by concatenating the BERT model above with a torch classifier
- a kNN classifier might also work well with features and embeddings

In [25]:
# add padding to the tokens
import ast
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from collections import defaultdict



from models.pca import get_pca_components
from models.xgboost_train import train, get_trained_model
from models.xgboost_train import search_best_params
from utils.data_utils import (
    get_data_df, get_padded_tokens, generate_embeddings_from_tokens
)
from utils.utils import (
    plot_class_distribution, plot_pca_components_and_variance, generate_pair_plots,
    generate_correlation_plot, generate_2d_pca_distribution_plot, generate_3d_pca_distribution_plot
)

In [15]:
# Location of the pre-computed NLP embeddings for the selected split.
data_type = 'train'
data_path = Path('data')
embeddings_file = f'x_{data_type}_nlp_embeddings.csv'
embeddings_file_path = data_path / data_type / embeddings_file

# The CSV carries an 'Unnamed: 0' column (presumably the index written when the
# file was saved - confirm); it is discarded on load.
embeddings_df = pd.read_csv(embeddings_file_path).drop(columns=['Unnamed: 0'])

# Combine the embeddings with the other features for modeling: inner-join on
# UniqueID, index by UniqueID, then drop the 'nlp_feature_vector' column.
data_df = (
    get_data_df(data_type=data_type)
    .merge(embeddings_df, on='UniqueID', how='inner')
    .set_index('UniqueID')
    .drop(columns=['nlp_feature_vector'])
)

# Label column, and model inputs (every column except 'Target').
y = data_df['Target']
x = data_df[[col for col in data_df.columns if col != 'Target']]

In [43]:
# data_df.info(verbose=True)

# Analysis and Feature Engineering
- Look for multicollinearity
- Distribution of Features which are not embeddings
- Target distribution
- Probably best to start with normalizing them between 0-1

In [55]:
# The four non-embedding features plus the label; 'Target' is kept alongside them
# so it can be passed as the `hue` of the pair plots.
# NOTE(review): `features` is a column selection taken from data_df and is assigned
# to later in the notebook; an explicit `.copy()` here would make it independent
# of data_df - confirm and consider adding.
features = data_df[['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Target']]
# generate_pair_plots(
#   data=features,
#   hue='Target'  
# )

In [28]:
# Compute correlation matrix
# generate_correlation_plot(
#     df=features[['Feature1', 'Feature2', 'Feature3', 'Feature4']]
# )

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer

# Columns to standardize. 'Target' is deliberately left out so it stays usable as
# the hue of the pair plot.
scaled_columns = [f'Feature{i}' for i in [1, 2, 3, 4]]

# `features` was created by selecting columns from data_df. Work on an explicit copy
# rather than silencing pandas' chained-assignment warning for the whole session
# (the previous `pd.options.mode.chained_assignment = None`), which would also hide
# genuine mistakes in later cells.
features = features.copy()

# Fit one scaler on all four columns at once. StandardScaler standardizes each
# column independently, so the values match the former per-column loop, but the
# fitted `scaler` now holds the statistics of every feature (the loop left it
# fitted on Feature4 only) and can be reused to transform other splits.
# NOTE(review): the notes above mention scaling to 0-1; StandardScaler yields zero
# mean / unit variance instead - MinMaxScaler would give a 0-1 range. Confirm intent.
scaler = StandardScaler()
features[scaled_columns] = scaler.fit_transform(features[scaled_columns])

In [None]:
# Pair plot of the features after scaling, colored by 'Target'.
# NOTE(review): `path` presumably is where the helper saves the figure - confirm in utils/utils.
generate_pair_plots(
    data=features,
    hue='Target',
    path='figures/feature_pairplot_after_scaling.pdf'
)

In [123]:
# for c in [2,100, 200, 300, 500]:
#     mod, z, z0 = get_pca_components(data=x, num_components=c)
#     plot_pca_components_and_variance(mod, save_fig=True)
# plot_class_distribution(target=y, save_fig=True)

- Based on the PCA variance plots above, it seems that keeping more features will continue to boost performance,
- so it may be better not to reduce dimensionality when fitting the model.
- A deep learning model may work well compared to a tree-based model, but it could also overfit the data; experiment to find out.

In [151]:
# Run PCA on the full feature matrix `x` with 3 components; `z0` feeds the 2D/3D
# distribution plots further down.
# NOTE(review): `mod` is presumably the fitted PCA object (it is what
# plot_pca_components_and_variance receives in the commented-out cell above); how `z`
# and `z0` differ is not visible from this notebook - confirm in models/pca.
mod, z, z0 = get_pca_components(data=x, num_components=3)

PCA Components Shape: (25000, 3)


In [None]:
# Plot the PCA output `z0` against the labels `y`, first in 2D and then in 3D.
# NOTE(review): presumably points are colored by class - confirm in utils/utils.
generate_2d_pca_distribution_plot(
    components=z0,
    target=y
)
generate_3d_pca_distribution_plot(
    components=z0,
    target=y
)

In [18]:
# The hyper-parameter search is left disabled; only the training entry point is run.
# search_best_params()
# NOTE(review): train() receives no arguments here, so it presumably loads its own
# data and parameters - confirm in models/xgboost_train.
train()

In [20]:
# Retrieve the trained model via the helper from models.xgboost_train.
# NOTE(review): presumably this loads the model that train() persisted - confirm.
model = get_trained_model()

In [29]:
# model
# y

1. Train with best params
2. Predict on validation set and add Unique IDs
3. Add docstrings to all functions
4. Set up a predict script
5. Set up train and predict routes