In [3]:
# Data manipulation
import pandas as pd
import numpy as np
import csv
from zipfile import ZipFile

In [4]:
import time
import datetime
from pandas.core.common import flatten
from itertools import chain
from tqdm import tqdm
import warnings


In [5]:
# Parsing and pre-processing
import glob, re, os, sys, random
from random import shuffle

from langdetect import detect, DetectorFactory

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [6]:
# Vector representations and embeddings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim

In [7]:
# Modeling - Logistic, XGBOOST, SVM
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

from sklearn.pipeline import Pipeline, FeatureUnion



from xgboost import XGBClassifier
import pickle

In [8]:
# LSTM 
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import gc

In [9]:
# BERT models
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler
import transformers
from transformers import AutoModel, BertTokenizerFast

In [10]:
# specify GPU
device = "cuda" if torch.cuda.is_available() else "cpu"

Load data

In [77]:
df = pd.read_json(r"../../../data/processed/pre-processed_2023_03_11.json")

### Phase 1 vs. Phase 2

Oversampling minority class by balancing based on the number of cases and number of sections

In [78]:
df1 = df[df['phase2'].isin([0,1])]

In [126]:
print(df1.groupby('phase2')['case_num'].nunique(), "\n",
df1['phase2'].value_counts())

phase2
0    1485
1      98
Name: case_num, dtype: int64 
 0    5079
1     292
Name: phase2, dtype: int64


In [81]:
# Compute the number of cases per unique combination of phase2 and section_fin
case_counts = df1.groupby(['phase2', 'section_fin'])['case_num'].nunique()
print(case_counts)

phase2  section_fin              
0       Competitive Assessment       1256
        Concentration & Dimension    1467
        Market Definition             970
        Parties & Operation          1376
1       Competitive Assessment         75
        Concentration & Dimension      71
        Market Definition              81
        Parties & Operation            65
Name: case_num, dtype: int64


In [85]:
# Separate the data into features (text) and labels (phase and section_fin)
X = df1['text_clean']
y = df1[['phase2', 'section_fin']]
case_num = df1['case_num']

# Define the number of splits and the test size
n_splits = 1
test_size = 0.2

# Initialize the StratifiedShuffleSplit object
sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=42)

# Split the data into training and test sets
for train_index, test_index in sss.split(X, y):
    X_train_val, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train_val, y_test = y.iloc[train_index], y.iloc[test_index]
    case_num_train_val, case_num_test = case_num.iloc[train_index], case_num.iloc[test_index]

# Further split the training_validation set into 75% training and 25% validation sets
sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=42)
for train_index, val_index in sss.split(X_train_val, y_train_val):  # <-- Use X_train_val and y_train_val
    X_train, X_val = X_train_val.iloc[train_index], X_train_val.iloc[val_index]
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[val_index]
    case_num_train, case_num_val = case_num_train_val.iloc[train_index], case_num_train_val.iloc[val_index]

# Print the shape of each set to verify that the data has been split correctly
print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Training set shape: (3222,) (3222, 2)
Validation set shape: (1074,) (1074, 2)
Test set shape: (1075,) (1075, 2)


In [84]:
# from sklearn.model_selection import train_test_split

# # Separate the data into features (text) and labels (phase and section_fin)
# X = df1['text_clean']
# y = df1[['phase2', 'section_fin']]
# case_num = df1['case_num']

# # Split the data into train and test sets based on phase2 and section_fin
# X_train_val, X_test, y_train_val, y_test, case_num_train_val, case_num_test = train_test_split(X, y, case_num, test_size=0.2, stratify=y, random_state=42)

# # Further split the train_validation set into train and validation sets based on section_fin label distribution
# X_train, X_val, y_train, y_val, case_num_train, case_num_val = train_test_split(X_train_val, y_train_val, case_num_train_val, test_size=0.25, stratify=y_train_val, random_state=42)

# # Print the shape of each set to verify that the data has been split correctly
# print("Training set shape:", X_train.shape, y_train.shape)
# print("Validation set shape:", X_val.shape, y_val.shape)
# print("Test set shape:", X_test.shape, y_test.shape)


In [127]:
# Concatenate X_train_val, y_train_val, and case_num_train_val along axis=1
train_val_df = pd.concat([X_train_val, y_train_val, case_num_train_val], axis=1)

# Concatenate X_train, y_train, and case_num_train along axis=1
train_df = pd.concat([X_train, y_train, case_num_train], axis=1)

# Concatenate X_val, y_val, and case_num_val along axis=1
test_df = pd.concat([X_val, y_val, case_num_val], axis=1)

# Concatenate X_test, y_test, and case_num_test along axis=1
val_df = pd.concat([X_test, y_test, case_num_test], axis=1)

In [94]:
train_df.groupby(['phase2', 'section_fin'])['case_num'].nunique()

phase2  section_fin              
0       Competitive Assessment       754
        Concentration & Dimension    881
        Market Definition            581
        Parties & Operation          825
1       Competitive Assessment        45
        Concentration & Dimension     43
        Market Definition             49
        Parties & Operation           39
Name: case_num, dtype: int64

Vectorize X features in train, val, test

Oversample minority class (phase2=1) using vectorized X_train

In [124]:
from imblearn.over_sampling import RandomOverSampler

# Calculate the number of unique cases for each section_fin value in the minority class phase2=1
minority_class_data = train_df[train_df['phase2'] == 1]
unique_cases_per_section_fin = minority_class_data.groupby('section_fin')['case_num'].nunique()

# Calculate the number of unique cases for each section_fin value in the majority class phase2=0
majority_class_data = train_df[train_df['phase2'] == 0]
unique_cases_per_section_fin_majority = majority_class_data.groupby('section_fin')['case_num'].nunique()

# Calculate the minimum number of unique cases among all section_fin values
min_unique_cases = unique_cases_per_section_fin.min()

# Calculate the desired number of samples for each section_fin value in the minority class
desired_samples_per_section_fin = {}
for section_fin_value in unique_cases_per_section_fin.index:
    num_unique_cases = unique_cases_per_section_fin.loc[section_fin_value]
    desired_samples_per_section_fin[section_fin_value] = int(min_unique_cases / num_unique_cases)

# Adjust the desired number of samples based on the number of unique cases in the majority class for each section_fin value
for section_fin_value in unique_cases_per_section_fin.index:
    if section_fin_value in unique_cases_per_section_fin_majority.index:
        num_unique_cases_majority = unique_cases_per_section_fin_majority.loc[section_fin_value]
        desired_samples_per_section_fin[section_fin_value] = min(desired_samples_per_section_fin[section_fin_value], num_unique_cases_majority)

# Use RandomOverSampler to oversample the minority class based on the desired number of samples for each section_fin value
oversampler = RandomOverSampler(sampling_strategy=desired_samples_per_section_fin, random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)


TypeError: '<' not supported between instances of 'str' and 'int'

In [123]:
num_unique_cases_majority

825

In [122]:
desired_samples_per_section_fin

{'Competitive Assessment': 0,
 'Concentration & Dimension': 0,
 'Market Definition': 0,
 'Parties & Operation': 1}

In [121]:
unique_cases_per_section_fin

section_fin
Competitive Assessment       45
Concentration & Dimension    43
Market Definition            49
Parties & Operation          39
Name: case_num, dtype: int64

In [120]:
min_unique_cases

39

In [119]:
desired_samples_per_section_fin

{'Competitive Assessment': 0,
 'Concentration & Dimension': 0,
 'Market Definition': 0,
 'Parties & Operation': 1}

In [None]:
from imblearn.over_sampling import RandomOverSampler

oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)


In [None]:
from imblearn.over_sampling import RandomOverSampler

# Define the resampling strategy for each section
# Note that the ratios here are just examples and you may need to adjust them based on your specific data
sampling_strategy = {'Competitive Assessment': 0.5, 'Concentration & Dimension': 0.6, 'Market Definition': 0.8, 'Parties & Operation': 0.7}

# Create the RandomOverSampler object with the desired sampling strategy
oversampler = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)

# Resample the data
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Create a new DataFrame with the resampled data
df_resampled = pd.DataFrame({'text_clean': X_resampled, 'phase': y_resampled})

# You can now use the resampled data for training your machine learning model
