In [1]:
# Data visualization
import matplotlib.pyplot as plt

# Data manipulation
import pandas as pd
import numpy as np
import csv
from zipfile import ZipFile

from time import time
import datetime
from pandas.core.common import flatten
from itertools import chain
from tqdm import tqdm
import warnings

# Parsing and pre-processing
import glob, re, os, sys, random
from random import shuffle

from langdetect import detect, DetectorFactory

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


# Vector representations and embeddings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim

# Modeling - Logistic, XGBOOST, SVM
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

from sklearn.pipeline import Pipeline, FeatureUnion

from xgboost import XGBClassifier
import pickle

Load data

In [2]:
df = pd.read_json(r"../../../data/processed/pre-processed_2023_03_11.json")

df.drop_duplicates(subset=['article_new', 'case_num'], keep='first', inplace=True)
df=df.reset_index(drop=True)

Phase1 vs. Phase2

In [3]:
df1 = df[df['phase2'].isin([0,1])].reset_index(drop=True)

le = LabelEncoder()
df1['label'] = le.fit_transform(df1['phase2'])

Balance data
- undersample minority class
- use excluded cases as test_set

In [4]:
print(df1.groupby('label')['case_num'].nunique(), "\n",
df1['label'].value_counts())

label
0    1485
1      98
Name: case_num, dtype: int64 
 0    1485
1      98
Name: label, dtype: int64


In [5]:
# Get unique number of decisions
df_unique = df1.groupby(['label', 'case_num','article_new']).first().reset_index()[['label', 'case_num', 'article_new']]
print("Total decisions:", len(df_unique.index))
print(df_unique['label'].value_counts())
print(df_unique['article_new'].value_counts())
print(df_unique.head(1))

Total decisions: 1583
0    1485
1      98
Name: label, dtype: int64
article6(1)(b)    1284
article6(2)        201
article8(2)         57
article8(1)         32
article8(3)          9
Name: article_new, dtype: int64
   label case_num     article_new
0      0  M.10001  article6(1)(b)


In [6]:
import random

def balance(decision_id, Ytrain, random_seed):
    print('Balancing...')
    v = [i for i, val in enumerate(Ytrain) if val == 1]
    nv = [i for i, val in enumerate(Ytrain) if val == 0]

    if len(nv) < len(v):
        v = random.sample(v, len(nv))
    else:
        nv = random.sample(nv, len(v))

    indices = v + nv
    random.seed(random_seed)  # Set the random seed
    random.shuffle(indices)

    decision_id = [decision_id[i] for i in indices]
    Ytrain = [Ytrain[i] for i in indices]

    print("Total decisions:", len(decision_id))
    print("Labels distribution:", "\n", (pd.DataFrame(Ytrain)[0].value_counts()))
    return decision_id, Ytrain


In [7]:
decision_id_rus, y_train_rus = balance(df_unique['case_num'], df_unique['label'], random_seed=42)

Balancing...
Total decisions: 196
Labels distribution: 
 0    98
1    98
Name: 0, dtype: int64


In [8]:
df_balanced = df1[df1['case_num'].isin(decision_id_rus) & df1['label'].isin(y_train_rus)].reset_index(drop=True)

In [9]:
df_excluded = df1[~(df1['case_num'].isin(decision_id_rus) & df1['label'].isin(y_train_rus))]
print(df_excluded.groupby('label')['case_num'].nunique(), "\n",
df_excluded['label'].value_counts())

label
0    1386
Name: case_num, dtype: int64 
 0    1386
Name: label, dtype: int64


Split balanced data into train (80%) and test (20%)
- Match case_num and phase2 to get Train set in df1
- Use df_excluded to test the system(?)

In [10]:
# Separate the data into features (text) and labels (phase and section_fin)
X = df_balanced['text_clean']
y = df_balanced[['label', 'article_new']]
case_num = df_balanced[['case_num', 'year', 'section_fin', 'file']]

# Split the data into train and test sets based on phase2 and section_fin
X_train, X_test, y_train, y_test, case_num_train, case_num_test = train_test_split(X, y, case_num, test_size=0.2, stratify=y, random_state=42)

# Print the shape of each set to verify that the data has been split correctly
print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Training set shape: (157,) (157, 2)
Test set shape: (40,) (40, 2)


In [11]:
# Concatenate X_train, y_train, and case_num_train along axis=1
df_train = pd.concat([X_train, y_train, case_num_train], axis=1)

# Concatenate X_test, y_test, and case_num_test along axis=1
df_test = pd.concat([X_test, y_test, case_num_test], axis=1)

print(df_train.groupby('label')['case_num'].nunique(), "\n",
    "Total rows:", len(df_train.index), "\n",
    df_train['label'].value_counts())

print(df_test.groupby('label')['case_num'].nunique(), "\n",
    "Total rows:", len(df_test.index), "\n",
    df_test['label'].value_counts())

label
0    79
1    78
Name: case_num, dtype: int64 
 Total rows: 157 
 0    79
1    78
Name: label, dtype: int64
label
0    20
1    20
Name: case_num, dtype: int64 
 Total rows: 40 
 0    20
1    20
Name: label, dtype: int64


In [12]:
# Add excluded cases to df_test
df_concat = df_excluded[['text_clean', 'label', 'article_new', 'case_num', 'year', 'section_fin', 'file']]
df_test = pd.concat([df_test, df_concat], axis=0)
df_test = df_test.reset_index(drop=True)

print("Cases", df_train.groupby('label')['case_num'].nunique(), "\n",
    "Total rows:", len(df_train.index), "\n",
    "Rows", df_train['label'].value_counts())

print("Cases", df_test.groupby('label')['case_num'].nunique(), "\n",
    "Total rows:", len(df_test.index), "\n",
    "Rows", df_test['label'].value_counts())

Cases label
0    79
1    78
Name: case_num, dtype: int64 
 Total rows: 157 
 Rows 0    79
1    78
Name: label, dtype: int64
Cases label
0    1406
1      20
Name: case_num, dtype: int64 
 Total rows: 1426 
 Rows 0    1406
1      20
Name: label, dtype: int64


In [13]:
# Group df_train by 'case_num', 'article_new', 'file', and join the 'text_clean' column
df_train_grouped = df_train.groupby(['year', 'case_num', 'file', 'article_new', 'label'])['text_clean'].agg(' '.join).reset_index()
print(df_train_grouped.groupby('label')['case_num'].nunique())

label
0    79
1    78
Name: case_num, dtype: int64


In [14]:
df_train_grouped.head(3)

Unnamed: 0,year,case_num,file,article_new,label,text_clean
0,2004,M.3558,\m3558_20041217_20212_en,article6(2),0,undertaking concerned combined aggregate world...
1,2004,M.3561,\m3561_20041215_20310_en,article6(1)(b),0,introduction st eurotel effectively dt group n...
2,2005,M.3625,\m3625_20050713_20682_en,article8(1),1,celanese acetex signed arrangement agreement p...


In [15]:
# Group df_test by 'case_num', 'article_new', 'file', and join the 'text_clean' column
df_test_grouped = df_test.groupby(['year', 'article_new', 'case_num', 'file', 'label'])['text_clean'].agg(' '.join).reset_index()
print(df_test_grouped.groupby('label')['case_num'].nunique())

label
0    1406
1      20
Name: case_num, dtype: int64


In [16]:
df_test_grouped.head(3)

Unnamed: 0,year,article_new,case_num,file,label,text_clean
0,2004,article6(1)(b),M.3355,\m3355_20040615_310_en,0,transmission market assumption narrow market d...
1,2004,article6(1)(b),M.3439,\m3439_20040809_20310_en,0,proposed concentration involves acquisition so...
2,2004,article6(1)(b),M.3448,\m3448_en,0,present transaction result edp having sole con...


Baseline - Logit, SVC

In [17]:
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(df_train['text_clean'])
y_train = df_train['label']

# Transform the test data using the same vectorizer as the training data
X_test = vectorizer.transform(df_test['text_clean'])
y_test = df_test['label']

In [18]:
# Fit the logistic regression model
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predict the class labels for the test data
y_pred = clf.predict(X_test)

# Evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred) *100.0
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f_score = 2 * (precision * recall) / (precision + recall)

print(f' Accuracy: {accuracy:.2f} \n Precision: {precision:.3f} \n Recall: {recall:.3f} \n F1: {f_score:.3f}')


 Accuracy: 86.75 
 Precision: 0.067 
 Recall: 0.650 
 F1: 0.121


In [19]:
# Fit the SVC
clf = LinearSVC().fit(X_train, y_train)

# Predict the class labels for the test data
y_pred = clf.predict(X_test)

# Evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred) *100.0
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f_score = 2 * (precision * recall) / (precision + recall)

print(f' Accuracy: {accuracy:.2f} \n Precision: {precision:.3f} \n Recall: {recall:.3f} \n F1: {f_score:.3f}')

 Accuracy: 86.40 
 Precision: 0.074 
 Recall: 0.750 
 F1: 0.134


Grid Search - SVC

In [20]:
X_train = df_train['text_clean']
y_train = df_train['label']

X_test = df_test['text_clean']
y_test = df_test['label']

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a pipeline with TfidfVectorizer and LinearSVC
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='word')),
    ('clf', LinearSVC())
])

# Define the grid of hyperparameters to search over
parameters = {
	'tfidf__ngram_range': [(1,2),(1,1),(1,3)], #(1,4),(2,2),(2,3),(2,4),(3,3),(3,4),(4,4)
	#'tfidf__analyzer': ('word', 'char'),
	#'tfidf__lowercase': (True, False),
	#'tfidf__max_df': (0.01, 1.0), # ignore words that occur as more than 1% of corpus
	'tfidf__min_df': (1, 2, 3), # we need to see a word at least (once, twice, thrice) in a document
	#'tfidf__use_idf': (False, True), # use inverse document frequency weighting
	#'tfidf__sublinear_tf': (False, True),
	#'tfidf__binary': (False, True), #set term frequency binary (all non-zero terms are set to 1)
	'tfidf__norm': ('l1', 'l2'), #norm used to normalize term vectors
	#'tfidf__max_features': (None, 2000, 5000),
	#'tfidf__stop_words': (None, 'english'),

	#'tfidfchar_ngram_range': ((1,1),(1,2),(1,3),(1,4),(1,5),(1,6),(2,2),(2,3),(2,4),(2,5),(2,6),(3,3),(3,4),(3,5),(3,6),(4,4),(4,5),(4,6),(5,5),(5,6),(1,7),(2,7),(3,7),(4,7),(5,7),(6,7),(7,7)),
	
	
	'clf__C':(0.1, 1, 5) # penalty parameter for the SVM
}

# Create a GridSearchCV object with the pipeline and hyperparameters
grid_search = GridSearchCV(pipeline, parameters, cv=3, n_jobs=24, verbose=1)
t0 = time()

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
done in 108.081s
Best cross-validation score:  0.7960812772133526
Best parameters set:
	clf__C: 1
	tfidf__min_df: 2
	tfidf__ngram_range: (1, 2)
	tfidf__norm: 'l1'


In [27]:
print("done in %0.3fs" % (time() - t0))

# Print the best hyperparameters and the corresponding mean cross-validated score
print("Best cross-validation score: ", grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
	print("\t%s: %r" % (param_name, best_parameters[param_name]))

done in 138.933s
Best cross-validation score:  0.7960812772133526
Best parameters set:
	clf__C: 1
	tfidf__min_df: 2
	tfidf__ngram_range: (1, 2)
	tfidf__norm: 'l1'


In [28]:
k = sorted(parameters.keys())
vec = TfidfVectorizer()
clf = LinearSVC()
pipeline_test = Pipeline([('tfidf', vec), ('clf', clf)])
pipeline_test.set_params(**best_parameters)
print('fitting the best model')
pipeline_test.fit(X_train, y_train)

y_predict = cross_val_predict(pipeline_test, X_train, y_train, cv=3)
print('Accuracy:', accuracy_score(y_train, y_predict) )
print('\nClassification report:\n', classification_report(y_train, y_predict))
print('\nConfusion matrix:\n', confusion_matrix(y_train, y_predict), '\n\n_______________________\n\n')
accuracies = []
accuracies.append(accuracy_score(y_train, y_predict))

fitting the best model
Accuracy: 0.7961783439490446

Classification report:
               precision    recall  f1-score   support

           0       0.78      0.82      0.80        79
           1       0.81      0.77      0.79        78

    accuracy                           0.80       157
   macro avg       0.80      0.80      0.80       157
weighted avg       0.80      0.80      0.80       157


Confusion matrix:
 [[65 14]
 [18 60]] 

_______________________




In [44]:
def evaluate(Ytest, Ypredict): #evaluate the model (accuracy, precision, recall, f-score, confusion matrix)
        print('Accuracy:', accuracy_score(Ytest, Ypredict) )
        print('\nClassification report:\n', classification_report(Ytest, Ypredict))
        print('\nCR:', precision_recall_fscore_support(Ytest, Ypredict, average='macro'))
        print('\nConfusion matrix:\n', confusion_matrix(Ytest, Ypredict), '\n\n_______________________\n\n')
        
        # Evaluate the performance of the model
        accuracy = accuracy_score(Ytest, Ypredict) *100.0
        precision = precision_score(Ytest, Ypredict, average='binary')
        recall = recall_score(Ytest, Ypredict, average='binary')
        f_score = 2 * (precision * recall) / (precision + recall)

        print(f' Accuracy: {accuracy:.2f} \n Precision: {precision:.3f} \n Recall: {recall:.3f} \n F1: {f_score:.3f}')

In [45]:
def train_model_cross_val(Xtrain, Ytrain, vec, c): #Linear SVC model cross-validation
    print('***10-fold cross-validation***')
    pipeline = Pipeline([
        ('features', FeatureUnion(
            [vec],
        )),
        ('classifier', LinearSVC(C=c))
        ])
    Ypredict = cross_val_predict(pipeline, Xtrain, Ytrain, cv=10) #10-fold cross-validation
    evaluate(Ytrain, Ypredict)

In [46]:
def train_model_test(Xtrain, Ytrain, Xtest_v, Ytest_v, vec, c): #test on 'violations' test set
    pipeline = Pipeline([
        ('features', FeatureUnion([vec]
        )),
        ('classifier', LinearSVC(C=c))
        ])
    pipeline.fit(Xtrain, Ytrain)
    print('***testing on test set***')
    Ypredict = pipeline.predict(Xtest_v)
    evaluate(Ytest_v, Ypredict)

In [37]:
grid_search.best_params_

{'clf__C': 1,
 'tfidf__min_df': 2,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l1'}

In [38]:
vec = ('wordvec', TfidfVectorizer(analyzer = 'word', ngram_range = (1,2),  min_df = 2, norm = 'l1'))
c=1

In [39]:
train_model_cross_val(X_train, y_train, vec, c)

***10-fold cross-validation***
Accuracy: 0.7388535031847133

Classification report:
               precision    recall  f1-score   support

           0       0.73      0.76      0.75        79
           1       0.75      0.72      0.73        78

    accuracy                           0.74       157
   macro avg       0.74      0.74      0.74       157
weighted avg       0.74      0.74      0.74       157


CR: (0.7391869918699188, 0.738721194417397, 0.738683879348841, None)

Confusion matrix:
 [[60 19]
 [22 56]] 

_______________________




In [47]:
train_model_test(X_train, y_train, X_test, y_test, vec, c)

***testing on test set***
Accuracy: 0.7601683029453016

Classification report:
               precision    recall  f1-score   support

           0       0.99      0.76      0.86      1406
           1       0.04      0.70      0.08        20

    accuracy                           0.76      1426
   macro avg       0.52      0.73      0.47      1426
weighted avg       0.98      0.76      0.85      1426


CR: (0.5172118959107807, 0.7305120910384069, 0.4689417862665244, None)

Confusion matrix:
 [[1070  336]
 [   6   14]] 

_______________________


 Accuracy: 76.02 
 Precision: 0.040 
 Recall: 0.700 
 F1: 0.076


Get feature importance