In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [3]:
df_tax=pd.read_excel('filtered_file_tax.xlsx')
df_contract=pd.read_excel('filtered_file_contract.xlsx')
df_dispute=pd.read_excel('filtered_file_Dispute.xlsx')
df_property=pd.read_excel('filtered_file_Property.xlsx')
df_corporate=pd.read_excel('filtered_file_Corporate.xlsx')

In [5]:
df_tax.Outcome.value_counts()
df_contract.outcome.value_counts()

outcome
0    28
1    23
Name: count, dtype: int64

In [7]:
df_dispute.outcome.value_counts()

outcome
0    16
1    11
Name: count, dtype: int64

In [9]:
import re 
import string

In [11]:
alphanumeric = lambda x: re.sub(r'\w*\d\w*', '', str(x)) if isinstance(x, str) else x
pun_lower = lambda x: re.sub(r'[%s]' % re.escape(string.punctuation), '', str(x).lower()) if isinstance(x, str) else x
remove_n = lambda x: re.sub(r'\n', '', str(x)) if isinstance(x, str) else x
remove_non_ascii = lambda x: re.sub(r'[^\x00-\x7f]', r'', str(x)) if isinstance(x, str) else x

# Apply the functions to the DataFrame columns
df_tax['Case keywords'] = (
    df_tax['Case keywords']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)
df_tax['Court Type'] = (
    df_tax['Court Type']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)

#
df_contract['Case keywords'] = (
    df_contract['Case keywords']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)
df_contract['Court Type'] = (
    df_contract['Court Type']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)

#
df_property['Case keywords'] = (
    df_property['Case keywords']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)
df_property['Court Type'] = (
    df_property['Court Type']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)

#
df_dispute['Case keywords'] = (
    df_dispute['Case keywords']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)
df_dispute['Court Type'] = (
    df_dispute['Court Type']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)

#
df_corporate['Case keywords'] = (
    df_corporate['Case keywords']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)
df_corporate['Court Type'] = (
    df_corporate['Court Type']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)

In [13]:
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [131]:
import os

def model(dataf, lbl, save_model=True, save_path="models/"):
    """
    Train multiple machine learning models and evaluate their F1 scores.
    
    Parameters:
        dataf (DataFrame): Input data containing features and labels.
        lbl (str): Name of the label column.
        save_model (bool): Whether to save the vectorizer and models. Default is True.
        save_path (str): Directory path to save models and vectorizer. Default is 'models/'.
        
    Returns:
        DataFrame: F1 scores of trained models.
    """
    # Ensure the save directory exists
    if save_model:
        os.makedirs(save_path, exist_ok=True)

    # Fill NaN values and convert columns to strings
    dataf['Case keywords'] = dataf['Case keywords'].fillna('').astype(str)
    dataf['Number of evidences'] = dataf['Number of evidences'].fillna(0).astype(str)
    dataf['Court Type'] = dataf['Court Type'].fillna('').astype(str)
    
    # Combine features
    x = dataf['Case keywords'] + " " + dataf['Number of evidences'] + " " + dataf['Court Type']
    y = dataf[lbl]
    
    # Split data into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=50)
    
    # Initialize CountVectorizer
    v = CountVectorizer()
    x_train_v = v.fit_transform(x_train.values)
    x_test_v = v.transform(x_test.values)
    
    # Initialize models
    models = {
        'LogisticReg': LogisticRegression(max_iter=1000),
        'MultinomialNB': MultinomialNB(),
        'SVC': SVC(probability=True),
        'RandomForestClassifier': RandomForestClassifier(n_estimators=100, random_state=50)
    }
    
    # Train models and compute F1 scores
    scores = {}
    for name, clf in models.items():
        clf.fit(x_train_v, y_train)
        predictions = clf.predict(x_test_v)
        scores[name] = f1_score(y_test, predictions, average='weighted')
        
        # Save models if required
        if save_model:
            joblib.dump(clf, f"{save_path}{name}.pkl")
    
    # Save the vectorizer
    if save_model:
        joblib.dump(v, f"{save_path}vectorizer.pkl")
    
    # Convert scores to a DataFrame
    df_score = pd.DataFrame(list(scores.items()), columns=['Model', 'F1 Score'])
    df_score.set_index('Model', inplace=True)
    
    return df_score


In [133]:
f1_scores = model(df_tax, lbl='Outcome', save_model=True, save_path="models/")
print(f1_scores)


                        F1 Score
Model                           
LogisticReg             0.556000
MultinomialNB           0.685714
SVC                     0.660000
RandomForestClassifier  0.620376


In [137]:
f1_scores = model(df_contract, lbl='outcome', save_model=True, save_path="models/")
print(f1_scores)

                        F1 Score
Model                           
LogisticReg             0.648485
MultinomialNB           0.655844
SVC                     0.648485
RandomForestClassifier  0.472727


In [139]:
f1_scores = model(df_dispute, lbl='outcome', save_model=True, save_path="models/")
print(f1_scores)

                        F1 Score
Model                           
LogisticReg             0.838095
MultinomialNB           0.666667
SVC                     1.000000
RandomForestClassifier  1.000000


In [143]:
f1_scores = model(df_property, lbl='outcome', save_model=True, save_path="models/")
print(f1_scores)

                        F1 Score
Model                           
LogisticReg                  0.8
MultinomialNB                0.5
SVC                          1.0
RandomForestClassifier       0.5


In [145]:
f1_scores = model(df_corporate, lbl='outcome', save_model=True, save_path="models/")
print(f1_scores)

                        F1 Score
Model                           
LogisticReg                  1.0
MultinomialNB                1.0
SVC                          1.0
RandomForestClassifier       1.0


In [147]:
X1=df_tax['Case keywords']+df_tax['Number of evidences']+df_tax['Court Type']
Y1=df_tax['Outcome']

X1_train,X1_test,Y1_train,Y1_test=train_test_split(X1,Y1,test_size=0.2,random_state=50)
v=CountVectorizer()
X1_train_v=v.fit_transform(X1_train)
X1_test_v=v.transform(X1_test)
tax_mb = MultinomialNB()
tax_mb.fit(X1_train_v,Y1_train)

In [149]:
X2=df_contract['Case keywords']+df_contract['Number of evidences']+df_contract['Court Type']
Y2=df_contract['outcome']

X2_train,X2_test,Y2_train,Y2_test=train_test_split(X2,Y2,test_size=0.2,random_state=50)
v=CountVectorizer()
X2_train_v=v.fit_transform(X2_train)
X2_test_v=v.transform(X2_test)
cont_mb = MultinomialNB()
cont_mb.fit(X2_train_v,Y2_train)

In [151]:
X3=df_dispute['Case keywords']+df_dispute['Number of evidences']+df_dispute['Court Type']
Y3=df_dispute['outcome']

X3_train,X3_test,Y3_train,Y3_test=train_test_split(X3,Y3,test_size=0.2,random_state=50)
v=CountVectorizer()
X3_train_v=v.fit_transform(X3_train)
X3_test_v=v.transform(X3_test)
dispute_svm = SVC()
dispute_svm.fit(X3_train_v,Y3_train)

In [153]:
X4=df_property['Case keywords']+df_property['Number of evidences']+df_property['Court Type']
Y4=df_property['outcome']

X4_train,X4_test,Y4_train,Y4_test=train_test_split(X4,Y4,test_size=0.2,random_state=50)
v=CountVectorizer()
X4_train_v=v.fit_transform(X4_train)
X4_test_v=v.transform(X4_test)
property_svm = SVC()
property_svm.fit(X4_train_v,Y4_train)

In [155]:
X5=df_corporate['Case keywords']+df_corporate['Number of evidences']+df_corporate['Court Type']
Y5=df_corporate['outcome']

X5_train,X5_test,Y5_train,Y5_test=train_test_split(X5,Y5,test_size=0.2,random_state=50)
v=CountVectorizer()
X5_train_v=v.fit_transform(X5_train)
X5_test_v=v.transform(X5_test)
corporate_svm = SVC()
corporate_svm.fit(X5_train_v,Y5_train)

In [113]:
pd.set_option('display.max_colwidth', None)

In [117]:
X1_test.head()

85              tax assessment second sales exemption alternate remedyMediummadras high court
87    subcontractor works contract profit vat kerala value added tax actHighkerala high court
57                            sales tax works contract taxable turnoverhighmadras high court 
52                      sales tax compounding of offences maximum penaltyLowandhra high court
50                                   sales tax exemption burden of proofHighandhra high court
dtype: object

In [159]:
# Load the vectorizer and desired model
v = joblib.load("models/vectorizer.pkl")
model = joblib.load("models/MultinomialNB.pkl")

# Transform new data
cmt1 = ["tax assessment second sales exemption alternate remedyMediummadras high court"]
cmt1_v = v.transform(cmt1)

# Predict probabilities
proba = model.predict_proba(cmt1_v)[:, 1]
print(proba)


[0.11052938]
