In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [3]:
df_tax=pd.read_excel('filtered_file_tax.xlsx')
df_contract=pd.read_excel('filtered_file_contract.xlsx')
df_dispute=pd.read_excel('filtered_file_Dispute.xlsx')
df_property=pd.read_excel('filtered_file_Property.xlsx')
df_corporate=pd.read_excel('filtered_file_Corporate.xlsx')

In [5]:
df_tax.Outcome.value_counts()
df_contract.outcome.value_counts()

outcome
0    28
1    23
Name: count, dtype: int64

In [7]:
df_dispute.outcome.value_counts()

outcome
0    16
1    11
Name: count, dtype: int64

In [9]:
import re 
import string

In [11]:
alphanumeric = lambda x: re.sub(r'\w*\d\w*', '', str(x)) if isinstance(x, str) else x
pun_lower = lambda x: re.sub(r'[%s]' % re.escape(string.punctuation), '', str(x).lower()) if isinstance(x, str) else x
remove_n = lambda x: re.sub(r'\n', '', str(x)) if isinstance(x, str) else x
remove_non_ascii = lambda x: re.sub(r'[^\x00-\x7f]', r'', str(x)) if isinstance(x, str) else x

# Apply the functions to the DataFrame columns
df_tax['Case keywords'] = (
    df_tax['Case keywords']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)
df_tax['Court Type'] = (
    df_tax['Court Type']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)

#
df_contract['Case keywords'] = (
    df_contract['Case keywords']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)
df_contract['Court Type'] = (
    df_contract['Court Type']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)

#
df_property['Case keywords'] = (
    df_property['Case keywords']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)
df_property['Court Type'] = (
    df_property['Court Type']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)

#
df_dispute['Case keywords'] = (
    df_dispute['Case keywords']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)
df_dispute['Court Type'] = (
    df_dispute['Court Type']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)

#
df_corporate['Case keywords'] = (
    df_corporate['Case keywords']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)
df_corporate['Court Type'] = (
    df_corporate['Court Type']
    .map(alphanumeric)
    .map(pun_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)

In [13]:
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [79]:
def model(dataf, lbl):
    # Fill NaN values and convert all columns to strings
    dataf['Case keywords'] = dataf['Case keywords'].fillna('').astype(str)
    dataf['Number of evidences'] = dataf['Number of evidences'].fillna(0).astype(str)
    dataf['Court Type'] = dataf['Court Type'].fillna('').astype(str)
    
    # Combine text-based features into a single feature set
    x = dataf['Case keywords'] + dataf['Number of evidences'] + dataf['Court Type']
    y = dataf[lbl]
    
    # Splitting into train and test
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=50)
    
    # Initialize CountVectorizer and transform the data
    v = CountVectorizer()
    x_train_v = v.fit_transform(x_train.values)
    x_test_v = v.transform(x_test.values)
    
    # Machine learning models
    lr = LogisticRegression()
    lr.fit(x_train_v, y_train)
    
    mnb = MultinomialNB()
    mnb.fit(x_train_v, y_train)
    
    svm = SVC()
    svm.fit(x_train_v, y_train)
    
    rfc = RandomForestClassifier()
    rfc.fit(x_train_v, y_train)
    
    # Compute F1 Scores
    f1_score_data = {
        'F1 Score': [
            f1_score(y_test, lr.predict(x_test_v), average='weighted'),
            f1_score(y_test, mnb.predict(x_test_v), average='weighted'),
            f1_score(y_test, svm.predict(x_test_v), average='weighted'),
            f1_score(y_test, rfc.predict(x_test_v), average='weighted'),
        ]
    }
    
    # Save scores in a DataFrame
    df_score = pd.DataFrame(f1_score_data, index=['LogisticReg', 'MultinomialNB', 'SVC', 'RandomForestClassifier'])
    
    return df_score

In [81]:
tax_df_cv=model(df_tax,'Outcome')
tax_df_cv

Unnamed: 0,F1 Score
LogisticReg,0.456892
MultinomialNB,0.7
SVC,0.581004
RandomForestClassifier,0.66


In [83]:
contract_df_cv=model(df_contract,'outcome')
contract_df_cv

Unnamed: 0,F1 Score
LogisticReg,0.568765
MultinomialNB,0.655844
SVC,0.648485
RandomForestClassifier,0.568765


In [85]:
dispute_df_cv=model(df_dispute,'outcome')
dispute_df_cv

Unnamed: 0,F1 Score
LogisticReg,0.838095
MultinomialNB,0.666667
SVC,1.0
RandomForestClassifier,1.0


In [87]:
property_df_cv=model(df_property,'outcome')
property_df_cv

Unnamed: 0,F1 Score
LogisticReg,0.8
MultinomialNB,0.8
SVC,1.0
RandomForestClassifier,0.8


In [89]:
corporate_df_cv=model(df_corporate,'outcome')
corporate_df_cv

Unnamed: 0,F1 Score
LogisticReg,1.0
MultinomialNB,1.0
SVC,1.0
RandomForestClassifier,1.0


In [91]:
X1=df_tax['Case keywords']+df_tax['Number of evidences']+df_tax['Court Type']
Y1=df_tax['Outcome']

X1_train,X1_test,Y1_train,Y1_test=train_test_split(X1,Y1,test_size=0.2,random_state=50)
v=CountVectorizer()
X1_train_v=v.fit_transform(X1_train)
X1_test_v=v.transform(X1_test)
tax_mb = MultinomialNB()
tax_mb.fit(X1_train_v,Y1_train)

In [93]:
X2=df_contract['Case keywords']+df_contract['Number of evidences']+df_contract['Court Type']
Y2=df_contract['outcome']

X2_train,X2_test,Y2_train,Y2_test=train_test_split(X2,Y2,test_size=0.2,random_state=50)
v=CountVectorizer()
X2_train_v=v.fit_transform(X2_train)
X2_test_v=v.transform(X2_test)
cont_mb = MultinomialNB()
cont_mb.fit(X2_train_v,Y2_train)

In [95]:
X3=df_dispute['Case keywords']+df_dispute['Number of evidences']+df_dispute['Court Type']
Y3=df_dispute['outcome']

X3_train,X3_test,Y3_train,Y3_test=train_test_split(X3,Y3,test_size=0.2,random_state=50)
v=CountVectorizer()
X3_train_v=v.fit_transform(X3_train)
X3_test_v=v.transform(X3_test)
dispute_svm = SVC()
dispute_svm.fit(X3_train_v,Y3_train)

In [97]:
X4=df_property['Case keywords']+df_property['Number of evidences']+df_property['Court Type']
Y4=df_property['outcome']

X4_train,X4_test,Y4_train,Y4_test=train_test_split(X4,Y4,test_size=0.2,random_state=50)
v=CountVectorizer()
X4_train_v=v.fit_transform(X4_train)
X4_test_v=v.transform(X4_test)
property_svm = SVC()
property_svm.fit(X4_train_v,Y4_train)

In [99]:
X5=df_corporate['Case keywords']+df_corporate['Number of evidences']+df_corporate['Court Type']
Y5=df_corporate['outcome']

X5_train,X5_test,Y5_train,Y5_test=train_test_split(X5,Y5,test_size=0.2,random_state=50)
v=CountVectorizer()
X5_train_v=v.fit_transform(X5_train)
X5_test_v=v.transform(X5_test)
corporate_svm = SVC()
corporate_svm.fit(X5_train_v,Y5_train)

In [101]:
X2_test.head()

36    public procurement tender process judicial rev...
35    arbitration and conciliation act  limitation a...
39    land acquisition contractual obligations gover...
1     memorandum of understanding mou arbitration ag...
25    stamp act arbitration agreement separability d...
dtype: object

In [113]:
pd.set_option('display.max_colwidth', None)

In [117]:
X1_test.head()

85              tax assessment second sales exemption alternate remedyMediummadras high court
87    subcontractor works contract profit vat kerala value added tax actHighkerala high court
57                            sales tax works contract taxable turnoverhighmadras high court 
52                      sales tax compounding of offences maximum penaltyLowandhra high court
50                                   sales tax exemption burden of proofHighandhra high court
dtype: object