In [1]:
## Installing (when missing) and loading libraries
# require() only attaches a package that is already installed, so a package
# that had to be installed on the fly must still be attached afterwards.
# library() is therefore called for every package, and it fails loudly if the
# installation did not succeed.
for (pkg in c("DescTools", "multcompView", "rcompanion")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

Loading required package: DescTools
Loading required package: multcompView
Loading required package: rcompanion


## Fisher's exact test of independence between the groups

In [2]:
# Win/loss counts per representation group, embedded as whitespace-delimited
# text. The first line is the header and the first column holds the group name.
Input <- ("
Group               Win   Loss
Group_1_Pro_Se	1022	587
Group_2_One_Non_Attorney	1528	623
Group_3_One_Attorney	5247	2100
Group_4_Multiple_Representatives	142	94
")
# read.table(text = ...) opens and closes its own connection. Passing a
# hand-made textConnection() leaves that connection open until the garbage
# collector reclaims it with a warning.
# row.names = 1 turns the Group column into row names, so the resulting matrix
# has the groups as rows and the Win/Loss counts as columns.
Matrix <- as.matrix(read.table(text = Input,
                               header = TRUE,
                               row.names = 1))



In [3]:
# Two-sided Fisher test of independence between representation group (rows)
# and outcome (columns) on the Win/Loss table, with the hybrid option enabled.
fisher.test(
  x = Matrix,
  hybrid = TRUE,
  alternative = "two.sided"
)


	Fisher's Exact Test for Count Data

data:  Matrix
p-value = 8.947e-11
alternative hypothesis: two.sided


### Pairwise Independence 

In [4]:
library(rcompanion)

# Pairwise tests of independence between the groups of the Win/Loss table,
# using Fisher's test only (G-test and chi-square are switched off).
PT <- pairwiseNominalIndependence(
  Matrix,
  fisher = TRUE,
  gtest = FALSE,
  chisq = FALSE,
  digits = 3
)

# Letter display built from the adjusted Fisher p-values at the 0.05 threshold.
cldList(
  p.value = PT$p.adj.Fisher,
  comparison = PT$Comparison,
  threshold = 0.05
)

Group,Letter,MonoLetter
Group_1_Pro_Se,a,a
Group_2_One_Non_Attorney,b,b
Group_3_One_Attorney,b,b
Group_4_Multiple_Representatives,a,a


## Regression Analysis on the Data

In [5]:
# NOTE(review): hard-coded, machine-specific working directory. It is kept so
# that relative paths keep resolving as before, but this cell only runs on a
# machine where this directory exists.
setwd("/home/siddhant/Desktop/projects/finra_legal_analytics")
# Load the processed master table; empty strings are read as NA.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
training.data.raw <- read.csv("data/processed/master_nov_26_20_41.csv",
                              header = TRUE,
                              na.strings = "")


In [6]:
## Subsetting the required data
# Keep columns 2 through 23 of the raw table (all rows).
data <- training.data.raw[, 2:23, drop = FALSE]
# Column positions within `data` that the factor conversion step operates on.
cols <- 2:20

In [7]:
## Converting the selected columns to factors
# Each column listed in `cols` is replaced by its factor() version.
data[cols] <- Map(factor, data[cols])

In [13]:
# Install caTools only when it is missing, so that re-running this cell does
# not download and reinstall the package every time.
if (!requireNamespace("caTools", quietly = TRUE)) {
  install.packages("caTools")
}
library(caTools)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)


In [14]:
# Fixed seed so the random train/test partition is reproducible.
set.seed(88)
# sample.split() expects the vector of outcome labels, one entry per
# observation. Given the whole data frame it sees length(data) == ncol(data)
# entries, so the mask has one value per COLUMN and subset() silently recycles
# it over the rows. Passing the Winner column yields one TRUE/FALSE per row.
sample <- sample.split(data$Winner, SplitRatio = 0.80)
# Rows flagged TRUE form the training set (about 80%), the rest the test set.
train <- subset(data, sample == TRUE)
test <- subset(data, sample == FALSE)

In [None]:
# Logistic regression of Winner on every other column of the training split.
glm.fit <- glm(Winner ~ ., data = train, family = binomial)

# Summarise once and reuse the result instead of recomputing it three times.
fit_summary <- summary(glm.fit)
fit_summary

# Coefficients whose p-value (4th column of the coefficient table) is at most
# 0.05. coef() avoids the partial match of `$coef` to `$coefficients`.
# Subsetting the named p-value vector keeps the term name even when only one
# term qualifies, and which() drops any NA p-values.
p_values <- coef(fit_summary)[, 4]
data.frame(p_value = p_values[which(p_values <= 0.05)])

## Regression Analysis on individual features
### How does the outcome vary across regions?

In [None]:
# Logistic regression of Winner on Region alone, fitted on the full data set.
glm.fit <- glm(
  formula = Winner ~ Region,
  family = binomial,
  data = data
)
# Store the model summary in `x`.
x <- summary(glm.fit)

In [None]:
## Comparing the odds ratio for various regions
# Exponentiated coefficients with their confidence limits. The intercept row
# is the baseline odds, not an odds ratio.
exp(cbind(Odd_Ratio = coef(glm.fit), confint(glm.fit)))

## Comparing the prob of regions
# The inverse logit of a single coefficient is not a probability for that
# region: the baseline log-odds (intercept) must be added first. The intercept
# row gives the reference level; every other row gives intercept + effect.
# NOTE(review): assumes R's default treatment (dummy) coding for Region --
# confirm no other contrasts are set.
fit_coefs <- coef(glm.fit)
is_intercept <- names(fit_coefs) == "(Intercept)"
log_odds <- fit_coefs + ifelse(is_intercept, 0, sum(fit_coefs[is_intercept]))
# plogis() is the numerically stable form of exp(x) / (1 + exp(x)).
data.frame(Probability = plogis(log_odds))

### How does the outcome vary with claim types?

In [None]:
# Logistic regression of Winner on the seventeen claim-type columns,
# fitted on the full data set (one term per line for readability).
glm.fit <- glm(
  Winner ~
    Claim.Type.1..Breach.of.Contract.Breach.of.Implied.Contract +
    Claim.Type.2..QuasiContractual.Claims +
    Claim.Type.3..Breach.of.Fiduciary.Duty +
    Claim.Type.4..Negligence +
    Claim.Type.5..Failure.to.Supervise.Negligent.Supervision +
    Claim.Type.6..Fraud +
    Claim.Type.7..Unjust.Enrichment +
    Claim.Type.8..Suitability.or.Unsuitability +
    Claim.Type.9..Misrepresentation +
    Claim.Type.10..Unauthorized.Trading +
    Claim.Type.11..Churning +
    Claim.Type.12..Failure.to.Execute +
    Claim.Type.13..Breach.of.Promissory.Note +
    Claim.Type.14..Conversion.1 +
    Claim.Type.15..Violation.of.Securities.Laws.Regulations +
    Claim.Type.16..Employment.Related.Claims +
    Claim.Type.17..Other,
  data = data,
  family = binomial
)

In [None]:
# Win probability implied by each term of the fitted model.
# The inverse logit of a single coefficient is not a probability: the baseline
# log-odds (intercept) must be added first. The intercept row is the
# probability with every predictor at its reference level; every other row is
# the probability with only that term switched on.
# NOTE(review): assumes R's default treatment (dummy) coding of the
# predictors -- confirm no other contrasts are set.
fit_coefs <- coef(glm.fit)
is_intercept <- names(fit_coefs) == "(Intercept)"
log_odds <- fit_coefs + ifelse(is_intercept, 0, sum(fit_coefs[is_intercept]))
# plogis() is the numerically stable form of exp(x) / (1 + exp(x)).
data.frame(Probability = plogis(log_odds))