In [1]:
import os, sys
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Logit

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Put the parent directory on the import path (once) so that the project's
# `utils` package, imported right after this, can be found.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

import utils.explore_utils as eu

In [2]:
# dtype overrides passed to `pd.read_csv`, grouped by target dtype.
type_map = {
    # Date columns are declared as strings here; the loader additionally
    # lists them in `parse_dates`.
    **dict.fromkeys(['decision_date', 'filing_date', 'TERMDATE'], 'str'),
    **dict.fromkeys(['DEMANDED', 'TAPEYEAR'], 'float64'),
    # Every other mapped column is read as a categorical.
    **dict.fromkeys(
        [
            'Court Name', 'Party of Appointing President',
            'CIRCUIT', 'JURIS', 'NOS', 'ORIGIN', 'RESIDENC', 'CLASSACT',
            'DISP', 'PROCPROG', 'NOJ', 'AMTREC', 'JUDGMENT',
            'district', 'office', 'county',
            'TRCLACT', 'PROSE', 'arbit', 'transoff', 'trmarb', 'ifp', 'statuscd',
        ],
        'category',
    ),
}

# Columns discarded immediately after loading. 'filing_date' is deliberately
# not listed: it is still needed to derive `filing_year` and is dropped after that.
cols_to_drop = ['decision_date', 'TERMDATE', 'TAPEYEAR']

# Load the case-level data: apply the dtype overrides, parse the three date
# columns, rename the two human-readable headers to snake_case and drop the
# columns listed in `cols_to_drop`.
df = pd.read_csv(
    '/scratch/ayl316/ttml_mr_data/processed_data/cases.csv.zip',
    dtype = type_map,
    parse_dates = ['decision_date', 'filing_date', 'TERMDATE']
).rename(columns = {
    'Court Name' : 'court_name',
    'Party of Appointing President' : 'party'
}).drop(columns = cols_to_drop)

# Derive the filing year as a categorical label.
# `astype(str)` turns a missing year into the literal string 'nan', which the
# sentinel fill below can no longer detect as missing. Remember which rows
# were missing and blank them out again after the string conversion so that
# they stay real NaNs and receive the '-8' sentinel like every other column.
# The labels of the non-missing years are unchanged.
df['filing_year'] = pd.DatetimeIndex(df['filing_date']).year
missing_filing_year = df['filing_year'].isna()
df['filing_year'] = (
    df['filing_year']
    .astype(str)
    .mask(missing_filing_year)
    .astype('category')
)
df = df.drop(columns = ['filing_date'])

# Replace missing values in these categoricals with the sentinel level '-8'.
# The level has to exist in the categories before `fillna` may assign it.
for col in ['party', 'TRCLACT', 'PROSE', 'arbit', 'transoff', 'trmarb', 'ifp', 'statuscd', 'filing_year']:
    if '-8' not in df[col].cat.categories:
        df[col] = df[col].cat.add_categories('-8')
    df[col] = df[col].fillna('-8')

# Combined "<district>_<filing_year>" label, built from the string forms of
# both columns.
df['district_year'] = df['district'].astype(str) + '_' + df['filing_year'].astype(str)

In [3]:
# Summarise the loaded frame: per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1219459 entries, 0 to 1219458
Data columns (total 26 columns):
 #   Column         Non-Null Count    Dtype   
---  ------         --------------    -----   
 0   court_name     1219459 non-null  category
 1   party          1219459 non-null  category
 2   CIRCUIT        1219459 non-null  category
 3   JURIS          1219459 non-null  category
 4   NOS            1219459 non-null  category
 5   ORIGIN         1219459 non-null  category
 6   RESIDENC       1219459 non-null  category
 7   CLASSACT       1219459 non-null  category
 8   DEMANDED       1219459 non-null  float64 
 9   DISP           1219459 non-null  category
 10  PROCPROG       1219459 non-null  category
 11  NOJ            1219459 non-null  category
 12  AMTREC         1219459 non-null  category
 13  JUDGMENT       1219459 non-null  category
 14  district       1219459 non-null  category
 15  office         1219459 non-null  category
 16  county         1219459 non-null  cat

In [4]:
# Predictors used in the models below.
# Considered but currently left out: NOJ, JUDGMENT, PROSE, trmarb, TRCLACT,
# statuscd, PROCPROG, CIRCUIT, transoff, court_name, TAPEYEAR, DISP,
# and ifp / arbit (too many nulls).
feature_cols = [
    'CLASSACT', 'JURIS', 'ORIGIN', 'office', 'NOS',
    'district', 'RESIDENC', 'filing_year', 'district_year',
]

# Outcome column: the `party` column (renamed from 'Party of Appointing President').
target_col = 'party'

# eu.cat_heat_map(df, feature_cols)

In [5]:
# Display the selected feature columns.
df[feature_cols]

Unnamed: 0,CLASSACT,JURIS,ORIGIN,office,NOS,district,RESIDENC,filing_year,district_year
0,-8.0,2.0,1.0,8,510.0,16,-8.0,2016.0,16_2016.0
1,-8.0,2.0,1.0,8,510.0,16,-8.0,2016.0,16_2016.0
2,-8.0,2.0,1.0,8,510.0,16,-8.0,2016.0,16_2016.0
3,-8.0,2.0,1.0,8,510.0,16,-8.0,2016.0,16_2016.0
4,-8.0,2.0,1.0,8,510.0,16,-8.0,2016.0,16_2016.0
...,...,...,...,...,...,...,...,...,...
1219454,-8.0,4.0,2.0,4,110.0,65,15.0,2016.0,65_2016.0
1219455,-8.0,4.0,2.0,4,110.0,65,15.0,2016.0,65_2016.0
1219456,-8.0,3.0,1.0,2,442.0,15,-8.0,2015.0,15_2015.0
1219457,-8.0,3.0,1.0,2,442.0,15,-8.0,2015.0,15_2015.0


In [6]:


# Keep only the rows whose target is 'Republican' or 'Democratic'.
# `.copy()` makes the filtered frame independent of the unfiltered one, so the
# column assignments below do not raise pandas' SettingWithCopyWarning.
df = df[df[target_col].isin(['Republican', 'Democratic'])].copy()

# Drop category levels that no longer occur after filtering, so that they do
# not produce one-hot columns later on.
for col in feature_cols:
    if df[col].dtype.name == 'category':
        df[col] = df[col].cat.remove_unused_categories()

X = df[feature_cols]

# Binary target kept as a categorical with string labels:
# '1' = Democratic, '0' = Republican.
y = (
    df[target_col]
    .cat.remove_unused_categories()
    .cat.rename_categories({'Democratic': '1', 'Republican': '0'})
)

In [7]:
# One-hot encode every feature; the first level of each feature is dropped and
# the result is converted to a dense array.
enc = OneHotEncoder(drop = 'first')
X_ohe = enc.fit_transform(X).toarray()

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`. Use whichever this installation provides
# so the cell runs on both old and new versions.
if hasattr(enc, 'get_feature_names_out'):
    ohe_feature_names = enc.get_feature_names_out(feature_cols)
else:
    ohe_feature_names = enc.get_feature_names(feature_cols)

# Standardisation is currently disabled; kept for reference.
# scaler = StandardScaler().fit(X_ohe)
# X_scaled = pd.DataFrame(
#     scaler.transform(X_ohe),
#     columns = ohe_feature_names
# )

# Despite its name, `X_scaled` holds the unscaled 0/1 dummies.
X_scaled = pd.DataFrame(
    X_ohe,
    columns = ohe_feature_names
)

# Add the intercept column for statsmodels.
X_scaled = sm.add_constant(X_scaled)

In [8]:
# (rows, columns) of the design matrix, including the added constant column.
X_scaled.shape

(1187426, 3249)

In [9]:
# Ordinary least squares of the numeric 0/1 target on the design matrix.
# The target is passed as a plain list of floats.
y_numeric = y.astype(float).tolist()
lin_model = sm.OLS(y_numeric, X_scaled)
lin_reg = lin_model.fit()

# Per-coefficient p-values, indexed by design-matrix column name.
lin_pvalues = lin_reg.pvalues

In [10]:
# Coefficients whose p-value is below 0.05.
lin_pvalues[lin_pvalues < 0.05]

CLASSACT_-9.0    0.000000e+00
CLASSACT_0.0     0.000000e+00
CLASSACT_1.0     7.443657e-18
CLASSACT_2.0     1.124023e-56
CLASSACT_3.0     1.166023e-12
                     ...     
RESIDENC_46.0    8.387763e-03
RESIDENC_53.0    1.472490e-02
RESIDENC_56.0    9.864274e-05
RESIDENC_62.0    9.658875e-04
RESIDENC_66.0    3.789254e-02
Length: 100, dtype: float64

In [11]:
# Number of coefficients whose p-value is above 0.05.
lin_pvalues[lin_pvalues > 0.05].shape

(3149,)

In [40]:
def get_ohe_col_indices(ohe_cols, col_name):
    """Return the index labels of the entries in ``ohe_cols`` that start with ``col_name``.

    ``ohe_cols`` is wrapped in a ``pd.Series``; for a list or a column Index
    the returned labels are therefore the 0-based positions of the matches.

    This is a plain prefix match: asking for ``'district'`` also returns the
    ``'district_year...'`` columns.
    """
    names = pd.Series(ohe_cols)
    has_prefix = names.str.startswith(col_name)
    return names[has_prefix].index.tolist()


def get_complement_indices(n_cols, indices):
    """Return, in ascending order, every index in ``range(n_cols)`` that is not in ``indices``."""
    excluded = set(indices)
    return [position for position in range(n_cols) if position not in excluded]

In [12]:
def _split_ohe_name(ohe_name, features):
    """Split a one-hot column name ``'<feature>_<category>'`` into its two parts.

    Feature names may themselves contain underscores ('filing_year',
    'district_year'), so the name cannot simply be split on '_'. Instead the
    known feature names are tried longest first, which also keeps
    'district_year_...' from being attributed to 'district'.

    Returns ``(feature, category)``, or ``None`` when ``ohe_name`` does not
    belong to any of ``features`` (e.g. the intercept column).
    """
    for feature in sorted(features, key=len, reverse=True):
        prefix = feature + '_'
        if ohe_name.startswith(prefix):
            return feature, ohe_name[len(prefix):]
    return None


# Map each feature to the list of its category levels whose coefficient has a
# p-value below 0.05.
sig_map = {}

for col in lin_pvalues[lin_pvalues < 0.05].index:
    parsed = _split_ohe_name(col, feature_cols)
    if parsed is None:
        # Not a one-hot column of any feature (e.g. the intercept).
        continue

    col_name, col_value = parsed
    sig_map.setdefault(col_name, []).append(col_value)

In [14]:
# Show the significant category levels collected per feature.
sig_map

{'CLASSACT': ['-9.0', '0.0', '1.0', '2.0', '3.0'],
 'JURIS': ['2.0', '3.0', '5.0'],
 'ORIGIN': ['2.0', '4.0', '5.0', '6.0', '7.0', '8.0'],
 'office': ['1', '2', '3', '4', '5', '6', '8', '9', 'A'],
 'NOS': ['151.0',
  '160.0',
  '190.0',
  '191.0',
  '220.0',
  '230.0',
  '290.0',
  '315.0',
  '330.0',
  '345.0',
  '355.0',
  '360.0',
  '362.0',
  '370.0',
  '371.0',
  '375.0',
  '400.0',
  '410.0',
  '423.0',
  '440.0',
  '441.0',
  '442.0',
  '443.0',
  '445.0',
  '446.0',
  '448.0',
  '450.0',
  '463.0',
  '465.0',
  '470.0',
  '490.0',
  '530.0',
  '540.0',
  '550.0',
  '555.0',
  '560.0',
  '610.0',
  '625.0',
  '630.0',
  '640.0',
  '710.0',
  '720.0',
  '740.0',
  '790.0',
  '791.0',
  '810.0',
  '830.0',
  '850.0',
  '861.0',
  '862.0',
  '863.0',
  '864.0',
  '865.0',
  '870.0',
  '875.0',
  '890.0',
  '891.0',
  '892.0',
  '893.0',
  '894.0',
  '895.0',
  '896.0',
  '899.0',
  '900.0',
  '920.0',
  '930.0',
  '950.0',
  '990.0'],
 'RESIDENC': ['11.0',
  '26.0',
  '33.0',
  '42

In [15]:
# Full statsmodels regression report for the OLS fit.
lin_reg.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.139
Model:,OLS,Adj. R-squared:,0.137
Method:,Least Squares,F-statistic:,61.73
Date:,"Thu, 29 Apr 2021",Prob (F-statistic):,0.0
Time:,19:21:04,Log-Likelihood:,-747360.0
No. Observations:,1187426,AIC:,1501000.0
Df Residuals:,1184315,BIC:,1538000.0
Df Model:,3110,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.875e+09,8.56e+09,-1.037,0.300,-2.56e+10,7.9e+09
CLASSACT_-9.0,0.2537,0.005,49.025,0.000,0.244,0.264
CLASSACT_0.0,0.1531,0.002,80.132,0.000,0.149,0.157
CLASSACT_1.0,0.0347,0.004,8.608,0.000,0.027,0.043
CLASSACT_2.0,0.1547,0.010,15.865,0.000,0.136,0.174
CLASSACT_3.0,0.1428,0.020,7.109,0.000,0.103,0.182
JURIS_2.0,-0.0136,0.003,-4.113,0.000,-0.020,-0.007
JURIS_3.0,0.0139,0.003,4.357,0.000,0.008,0.020
JURIS_4.0,-0.2713,0.161,-1.681,0.093,-0.588,0.045

0,1,2,3
Omnibus:,9424461.462,Durbin-Watson:,0.291
Prob(Omnibus):,0.0,Jarque-Bera (JB):,116298.262
Skew:,0.284,Prob(JB):,0.0
Kurtosis:,1.576,Cond. No.,3.35e+17


In [45]:
# Joint F-test on the coefficients that are neither `district_year` dummies
# nor the intercept.
n_params = len(lin_reg.params)
control_indices = get_ohe_col_indices(X_scaled.columns, 'district_year')
non_control_indices = get_complement_indices(n_params, control_indices)

# One identity row per tested coefficient. The first retained index is skipped,
# which removes the row of the constant column that `add_constant` put first.
A = np.identity(n_params)[non_control_indices[1:], :]

print(lin_reg.f_test(A))

<F test: F=array([[1.06162199]]), p=0.30284574802968595, df_denom=1.18e+06, df_num=1>


