In [75]:
!pip install statsmodels

Defaulting to user installation because normal site-packages is not writeable
Collecting statsmodels
  Downloading statsmodels-0.12.2-cp38-cp38-manylinux1_x86_64.whl (9.4 MB)
[K     |████████████████████████████████| 9.4 MB 15.7 MB/s eta 0:00:01
Collecting patsy>=0.5
  Downloading patsy-0.5.1-py2.py3-none-any.whl (231 kB)
[K     |████████████████████████████████| 231 kB 115.3 MB/s eta 0:00:01
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.1 statsmodels-0.12.2


In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

pd.set_option('display.max_columns', None)

In [2]:
type_map = {
    'decision_date' : 'str',
    'filing_date' : 'str',
    'Court Name' : 'category',
    'Party of Appointing President' : 'category',
    'CIRCUIT' : 'category',
    'JURIS' : 'category',
    'NOS' : 'category',
    'ORIGIN' : 'category',
    'RESIDENC' : 'category',
    'CLASSACT' : 'category',
    'DEMANDED' : 'float64',
    'TERMDATE' : 'str',
    'DISP' : 'category',
    'PROCPROG' : 'category',
    'NOJ' : 'category',
    'AMTREC' : 'category',
    'JUDGMENT' : 'category',
    'TAPEYEAR' : 'float64',
    'district' : 'category',
    'office' : 'category',
    'county' : 'category',
    'TRCLACT' : 'category',
    'PROSE' : 'category',
    'arbit' : 'category',
    'transoff' : 'category',
    'trmarb' : 'category',
    'ifp' : 'category',
    'statuscd' : 'category'
}

cols_to_drop = [
    'decision_date',
    'filing_date',
    'TERMDATE'
]

df = pd.read_csv(
    '/scratch/ayl316/ttml_mr_data/processed_data/cases.csv.zip', 
    dtype = type_map, 
    parse_dates = ['decision_date', 'filing_date', 'TERMDATE']
).rename(columns = {
    'Court Name' : 'court_name',
    'Party of Appointing President' : 'party'
}).drop(columns = cols_to_drop)


for col in ['party', 'TRCLACT', 'PROSE', 'arbit', 'transoff', 'trmarb', 'ifp', 'statuscd']:
    if not '-8' in df[col].cat.categories:
        df[col] = df[col].cat.add_categories('-8')
    df[col] = df[col].fillna('-8')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1219459 entries, 0 to 1219458
Data columns (total 25 columns):
 #   Column      Non-Null Count    Dtype   
---  ------      --------------    -----   
 0   court_name  1219459 non-null  category
 1   party       1219459 non-null  category
 2   CIRCUIT     1219459 non-null  category
 3   JURIS       1219459 non-null  category
 4   NOS         1219459 non-null  category
 5   ORIGIN      1219459 non-null  category
 6   RESIDENC    1219459 non-null  category
 7   CLASSACT    1219459 non-null  category
 8   DEMANDED    1219459 non-null  float64 
 9   DISP        1219459 non-null  category
 10  PROCPROG    1219459 non-null  category
 11  NOJ         1219459 non-null  category
 12  AMTREC      1219459 non-null  category
 13  JUDGMENT    1219459 non-null  category
 14  TAPEYEAR    1219459 non-null  float64 
 15  district    1219459 non-null  category
 16  office      1219459 non-null  category
 17  county      1219459 non-null  category
 18  TR

In [11]:
feature_cols = [
    'court_name',
    'CIRCUIT',
    'JURIS',
    'NOS',
    'ORIGIN',
    'RESIDENC',
    'CLASSACT',
#     'DEMANDED',
    'DISP',
    'PROCPROG',
    'NOJ',
    'AMTREC',
    'JUDGMENT',
#     'TAPEYEAR',
    'district',
    'office',
    'county',
    'TRCLACT',
    'PROSE',
    'arbit',
    'transoff',
    'trmarb',
    'ifp',
    'statuscd'
]

target_col = 'party'

df = df[(df[target_col] == 'Republican') | (df[target_col] == 'Democratic')]

df = df.sample(n = 50000)

for col in df.columns:
    if col in feature_cols:
        df[col] = df[col].cat.remove_unused_categories()

X = df[feature_cols]
y = df[target_col]


y = y.cat.add_categories(['1', '0'])
y[y == 'Democratic'] = '1'
y[y == 'Republican'] = '0'
y = y.cat.remove_unused_categories()

In [12]:
enc = OneHotEncoder(drop = 'first')
enc.fit(X)
X_ohe = pd.DataFrame(
    enc.transform(X).toarray(),
    columns = enc.get_feature_names(feature_cols)
)

# X_ohe = enc.transform(X).toarray()

In [13]:
np.asarray(X_ohe).shape

(50000, 3043)

In [16]:
# clf = LogisticRegression(random_state = 0).fit(X_ohe, y)


log_reg = sm.Logit(list(y.astype(float)), X_ohe).fit(method='lbfgs')



In [None]:
# print(log_reg.summary())

In [17]:
log_reg.pvalues

court_name_Supreme Court of the United States            NaN
court_name_U.S. Circuit Courts for the First Circuit     NaN
court_name_U.S. Circuit Courts for the Ninth Circuit     NaN
court_name_U.S. Circuit Courts for the Second Circuit    NaN
court_name_U.S. Circuit Courts for the Seventh Circuit   NaN
                                                          ..
trmarb_M                                                 NaN
trmarb_S                                                 NaN
trmarb_V                                                 NaN
ifp_FP                                                   NaN
statuscd_L                                               NaN
Length: 3043, dtype: float64