In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

In [2]:
X_train_all = pd.read_csv('data/X_train_all.csv', index_col='id')
X_test_all = pd.read_csv('data/X_test_all.csv', index_col='id')
y_train = pd.read_csv('data/y_train.csv')
y_test = pd.read_csv('data/y_test.csv')
y_train = y_train.loc[:, 'win_side']
y_test = y_test.loc[:, 'win_side']

In [3]:
y_train.mean()

0.6536312849162011

In [4]:
X_train_tfidf = X_train_all.iloc[:, :-6]
X_test_tfidf = X_test_all.iloc[:, :-6]
X_train_feat = X_train_all.iloc[:, -6:]
X_test_feat = X_test_all.iloc[:, -6:]

In [5]:
logit_tfidf = LogisticRegression()
logit_tfidf = logit_tfidf.fit(X = X_train_tfidf, y = y_train)
preds = logit_tfidf.predict(X_test_tfidf)
mean_acc = logit_tfidf.score(X_test_tfidf, y_test)
f1 = f1_score(y_test, preds)   
print('Mean accuracy:', mean_acc)
print('F1:', f1)

logit_feat = LogisticRegression()
logit_feat = logit_feat.fit(X = X_train_feat, y = y_train)
preds = logit_feat.predict(X_test_feat)
mean_acc = logit_feat.score(X_test_feat, y_test)
f1 = f1_score(y_test, preds)   
print('Mean accuracy:', mean_acc)
print('F1:', f1)
print(mean_acc)

logit_all = LogisticRegression()
logit_all = logit_all.fit(X = X_train_all, y = y_train)
preds = logit_all.predict(X_test_all)
mean_acc = logit_all.score(X_test_all, y_test)
f1 = f1_score(y_test, preds)   
print('Mean accuracy:', mean_acc)
print('F1:', f1)
print(mean_acc)

Mean accuracy: 0.6514657980456026
F1: 0.7881188118811882
Mean accuracy: 0.6579804560260586
F1: 0.7912524850894632
0.6579804560260586
Mean accuracy: 0.6807817589576547
F1: 0.8008130081300814
0.6807817589576547


In [6]:
# Solvers excluded:
# liblinear (limited to one-vs-rest schemas)
# sag and saga (convergence issues without additional preprocessing)

param_grid = {
    'solver': ['lbfgs', 'newton-cg', 'newton-cholesky'],
    'penalty': ['l2', None],
    'C': [1.0, 0.75, 0.5, 0.25]
}

In [7]:
logit_all = LogisticRegression()
logit_all = logit_all.fit(X = X_train_all, y = y_train)
search = GridSearchCV(logit_all, param_grid, n_jobs=-1, return_train_score=True)
search.fit(X_train_all, y_train)

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further opti

In [8]:
print('Best parameters found:\n', search.best_params_)

Best parameters found:
 {'C': 0.75, 'penalty': 'l2', 'solver': 'lbfgs'}


In [9]:
results = pd.DataFrame(search.cv_results_)
results = results.loc[:, ['param_solver', 'param_penalty', 'param_C', 'mean_train_score', 'mean_test_score', 'rank_test_score']]
results = results.sort_values('rank_test_score')
results

Unnamed: 0,param_solver,param_penalty,param_C,mean_train_score,mean_test_score,rank_test_score
8,newton-cholesky,l2,0.75,0.722065,0.659237,1
7,newton-cg,l2,0.75,0.722065,0.659237,1
6,lbfgs,l2,0.75,0.722065,0.659237,1
14,newton-cholesky,l2,0.5,0.685753,0.657828,4
13,newton-cg,l2,0.5,0.685753,0.657828,4
12,lbfgs,l2,0.5,0.685753,0.657828,4
0,lbfgs,l2,1.0,0.764314,0.65643,7
2,newton-cholesky,l2,1.0,0.764314,0.65643,7
1,newton-cg,l2,1.0,0.764314,0.65643,7
18,lbfgs,l2,0.25,0.658521,0.652234,10


In [10]:
# According to GridSearchCV, the best parameters found are:
# {'C': 0.75, 'penalty': 'l2', 'solver': 'lbfgs'}

mod = LogisticRegression(C=0.75)
mod = mod.fit(X = X_train_all, y = y_train)
mod_preds = mod.predict(X_test_all)
mean_acc = mod.score(X_test_all, y_test)
print('Mean accuracy with C=0.75:', mean_acc)
print('F1:', f1_score(y_test, mod_preds))

Mean accuracy with C=0.75: 0.6677524429967426
F1: 0.7951807228915664


In [17]:
features = pd.DataFrame(X_train_all.columns[-6:])
mod_coefs = pd.DataFrame(mod.coef_[:, -6:]).T
mod_coefs = pd.merge(features, mod_coefs, left_index=True, right_index=True)
mod_coefs.columns = ['feature', 'weight (log odds)']
mod_coefs = mod_coefs.set_index('feature')
mod_coefs

Unnamed: 0_level_0,weight (log odds)
feature,Unnamed: 1_level_1
convo_count,-0.092685
justice_utt_share,0.210192
petitioner_advocate_utt_share,-1.977999
female_utt_share,0.041941
cons_just,0.193704
prop_cons,0.414848


In [18]:
# https://towardsdatascience.com/interpreting-coefficients-in-linear-and-logistic-regression-6ddf1295f6f1
mod_coefs.loc[:, 'odds'] = np.exp(mod_coefs.loc[:, 'weight (log odds)'])
mod_coefs.loc[:, '1/odds'] = ''
mod_coefs.loc[mod_coefs.loc[:, 'weight (log odds)'] < 0, '1/odds'] =  (1 / mod_coefs.loc[:, 'odds'])
mod_coefs

Unnamed: 0_level_0,weight (log odds),odds,1/odds
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
convo_count,-0.092685,0.911481,1.097116
justice_utt_share,0.210192,1.233915,
petitioner_advocate_utt_share,-1.977999,0.138346,7.228266
female_utt_share,0.041941,1.042833,
cons_just,0.193704,1.213737,
prop_cons,0.414848,1.514141,
