In [49]:
import numpy as np
import pandas as pd
import re
from convokit import Corpus, download
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
import gender_guesser.detector as gender


In [50]:
#corpus = Corpus(filename=download('supreme-corpus'))
all_utts = pd.read_csv('data/utts.csv')

In [51]:
# All cases
cases = pd.read_json(path_or_buf='data/cases.jsonl', lines=True)

# Cases with clear winners
df = cases.loc[cases.loc[:, 'win_side'].isin([1, 0])]

# Roberts court cases with clear winners
roberts = df.loc[df.loc[:, 'court'] == 'Roberts Court', :]

# All utterances
#all_utts = corpus.get_utterances_dataframe()

# Roberts court case utterances
roberts_ids = roberts.loc[:, 'id'].unique()
utts = all_utts.loc[all_utts.loc[:, 'meta.case_id'].isin(roberts_ids)]

# Roberts court cases with clearn winners and utterance data
    # Unique case IDs from utts indicate the number of Roberts court cases
    # the corpus has utterance data for
subset_ids = utts.loc[:, 'meta.case_id'].unique()
    # Use these ids to subset the roberts data frame
    # (exclude cases without utterance data)
roberts = roberts.loc[roberts.loc[:, 'id'].isin(subset_ids)]
print(len(roberts), 'cases')
petitioner_wins = roberts.loc[:, 'win_side'].mean()
print(round(petitioner_wins * 100, 2),
      'percent of cases were decided favorably for the petitioner')

1024 cases
65.33 percent of cases were decided favorably for the petitioner


In [52]:
# Utterances per case
print('Min:', utts.groupby(['meta.case_id']).size().min()) 
print('Max:', utts.groupby(['meta.case_id']).size().max())
print('Mean', utts.groupby(['meta.case_id']).size().mean())

Min: 1
Max: 1235
Mean 237.3955078125


In [53]:
# Drop single-utterance cases
utt_counts = pd.DataFrame(utts.groupby(['meta.case_id']).size())
utt_counts = utt_counts.reset_index()
utt_counts = utt_counts.rename(columns={0: 'utt_counts'})
utts = pd.merge(utts, utt_counts, how = 'left')
utts = utts.loc[utts.loc[:, 'utt_counts'] != 1, :]
print('1 case dropped')
print('New min:', utts.groupby(['meta.case_id']).size().min()) 
print(len(utts.groupby(['meta.case_id'])), 'cases')

# Store utts for use in other notebooks
%store utts

1 case dropped
New min: 66
1023 cases
Stored 'utts' (DataFrame)


In [54]:
# Drop single-sided cases

# Case IDs with 0 or 100% petitioner_advocate_utt_share
%store -r advocates
single_sided = (advocates.loc[
    ((advocates.loc[:, 'petitioner_advocate_utt_share'] == 0.0) |
     (advocates.loc[:, 'petitioner_advocate_utt_share'] == 1.0)), :])
single_sided = single_sided.reset_index()

# advocates has one row per case ID --> case_ids are already unique
iffy_ids = single_sided.loc[:, 'meta.case_id']
utts = utts.loc[~utts.loc[:, 'meta.case_id'].isin(iffy_ids)]
print(len(utts.groupby(['meta.case_id'])), 'cases')
print(utts)

1004 cases
                  id  timestamp  \
0       22620__0_000        NaN   
1       22620__0_001        NaN   
2       22620__0_002        NaN   
3       22620__0_003        NaN   
4       22620__0_004        NaN   
...              ...        ...   
243088  24969__2_007        NaN   
243089  24969__2_008        NaN   
243090  24969__2_009        NaN   
243091  24969__2_010        NaN   
243092  24969__2_011        NaN   

                                                     text  \
0       We'll hear argument first this morning in 04-4...   
1       Thank you, Mr. Chief Justice, and may it pleas...   
2       Well, isn't there something different here?\nB...   
3       I don't agree, Justice Souter, and here's why....   
4       Sure, but they suffered the harm because the f...   
...                                                   ...   
243088  -- has all sorts of meaning that you're not en...   
243089                                  No, Your Honor --   
243090             

In [55]:
# Compute justice utterance share
utts['justice'] = utts.loc[:, 'meta.speaker_type'] == 'J'
df = utts.groupby('meta.case_id').agg({'justice': ['sum', 'count']})
df.columns = [ 'justice_utts', 'tot_utts']
df['justice_utt_share'] = df.loc[:, 'justice_utts'] / df.loc[:, 'tot_utts']
justices = df.loc[:, ['justice_utt_share']]
print('Min:', justices.loc[:, 'justice_utt_share'].min())
print('Max:', justices.loc[:, 'justice_utt_share'].max())
print('Mean:', justices.loc[:, 'justice_utt_share'].mean())

Min: 0.48
Max: 0.6521739130434783
Mean: 0.5152917822677269


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utts['justice'] = utts.loc[:, 'meta.speaker_type'] == 'J'


In [56]:
# Compute petitioner advocate utterance share
advocates = utts.loc[utts.loc[:, 'meta.speaker_type'] == 'A', :].copy()
advocates['petitioner_advocate'] = advocates.loc[:, 'meta.side'] == 1
advocates = advocates.groupby('meta.case_id').agg({'petitioner_advocate': ['sum', 'count']})
advocates.columns = [ 'petitioner_advocate_utts', 'total_advocate_utts']
advocates['petitioner_advocate_utt_share'] = advocates.loc[:, 'petitioner_advocate_utts'] / advocates.loc[:, 'total_advocate_utts']
advocates = advocates.loc[:, [ 'petitioner_advocate_utt_share']]
print('Min', advocates.loc[:, 'petitioner_advocate_utt_share'].min())
print('Max', advocates.loc[:, 'petitioner_advocate_utt_share'].max())
print('Mean', advocates.loc[:, 'petitioner_advocate_utt_share'].mean())
%store advocates

Min 0.14130434782608695
Max 0.967741935483871
Mean 0.4602423775745476
Stored 'advocates' (DataFrame)


In [57]:
# Compute female utterance share
d = gender.Detector()

def guess_gender(name):
    name = re.sub('j__', '', name)
    name = re.sub('_\S*', '', name)
    name = name.capitalize()
    guess = d.get_gender(name, 'usa')
    return guess

# Guess gender
gendr = utts.loc[:, ['meta.case_id', 'speaker' ]].copy()
gendr.loc[:, 'gender'] = gendr.loc[:, 'speaker'].map(guess_gender)
print('Before subsetting:')
print(gendr.loc[:, 'gender'].unique())
print(gendr.groupby('gender').agg({'gender': ['count']}).droplevel)

# Restrict to confident guesses
gendr = gendr.loc[gendr.loc[:, 'gender'].isin(['female', 'mostly_female', 
                                               'mostly_male', 'male'])]
print('\nAfter subsetting:')
print(gendr.loc[:, 'gender'].unique())
print(gendr.groupby('gender').agg({'gender': ['count']}).droplevel)

# Compute female_utt_share 
    # = (N female + N mostly_female) / 
    # (N female + N mostly_female + N male + N mostly_male)
gendr.loc[:, 'gender_num'] = gendr.loc[:, 'gender'].isin(['female', 
                                                          'mostly_female'])
gendr = gendr.groupby('meta.case_id').agg({'gender_num': ['sum', 'count']})
gendr.columns = ['female_utts', 'total_utts']
gendr.loc[:, 'female_utt_share'] = (gendr.loc[:, 'female_utts'] /
                                    gendr.loc[:, 'total_utts'])
gendr = gendr.loc[:, ['female_utt_share']]

Before subsetting:
['male' 'female' 'andy' 'mostly_male' 'unknown' 'mostly_female']
<bound method NDFrame.droplevel of                gender
                count
gender               
andy            28747
female          41059
male           156479
mostly_female    1856
mostly_male      3099
unknown          7680>

After subsetting:
['male' 'female' 'mostly_male' 'mostly_female']
<bound method NDFrame.droplevel of                gender
                count
gender               
female          41059
male           156479
mostly_female    1856
mostly_male      3099>


In [58]:
# Combine utterance shares
utt_shares = pd.merge(justices, advocates, how = 'left', left_index = True, right_index = True)
utt_shares = pd.merge(utt_shares, gendr, how = 'left', left_index = True, right_index = True)
utt_shares.head()

Unnamed: 0_level_0,justice_utt_share,petitioner_advocate_utt_share,female_utt_share
meta.case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005_04-1034,0.513353,0.603659,0.06639
2005_04-10566,0.525333,0.533708,0.14
2005_04-1067,0.534954,0.48366,0.275081
2005_04-1084,0.526611,0.514793,0.358025
2005_04-1131,0.509434,0.584615,0.403361


In [59]:
#Code to create conversations/case feature
utts['text'].replace('\d+', '', regex=True, inplace = True) 
convo_case = pd.DataFrame(utts.groupby('meta.case_id')['conversation_id'].nunique())
convo_case.rename(columns = {'conversation_id':'convo_count'}, inplace = True)
print(convo_case.loc[:,'convo_count'].describe())

"""PROPORTION OF UTTERANCES BY JUSTICES APPOINTED BY REPUBLICAN PRESIDENTS
Does not consider length of utterance, which also may be of interest"""
just_dict = {'j__john_g_roberts_jr':0, 'j__david_h_souter':0,
       'j__anthony_m_kennedy':0, 'j__ruth_bader_ginsburg':1,
       'j__john_paul_stevens':0, 'j__samuel_a_alito_jr':0,
       'j__antonin_scalia':0, 'j__stephen_g_breyer':1,
       'j__sandra_day_oconnor':0, 'j__clarence_thomas': 0 ,
       'j__sonia_sotomayor':1, 'j__elena_kagan':1, 'j__neil_gorsuch': 0,
       'j__brett_m_kavanaugh':0} #based on being appointed by cons/liberal president
j_utts = utts.copy()
j_utts = j_utts.loc[utts.loc[:,'justice'] == True]
j_utts.loc[:,'cons_just']= j_utts.loc[:,'speaker'].map(just_dict)
cons_just = j_utts.groupby(['meta.case_id']).agg({'cons_just':'mean'}) 
print(cons_just.loc[:,'cons_just'].describe())


count    1004.000000
mean        1.016932
std         0.150480
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         4.000000
Name: convo_count, dtype: float64
count    1004.000000
mean        0.434504
std         0.155229
min         0.045455
25%         0.307692
50%         0.444994
75%         0.553393
max         0.915789
Name: cons_just, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [60]:
#PROPORTION OF REPUBLICAN APPOINTED JUSTICES ON THE COURT AT THE TIME OF THE DECISION
roberts.loc[:,'decided_date'] = pd.to_datetime(roberts.loc[:,'decided_date'])
roberts.loc[roberts.loc[:,'id'] == '2019_17-1268','decided_date'] = '2020-06-19'
roberts.loc[roberts.loc[:,'decided_date'].between('2005-10-29','2009-08-07'), 'prop_cons'] = 7/9
roberts.loc[roberts.loc[:,'decided_date'].between('2009-08-08','2010-08-06'), 'prop_cons'] = 6/9
roberts.loc[roberts.loc[:,'decided_date'].between('2010-08-07','2016-02-12'), 'prop_cons'] = 5/9
roberts.loc[roberts.loc[:,'decided_date'].between('2016-02-13','2017-04-09'), 'prop_cons'] = 4/8
roberts.loc[roberts.loc[:,'decided_date'].between('2017-04-10','2020-07-10'), 'prop_cons'] = 5/9

prop_cons = roberts.loc[:,['id','prop_cons']].set_index('id')
print(roberts.loc[:,'prop_cons'].describe())

count    1024.000000
mean        0.623210
std         0.103381
min         0.500000
25%         0.555556
50%         0.555556
75%         0.777778
max         0.777778
Name: prop_cons, dtype: float64


In [73]:
group_utts = utts.groupby('meta.case_id')['text'].apply(' '.join)
df = pd.merge(group_utts,roberts[['id','win_side']], how = 'left', 
              left_on = 'meta.case_id', right_on = 'id')
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:,df.columns != 'win_side'],df['win_side'], test_size = 0.3, random_state = 0)

In [74]:
#Initialize Vectorizer and vectorize train data
count_vect = CountVectorizer(ngram_range = (1,1),min_df = 4,
            stop_words = 'english', token_pattern = r'\b[a-zA-Z]{3,}\b') 
count_df = count_vect.fit_transform(X_train['text'])
count_array = count_df.toarray()
count_df = pd.DataFrame(count_array,columns = count_vect.get_feature_names(), index = X_train['id'])
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(count_df).toarray()
X_train_tfidf = pd.DataFrame(X_train_tfidf, columns = count_vect.get_feature_names(), index = X_train['id'])

X_train_all = X_train_tfidf.copy()
X_train_all = pd.merge(X_train_all,convo_case, how = 'left', left_index = True, right_index = True)
X_train_all = pd.merge(X_train_all,utt_shares, how = 'left', left_index = True, right_index = True)
X_train_all = pd.merge(X_train_all,cons_just, how = 'left', left_index = True, right_index = True)
X_train_all = pd.merge(X_train_all,prop_cons, how = 'left', left_index = True, right_index = True)
#MERGING IN ADDITIONAL FEATURES HERE
X_train_all

Unnamed: 0_level_0,aaa,aba,abandon,abandoned,abandoning,abandonment,abandons,abate,abated,abatement,...,zipes,zone,zones,zoning,convo_count,justice_utt_share,petitioner_advocate_utt_share,female_utt_share,cons_just,prop_cons
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011_10-8505,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.528634,0.485981,0.380208,0.416667,0.555556
2019_17-834,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.524823,0.238806,0.168498,0.513514,0.555556
2010_10-76,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.515152,0.265625,0.533333,0.441176,0.555556
2008_07-773,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.508418,0.383562,0.089286,0.364238,0.777778
2012_12-399,0.0,0.0,0.0,0.003799,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.510417,0.503546,0.390947,0.448980,0.555556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017_16-1348,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.517483,0.637681,0.244444,0.614865,0.555556
2007_07-21,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.508143,0.503311,0.063670,0.166667,0.777778
2013_12-9490,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.512281,0.496403,0.172996,0.328767,0.555556
2012_12-167,0.0,0.0,0.0,0.022821,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.508333,0.389831,0.214286,0.557377,0.555556


In [75]:
#Vectorize test data
count_test = count_vect.transform(X_test['text'])
count_test_array = count_test.toarray()
count_test_df = pd.DataFrame(count_test_array, columns = count_vect.get_feature_names(),index = X_test['id'])

X_test_tfidf = tfidf_transformer.transform(count_test_df).toarray()
X_test_tfidf = pd.DataFrame(X_test_tfidf, columns = count_vect.get_feature_names(), index = X_test['id'])
X_test_all = X_test_tfidf.copy()

X_test_all = pd.merge(X_test_all,convo_case, how = 'left', left_index = True, right_index = True)
X_test_all = pd.merge(X_test_all,utt_shares, how = 'left', left_index = True, right_index = True)
X_test_all = pd.merge(X_test_all,cons_just, how = 'left', left_index = True, right_index = True)
X_test_all = pd.merge(X_test_all,prop_cons, how = 'left', left_index = True, right_index = True)
X_test_all 

Unnamed: 0_level_0,aaa,aba,abandon,abandoned,abandoning,abandonment,abandons,abate,abated,abatement,...,zipes,zone,zones,zoning,convo_count,justice_utt_share,petitioner_advocate_utt_share,female_utt_share,cons_just,prop_cons
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018_17-1272,0.014901,0.00653,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.521912,0.500000,0.259669,0.679389,0.555556
2013_12-5196,0.000000,0.00000,0.005705,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.510204,0.791667,0.315789,0.613333,0.555556
2009_08-1196,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.501992,0.459677,0.110132,0.388889,0.666667
2009_09-5327,0.000000,0.00000,0.000000,0.0,0.0,0.058243,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.512077,0.594059,0.123077,0.330189,0.666667
2012_12-43,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.517588,0.468750,0.401130,0.757282,0.555556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2011_10-699,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.524752,0.562500,0.142857,0.377358,0.555556
2010_09-907,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.530534,0.650407,0.305310,0.424460,0.555556
2005_04-1203,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.510204,0.479167,0.114173,0.260000,0.777778
2006_05-7058,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0.504237,0.444444,0.642202,0.268908,0.777778


In [76]:
 X_val_all, X_test_all, y_val, y_test = train_test_split(X_test_all, y_test, 
    test_size=0.66, random_state= 0)
print(len(X_test_all), len(X_val_all))
print(len(y_test), len(y_val))


200 102
200 102


In [78]:
X_train_all.to_csv('data/X_train_all')
X_val_all.to_csv('data/X_val_all')
X_test_all.to_csv('data/X_test_all')

y_train.to_csv('data/y_train')
y_val.to_csv('data/y_val')
y_test.to_csv('data/y_test')

In [81]:
#CONSTRUCTED FEATURE ONLY MODEL, TRANSPLANT THIS CODE TO SEPERATE FEATURE ENGINEERING FILES
X_train_feat = X_train_all.loc[:,['convo_count',
       'justice_utt_share', 'petitioner_advocate_utt_share', 'cons_just','prop_cons']]
X_val_feat = X_val_all.loc[:,['convo_count',
       'justice_utt_share', 'petitioner_advocate_utt_share', 'cons_just','prop_cons']]

TypeError: bad operand type for unary ~: 'list'

In [34]:
#REMOVE BEFORE SUBMISSION, current use is as reference. Below code NOT adjusted to consider validation subset 
logit_tfidf = LogisticRegression()
logit_tfidf = logit_tfidf.fit(X = X_train_tfidf, y = y_train)
logit_tfidf.predict(X_test_tfidf)
mean_acc = logit_tfidf.score(X_test_tfidf, y_test)
print(mean_acc)

logit_all = LogisticRegression()
logit_all = logit_all.fit(X = X_train_all, y = y_train)
logit_all.predict(X_test_all)
mean_acc = logit_all.score(X_test_all, y_test)
print(mean_acc)

logit_feat = LogisticRegression()
logit_feat = logit_feat.fit(X = X_train_feat, y = y_train)
logit_feat.predict(X_test_feat)
mean_acc = logit_feat.score(X_test_feat, y_test)
print(mean_acc)

ValueError: Found input variables with inconsistent numbers of samples: [44, 307]

In [35]:
np.mean(y_test)

0.6818181818181818

In [36]:
mlp = MLPClassifier(activation='relu', 
                     max_iter=500, random_state = 0)

parameter_space = {
    'hidden_layer_sizes': [(10,5,2), (50,100,50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
search = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
search.fit(X_train_feat, y_train)
# Best parameters set
print('Best parameters found:\n', search.best_params_)

mlp.fit(X_train_feat,y_train)
predict_test = mlp.predict(X_test_feat)
accuracy_score(y_test,predict_test)

--- Logging error ---
--- Logging error ---
Traceback (most recent call last):
  File "/Users/amaribauer/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/backend/fork_exec.py", line 31, in close_fds
    os.close(i)
--- Logging error ---
Traceback (most recent call last):
  File "/Users/amaribauer/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/backend/fork_exec.py", line 31, in close_fds
    os.close(i)
Traceback (most recent call last):
OSError: [Errno 9] Bad file descriptor
  File "/Users/amaribauer/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/backend/fork_exec.py", line 31, in close_fds
    os.close(i)

During handling of the above exception, another exception occurred:

OSError: [Errno 9] Bad file descriptor
Traceback (most recent call last):
  File "/Users/amaribauer/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 1109, in _ensure_executor_running
    self._adjust_process_count()

During h

KeyboardInterrupt: 

In [45]:
means = search.cv_results_['mean_test_score']
means

array([0.65753629, 0.66502693, 0.65753629, 0.66502693, 0.65753629,
       0.66627071, 0.65753629, 0.66627071, 0.65753629, 0.66501761,
       0.65753629, 0.66501761, 0.65753629, 0.66503158, 0.65753629,
       0.66503158, 0.65753629, 0.65629251, 0.65753629, 0.65629251,
       0.65753629, 0.66501295, 0.65753629, 0.66501295, 0.65753629,
       0.66128161, 0.65753629, 0.66128161, 0.65753629, 0.66003783,
       0.65753629, 0.66003783, 0.65753629, 0.66751449, 0.65753629,
       0.66751449, 0.65753629, 0.66129092, 0.65753629, 0.66129092,
       0.65753629, 0.66253005, 0.65753629, 0.66253005, 0.65753629,
       0.66876759, 0.65753629, 0.66876759])

In [35]:
# Naive Bayes
model = GaussianNB() 
model.fit(X_train_tfidf,y_train)
win_pred = model.predict(X_test_tfidf)

print('Naive Bayes TFIDF')
print('accuracy_score:', accuracy_score(y_test,win_pred))
print('precision_score:', precision_score(y_test,win_pred))
print('f1_score:', f1_score(y_test,win_pred))

model = GaussianNB() 
model.fit(X_train_feat,y_train)
win_pred = model.predict(X_test_feat)

print('Naive Bayes Features')
print('accuracy_score:', accuracy_score(y_test,win_pred))
print('precision_score:', precision_score(y_test,win_pred))
print('f1_score:', f1_score(y_test,win_pred))

model = GaussianNB() 
model.fit(X_train_all,y_train)
win_pred = model.predict(X_test_all)

print('Naive Bayes All')
print('accuracy_score:', accuracy_score(y_test,win_pred))
print('precision_score:', precision_score(y_test,win_pred))
print('f1_score:', f1_score(y_test,win_pred))

Naive Bayes TFIDF
accuracy_score: 0.6368159203980099
precision_score: 0.6443298969072165
f1_score: 0.7739938080495357
Naive Bayes Features
accuracy_score: 0.6417910447761194
precision_score: 0.7021276595744681
f1_score: 0.7333333333333334
Naive Bayes All
accuracy_score: 0.6368159203980099
precision_score: 0.6443298969072165
f1_score: 0.7739938080495357


In [31]:
len(X_val_tfidf)

23

In [36]:
print(confusion_matrix(y_test,predict_test)) #lots of false negatives
print(classification_report(y_test,predict_test))

[[  0  72]
 [  0 129]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        72
         1.0       0.64      1.00      0.78       129

    accuracy                           0.64       201
   macro avg       0.32      0.50      0.39       201
weighted avg       0.41      0.64      0.50       201



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
