In [2]:
import numpy as np
import pandas as pd
import re
from convokit import Corpus, download
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression

In [3]:
# Read the two CSV inputs in order: `utts` is the table that later cells group
# by 'meta.case_id' (reading its 'text' and 'conversation_id' columns), and
# `roberts` supplies the 'id' / 'win_side' columns joined onto it.
utts, roberts = (pd.read_csv(csv_path) for csv_path in ('data/utts.csv', 'data/subset.csv'))

In [52]:
# --- Drop cases that consist of a single utterance ---------------------------
# Number of utterance rows recorded for each case id.
utt_counts = (
    utts.groupby('meta.case_id')
        .size()
        .rename('utt_counts')
        .reset_index()
)
# Remove any 'utt_counts' column left by a previous run of this cell so the
# join key is unambiguous and the cell can be re-run safely.
utts = (
    utts.drop(columns='utt_counts', errors='ignore')
        .merge(utt_counts, how='left', on='meta.case_id')
)
# .copy() makes the filtered frame independent, so the column assignment below
# writes to `utts` itself rather than to a view of the unfiltered frame.
utts = utts.loc[utts['utt_counts'] != 1, :].copy()

# --- Text cleaning -----------------------------------------------------------
# Strip digit runs from the utterance text (open question for the group: does
# removing numbers make sense?). The result is assigned back to the column:
# calling replace(..., inplace=True) on `utts['text']` is chained mutation,
# which warns on current pandas and has no effect under Copy-on-Write.
utts['text'] = utts['text'].replace(r'\d+', '', regex=True)
#NOTE: No stemming or lemmatization done at this point

# --- Conversations-per-case feature ------------------------------------------
# Code to create conversations/case feature: distinct conversation ids per case,
# indexed by 'meta.case_id'.
convo_case = (
    utts.groupby('meta.case_id')['conversation_id']
        .nunique()
        .to_frame('convo_count')
)

# --- One document per case, joined to the outcome label ----------------------
# Concatenate every utterance of a case into a single space-separated string.
group_utts = utts.groupby('meta.case_id')['text'].apply(' '.join)
# NOTE(review): how='left' keeps cases that have no match in `roberts`; such
# rows would carry NaN in 'id' and 'win_side' -- confirm every case is matched.
df = pd.merge(group_utts, roberts[['id', 'win_side']], how='left',
              left_on='meta.case_id', right_on='id')

# --- Train / test split ------------------------------------------------------
# Features are every column except the label; 20% held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='win_side'), df['win_side'], test_size=0.2, random_state=0)

#Plotting and Parameter tuning lists
ngrams = [(1,1),(1,2),(2,2),(1,3),(3,3)]
# NOTE(review): recent scikit-learn releases may reject an integer min_df of 0;
# confirm before sweeping CountVectorizer's min_df over the first value here.
df_min = range(0,8)
df_max = range(4,9)

In [54]:
# Sweep CountVectorizer's max_df over `df_max`. For each value: build TF-IDF
# features from the case text, append the per-case conversation count, fit a
# logistic regression on the training split and print accuracy, precision and
# F1 on the test split.
#
# Integer min_df / max_df values are absolute document counts, so with
# min_df = 4 the vocabulary is limited to terms found in 4..val training cases.
# NOTE(review): confirm that this narrow band is intended (a proportion such as
# 0.8 may have been meant for max_df).
for val in df_max:  # alternative sweep: `for val in df_min` -- min_df = 4 is highest
    count_vect = CountVectorizer(ngram_range = (1,1), min_df = 4, max_df = val,
                stop_words = 'english', token_pattern = r'\b[a-zA-Z]{3,}\b')
    count_train = count_vect.fit_transform(X_train['text'])

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # use its replacement when available and fall back on older installs.
    # Resolved once per iteration instead of once per DataFrame.
    if hasattr(count_vect, 'get_feature_names_out'):
        feature_names = count_vect.get_feature_names_out()
    else:
        feature_names = count_vect.get_feature_names()

    # Training features: raw counts -> TF-IDF weights, one row per case id.
    count_array = count_train.toarray()
    count_df = pd.DataFrame(count_array, columns = feature_names, index = X_train['id'])
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(count_df).toarray()
    X_train_tfidf = pd.DataFrame(X_train_tfidf, columns = feature_names, index = X_train['id'])

    #MERGING IN ADDITIONAL FEATURES HERE
    # Index-on-index left join: case id -> 'convo_count'.
    X_train_tfidf = pd.merge(X_train_tfidf, convo_case, how = 'left', left_index = True, right_index = True)

    # Test features: reuse the vocabulary and IDF weights fitted on the training split.
    count_test = count_vect.transform(X_test['text'])
    count_test_array = count_test.toarray()
    count_test_df = pd.DataFrame(count_test_array, columns = feature_names, index = X_test['id'])

    X_test_tfidf = tfidf_transformer.transform(count_test_df).toarray()
    X_test_tfidf = pd.DataFrame(X_test_tfidf, columns = feature_names, index = X_test['id'])
    X_test_tfidf = pd.merge(X_test_tfidf, convo_case, how = 'left', left_index = True, right_index = True)

    # model = GaussianNB()  # alternative classifier
    model = LogisticRegression()
    model.fit(X_train_tfidf, y_train)
    win_pred = model.predict(X_test_tfidf)

    print(' ')
    print(accuracy_score(y_test, win_pred), 'max_df:', val)
    # Author's note: this accuracy is notably worse than the "just say the
    # petitioner will win every time" baseline, which is right about 65% of
    # the time.
    print(precision_score(y_test, win_pred))
    print(f1_score(y_test, win_pred))

 
0.624390243902439 max_df: 4
0.6274509803921569
0.7687687687687688
 
0.624390243902439 max_df: 5
0.6274509803921569
0.7687687687687688
 
0.624390243902439 max_df: 6
0.6274509803921569
0.7687687687687688
 
0.624390243902439 max_df: 7
0.6274509803921569
0.7687687687687688
 
0.624390243902439 max_df: 8
0.6274509803921569
0.7687687687687688


In [38]:
# Mean of the held-out labels. Assuming win_side is coded 0/1 (TODO confirm),
# this is the share of class-1 cases, i.e. the accuracy of always predicting 1.
y_test.mean()

0.6292682926829268

0.6390243902439025
0.6424870466321243
0.7701863354037267


[range(0, 8)]