In [214]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
import pandas as pd 
import jsonlines
from more_itertools import flatten
import glob

In [172]:
batches = glob.glob('../data/gpt_classification/gpt_classified_batch-*.jsonl')
data_dicts = list(map(lambda x: list(jsonlines.open(x)), batches))
data_dicts = list(flatten(data_dicts))

In [174]:
data = []
for d in data_dicts:
    data.append({
        'id': d['custom_id'],
        'response': d['response']['body']['choices'][0]['message']['content']
    })
response_df = pd.DataFrame(data)
response_df['response'] = response_df['response'].str.replace('.', '')

In [178]:
prompt_df = list(map(pd.read_csv, glob.glob('../data/gpt_classification/prompt_df-*.csv')))
prompt_df = pd.concat(prompt_df)

In [179]:
full_df = response_df.merge(prompt_df, left_on='id', right_on='article_url').drop(columns='article_url')

In [180]:
full_df['prompt'] = full_df['prompt'].str.split('```').str.get(1)

In [296]:
train_df, test_df = train_test_split(full_df, test_size=.1)

In [297]:
train_df['response'].value_counts()

response
No     286935
Yes     15386
Name: count, dtype: int64

In [298]:
pipe = Pipeline([
    ('cv', TfidfVectorizer(min_df=.01, max_df=.5, stop_words='english')),
    ('lr', LogisticRegressionCV())
])

In [299]:
bal_train_df = (train_df
     .pipe(lambda df: pd.concat([
        df.loc[lambda df: df['response']=='Yes'],
        df.loc[lambda df: df['response']=='No'].sample(n=len(df.loc[lambda df: df['response']=='Yes']))
    ]))
     .sample(frac=1)
)

In [300]:
pipe.fit(bal_train_df['prompt'], bal_train_df['response'])

In [321]:
keywords = [
    'council',
    'policy',
    'town board',
    'votes',
    'member',
    'supervisor',
    'local',
    'proposal',
    'ordinance',
    'resolution',
    'meeting',
    'agenda',
    'minutes',
    'public hearing',
    'public comment',
    'task force',
    'committee',
    'board',
    'commission',
    'zoning',
    'planning',
]

y_pred_keywords = test_df.pipe(lambda df: df['prompt'].str.contains('|'.join(keywords), case=False))

In [323]:
y_pred_keywords.value_counts()

prompt
True     18698
False    14894
Name: count, dtype: int64

In [319]:
import pickle
with open('../data/gpt_classification/trained_models/lr-model.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [301]:
y_prob = pipe.predict_proba(test_df['prompt'])
y_pred = pipe.predict(test_df['prompt'])

In [302]:
test_df['y_pred'] = y_pred

In [303]:
precision_score(test_df['response'], y_pred, pos_label='Yes')

0.4992276799505715

In [304]:
recall_score(test_df['response'], y_pred, pos_label='Yes')

0.9682444577591373

In [305]:
roc_auc_score(test_df['response'].map({'Yes': 1, 'No': 0}), y_prob[:, 1])

0.9881391125256143

In [306]:
import numpy as np 

In [307]:
f1s = []
y_true = test_df['response'].map({'No': 0, 'Yes': 1})
for i in np.arange(0, 1, .01):
    y_pred_i = y_prob[:, 1] > i
    f1 = f1_score(y_true, y_pred_i)
    f1s.append({
        'f1': f1,
        'threshold': i
    })

In [317]:
# pd.DataFrame(f1s).sort_values('f1', ascending=False).head(20)

In [309]:
f1_score(test_df['response'], y_pred, pos_label='Yes')

0.6587851610273135

In [190]:
test_df['response'].value_counts()

response
No     31864
Yes     1728
Name: count, dtype: int64

In [191]:
pd.Series(y_pred).value_counts()

No     30295
Yes     3297
Name: count, dtype: int64

In [192]:
test_df.pipe(lambda df: confusion_matrix(df['response'], df['y_pred']))

array([[30235,  1629],
       [   60,  1668]])

In [234]:
vocab_coef = pd.Series(
    pipe.steps[1][1].coef_[0],
    index=pd.Series(pipe.steps[0][1].vocabulary_).sort_values().index.tolist()
)

In [244]:
num_coef = 10
vocab_coef.sort_values().head(num_coef)

way          -8.554906
giving       -8.206600
deals        -7.874944
phase        -7.598685
boston       -7.534490
background   -7.341671
political    -7.296717
crucial      -7.209561
elect        -6.938660
designed     -6.863904
dtype: float64

In [245]:
vocab_coef.sort_values().tail(num_coef)

councilwoman     10.040401
hearing          10.042261
ordinance        12.430084
councilman       14.482254
voted            14.856055
commissioners    17.044607
approves         19.471978
approved         21.693522
meeting          28.377206
council          32.635435
dtype: float64

In [249]:
train_df['prompt'].str.len().sort_values().value_counts().sort_index()

prompt
14        14
23         1
27         7
28         4
29         4
          ..
64879      1
65066      4
74422      1
79070      1
100000    21
Name: count, Length: 10197, dtype: int64

In [250]:
train_df.loc[lambda df: df['prompt'].str.len() < 20]

Unnamed: 0,id,response,prompt
278540,"com,jaxdailyrecord)/news/2023/jan/23/com,jaxda...",No,More Like This
52364,"com,jaxdailyrecord)/news/2023/jan/23/com,jaxda...",No,More Like This
201192,"com,jaxdailyrecord)/news/2023/jan/23/com,jaxda...",No,More Like This
197307,"com,jaxdailyrecord)/news/2001/dec/11/com,jaxda...",No,More Like This
276045,"com,jaxdailyrecord)/news/2001/dec/11/com,jaxda...",No,More Like This
278539,"com,jaxdailyrecord)/news/2023/jan/23/com,jaxda...",No,More Like This
278538,"com,jaxdailyrecord)/news/2023/jan/23/com,jaxda...",No,More Like This
52363,"com,jaxdailyrecord)/news/2023/jan/23/com,jaxda...",No,More Like This
201191,"com,jaxdailyrecord)/news/2023/jan/23/com,jaxda...",No,More Like This
197308,"com,jaxdailyrecord)/news/2001/dec/11/com,jaxda...",No,More Like This


In [193]:
(test_df
     .loc[lambda df: (df['response'] == 'No') & (df['y_pred'] == 'Yes')]
     .pipe(lambda df: print(df['prompt'].iloc[2]))
)

To the editor:



On behalf of the Alexandria Chamber of Commerce Board of Directors, our nearly 900 members and the Alexandria business community, we sincerely thank the mayor, city council members and city staff for their excellent work on the fiscal year 2012 budget.



We appreciate how difficult the city councils decisions were during the budget process and commend them for the very hard work they have done. Choosing not to implement the commercial add-on tax sends a clear message to our region that Alexandria wants to attract and retain business. Our council chose to do the right thing for Alexandria and did not simply follow the lead of our neighbors. Indeed, we can now be known as a safe haven for businesses in Northern Virginia.



We still have work to do to ensure that we select, implement and continue to fund the most impactful transportation projects in Alexandria. We take that responsibility seriously and look forward to working with the city government in this regard.




# Try Transformers

In [194]:
from transformers import AutoTokenizer

In [195]:
tok = AutoTokenizer.from_pretrained('bert-base-uncased')



In [None]:
from happytransformer import HappyTextClassification, TCTrainArgs

# happy_tc_albert = HappyTextClassification(model_type="ALBERT", model_name="albert-base-v2")
# happy_tc_bert = HappyTextClassification("BERT", "bert-base-uncased")
# happy_tc_roberta = HappyTextClassification("ROBERTA", "roberta-base")
# happy_tc_private_roberta = HappyTextClassification("ROBERTA", "user-repo/roberta-base", use_auth_token="123abc")
distilbert_model = HappyTextClassification("DISTILBERT", "distilbert-base-uncased", num_labels=2)

In [213]:
(bal_train_df
 .loc[lambda df: df['response']=='Yes']
 .iloc[3]
 ['prompt']
)

'The Raleigh-based hotel company that planned to incorporate part of a historic hospital building on West Main Street into a new hotel has made a request for city and county incentives for the project.\n\nConcord Hospitality Enterprises Co. got approval from the Board of Adjustment in August 2011 for a minor special-use permit for the hotel project. A company official had said in a previous interview with The Herald-Sun that construction was targeted to begin by the end of last year’s first quarter.\n\nKevin McAteer, vice president of sales and marketing for Concord, said in an email that the company is finishing the design and hopes to bid out to the project this summer, and to begin building before the year’s end. Attempts to get additional information from Concord officials about the incentive request were not successful this week.\n\nMarqueta Welton, deputy county manager, said an initial request was made for $5 million in both city and county incentives. She said that according to

In [252]:
(
    bal_train_df
         .rename(columns={'prompt': 'text', 'response': 'label'})
         [['text', 'label']]
         .assign(label=lambda df: df['label'].map({'Yes': 1, 'No': 0}))
         .to_csv('../data/gpt_classification/trained_models/training-df.csv')
)

In [None]:
args = TCTrainArgs(num_train_epochs=3, output_dir='../data/gpt_classification/trained_models/')
distilbert_model.train("../data/gpt_classification/trained_models/training-df.csv", args=args)

In [149]:
import os 
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [151]:
# mv ../data/gpt_classification/training-df.csv ../data/gpt_classification/trained_models/

In [270]:
tok = AutoTokenizer.from_pretrained('distilbert-base-uncased')
tok = AutoTokenizer.from_pretrained('roberta-base')



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

In [271]:
from tqdm.auto import tqdm
tqdm.pandas()
(
    test_df[['prompt']]
         .assign(prompt=lambda df: df['prompt'].progress_apply(tok.encode).apply(lambda x: x[:505]).progress_apply(tok.decode))
         .rename(columns={'prompt': 'text'})
         .to_csv('../data/gpt_classification/trained_models/test-df-for-ht-roberta.csv', index=False)
)

  0%|          | 0/33592 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (876 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/33592 [00:00<?, ?it/s]

In [257]:
test_df[['id', 'prompt', 'response']].to_csv('../data/gpt_classification/trained_models/test-df.csv')

In [166]:
r = distilbert_model.test('../data/gpt_classification/trained_models/test-df.csv')

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3114/3114 [04:48<00:00, 10.81it/s]
