In [19]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix


In [73]:
def read_file(file_path):
    """Reads a text file and returns a list of lines."""
    with open(file_path, 'r') as file:
        lines = file.read().strip().split('\n')
    return lines

def preprocess_data(static_words_path, static_tags_path, static_candidates_path,
                    formal_words_path, formal_tags_path, formal_candidates_path, formal_labels_path):
    # Read data from files
    static_words = read_file(static_words_path)
    static_tags = read_file(static_tags_path)
    static_candidates = read_file(static_candidates_path)

    formal_words = read_file(formal_words_path)
    formal_tags = read_file(formal_tags_path)
    formal_candidates = read_file(formal_candidates_path)
    formal_labels = read_file(formal_labels_path)

    # Process static idioms - assign label 1 for all instances
    static_data = [{'words': words.lower().split(), 'tags': tags.split(),
                    'candidate': candidate, 'label': 1}
                   for words, tags, candidate in zip(static_words, static_tags, static_candidates)]

    # Process formal idioms
    formal_data = [{'words': words.lower().split(), 'tags': tags.split(),
                    'candidate': candidate, 'label': int(label)}
                   for words, tags, candidate, label in zip(formal_words, formal_tags, formal_candidates, formal_labels)]

    # Convert to DataFrames for ease of use
    static_df = pd.DataFrame(static_data)
    formal_df = pd.DataFrame(formal_data)

    return static_df, formal_df

# File paths for static idioms
static_base_path = 'EPIE_Corpus/Static_Idioms_Corpus/'
static_words_path = static_base_path + 'Static_Idioms_Words.txt'
static_tags_path = static_base_path + 'Static_Idioms_Tags.txt'
static_candidates_path = static_base_path + 'Static_Idioms_Candidates.txt'

# File paths for formal idioms
formal_base_path = 'EPIE_Corpus/Formal_Idioms_Corpus/'
formal_words_path = formal_base_path + 'Formal_Idioms_Words.txt'
formal_tags_path = formal_base_path + 'Formal_Idioms_Tags.txt'
formal_candidates_path = formal_base_path + 'Formal_Idioms_Candidates.txt'
formal_labels_path = formal_base_path + 'Formal_Idioms_Labels.txt'

# Preprocess and combine the data
static_df, formal_df = preprocess_data(static_words_path,
                                       static_tags_path,
                                       static_candidates_path,
                                       formal_words_path,
                                       formal_tags_path,
                                       formal_candidates_path,
                                       formal_labels_path)
print("Static data:")
print(static_df.head())
print("Formal data:")
print(formal_df.head())

Static data:
                                               words  \
0  [anyway, ,, thanks, mkm, and, keep, up, the, g...   
1  [well, done, steffi, —, keep, up, the, good, w...   
2  [please, use, it, as, a, reminder, to, ensure,...   
3         [please, do, keep, up, the, good, work, .]   
4  [we, hope, you, find, the, current, issue, sti...   

                                                tags              candidate  \
0  [O, O, O, O, O, B-IDIOM, I-IDIOM, I-IDIOM, I-I...  keep up the good work   
1  [O, O, O, O, B-IDIOM, I-IDIOM, I-IDIOM, I-IDIO...  keep up the good work   
2  [O, O, O, O, O, O, O, O, O, O, B-IDIOM, I-IDIO...  keep up the good work   
3  [O, O, B-IDIOM, I-IDIOM, I-IDIOM, I-IDIOM, I-I...  keep up the good work   
4  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-I...  keep up the good work   

   label  
0      1  
1      1  
2      1  
3      1  
4      1  
Formal data:
                                               words  \
0  [‘, you, know, ,, the, panda, who, ke

In [74]:
formal_df["label"].value_counts()

label
1    2761
0     375
Name: count, dtype: int64

In [75]:
print(375 / (375 + 2761))

0.11957908163265306


The formal dataset contains 12% literal uses and 88% idiomatic uses. The static dataset only contains idiomatic uses. For the first experiment, we will train a classifier on the formal dataset to determine whether a sentence is formal or idiomatic.

In [76]:
# Concatenate the words into a single string for each sample
formal_df['text'] = formal_df['words'].apply(lambda x: ' '.join(x))

In [77]:
X_formal = formal_df["text"]
y_formal = formal_df["label"]

# Split into training and test sets
X_train_formal, X_test_formal, y_train_formal, y_test_formal = train_test_split(X_formal, y_formal, test_size=0.2, random_state=42)

# Check the class distribution before resampling
print(pd.Series(y_train_formal).value_counts())
print()

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=10000)

# Initialize classifiers
rf_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(random_state=42)
lr_clf = LogisticRegression(random_state=42)

# Create a pipeline that first vectorizes the text and then oversamples the minority class before training
# Note: Oversampling and vectorization are done inside the cross-validation loop to avoid data leakage
rf_pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('oversample', RandomOverSampler(random_state=42)),
    ('classifier', rf_clf)
])

svm_pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('oversample', RandomOverSampler(random_state=42)),
    ('classifier', svm_clf)
])

lr_pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('oversample', RandomOverSampler(random_state=42)),
    ('classifier', lr_clf)
])

# Create a dictionary of pipelines and classifier types for ease of reference
pipelines = {
    "Random Forest": rf_pipeline,
    "SVM": svm_pipeline,
    "Logistic Regression": lr_pipeline
}

scoring_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

# Perform 5-fold cross-validation and print the results
for classifier_name, pipeline in pipelines.items():
    scores = cross_validate(pipeline, X_train_formal, y_train_formal, cv=5, scoring=scoring_metrics)
    print(f'{classifier_name} Scores:')
    for metric_name in scoring_metrics:
        metric_scores = scores[f'test_{metric_name}']
        print(f'{metric_name}: Mean={metric_scores.mean():.2f}, Std={metric_scores.std():.2f}')
    print("\n")

label
1    2209
0     299
Name: count, dtype: int64

Random Forest Scores:
accuracy: Mean=0.89, Std=0.00
precision_macro: Mean=0.83, Std=0.08
recall_macro: Mean=0.54, Std=0.01
f1_macro: Mean=0.54, Std=0.02


SVM Scores:
accuracy: Mean=0.89, Std=0.00
precision_macro: Mean=0.80, Std=0.04
recall_macro: Mean=0.53, Std=0.01
f1_macro: Mean=0.53, Std=0.02


Logistic Regression Scores:
accuracy: Mean=0.87, Std=0.02
precision_macro: Mean=0.68, Std=0.04
recall_macro: Mean=0.66, Std=0.03
f1_macro: Mean=0.67, Std=0.04




In [78]:
# Vectorize the data
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train_formal)
X_test_vectorized = vectorizer.transform(X_test_formal)

# Fit the models on the training set
rf_clf.fit(X_train_vectorized, y_train_formal)
svm_clf.fit(X_train_vectorized, y_train_formal)
lr_clf.fit(X_train_vectorized, y_train_formal)

# Predict on the test set
y_pred_rf = rf_clf.predict(X_test_vectorized)
y_pred_svm = svm_clf.predict(X_test_vectorized)
y_pred_lr = lr_clf.predict(X_test_vectorized)

# Calculate the confusion matrices
conf_matrix_rf = confusion_matrix(y_test_formal, y_pred_rf)
conf_matrix_svm = confusion_matrix(y_test_formal, y_pred_svm)
conf_matrix_lr = confusion_matrix(y_test_formal, y_pred_lr)

In [79]:
# Print the confusion matrices
print("Confusion Matrix for Random Forest:")
print(conf_matrix_rf)
print("\nConfusion Matrix for SVM:")
print(conf_matrix_svm)
print("\nConfusion Matrix for Logistic Regression:")
print(conf_matrix_lr)

# Note: The confusion matrix format is
# [[true_negative, false_positive],
#  [false_negative, true_positive]]

Confusion Matrix for Random Forest:
[[ 10  66]
 [  4 548]]

Confusion Matrix for SVM:
[[  3  73]
 [  1 551]]

Confusion Matrix for Logistic Regression:
[[  2  74]
 [  0 552]]


In [80]:
y_test_formal.value_counts()

label
1    552
0     76
Name: count, dtype: int64

All the models substantially underpredicted the negative samples. The random forest classifier predicted the most negative samples, but it still had a reasonably high error rate even for the ones it did predict as negative - only 10 out of the 14 samples it predicted as negative were actually negative.

In [81]:
y_pred_rf

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [82]:
y_test_formal[0]

1

In [83]:
# Extract false positives and false negatives for analysis
fp_rf = (y_pred_rf == 1) & (y_test_formal == 0)
fn_rf = (y_pred_rf == 0) & (y_test_formal == 1)

# Look at some of the false positives
print("Some false positive samples:")
print(X_test_formal.iloc[fp_rf].head())

# Look at some of the false negatives
print("Some false negative samples:")
print(X_test_formal.iloc[fn_rf].head())

Some false positive samples:


NotImplementedError: iLocation based boolean indexing on an integer type is not available

In [87]:
y_test_formal

1791    0
528     1
449     0
2994    1
3068    1
       ..
1037    1
1703    0
969     1
1288    1
2298    1
Name: label, Length: 628, dtype: int64

In [88]:
fp_rf.reset_index(drop=True)

0       True
1      False
2       True
3      False
4      False
       ...  
623    False
624    False
625    False
626    False
627    False
Name: label, Length: 628, dtype: bool

In [99]:
fp_rf

1791     True
528     False
449      True
2994    False
3068    False
        ...  
1037    False
1703    False
969     False
1288    False
2298    False
Name: label, Length: 628, dtype: bool

In [119]:
false_positives_rf = formal_df.iloc[fp_rf[fp_rf].index].drop(columns=["words", "tags", "label"], axis=1)
false_negatives_rf = formal_df.iloc[fn_rf[fn_rf].index].drop(columns=["words", "tags", "label"], axis=1)
false_negatives_rf

Unnamed: 0,candidate,text
2711,tie [pron] knot,stefan ties the knot
291,bad egg,he 's a bad egg .
325,be in hot water,"i 'm in hot water over various things — nothing important enough to bore you with , but i know the director wants shot of me — he 's said as much and i 'm sure my days are numbered ; i feel like i 'm struggling to keep my head above water — oh , i knew this particular pool was going to be deep , but he 's there waiting just above the surface to shove me under , finding some half-baked reason to be rid of me ."
2279,lose [pron] touch,"well away from the beaten track he laid her gently on a bed of moss and bracken , and she opened her arms to him , loath to lose his touch for even a second ."


In [149]:
def format_samples(samples_series):
    return "\n".join([f"- {i}: {sample['text']} ({sample['candidate']})" for i, sample in samples_series.iterrows()])

# Set the option to display the full content of the column
pd.set_option('display.max_colwidth', None)

# Print some of the false positives and false negatives for Random Forest
print("Some false positive samples for Random Forest:")
print(format_samples(false_positives_rf.head(n=10)))

print("\nSome false negative samples for Random Forest:")
print(format_samples(false_negatives_rf.head(n=10)))

Some false positive samples for Random Forest:
- 1791: adding fuel to the fire is deepa kaur , the daughter of rajah man singh , the maharajah 's late brother . (add fuel to [pron] fire)
- 449: paul bit his lip as he jogged at his father 's shoulder along a trail that wound through fringes of jungle beside the la nga river . (bite [pron] lip)
- 1146: so as i can take her for a ride and back . (take for [pron] ride)
- 567: candles were brought to the tables and any mishaps in the food or service were blamed on the lack of power , which lasted for three hours . (bring to [pron] table)
- 2922: keep them cool (keep [pron] cool)
- 2124: your partner standing behind you , pulls your leg towards him gently . (pull [pron] leg)
- 1814: the axeman wore the first beard of a boy and had big ears that stuck through his hair . (have big ears)
- 2285: ‘ good , ’ said caspar , meaning it , and fenella , still curled into her uncomfortable corner , crossed her fingers and tried not to disturb the shini

In [154]:
static_df["label"].value_counts()

label
1    21891
Name: count, dtype: int64

Taking a closer look at these samples, we learn some important pieces of information:
- Some of the samples are mislabeled. For example, index 1791 is an idiomatic usage of the phrase "add fuel to the fire," but the sentence is labeled `0`. Index 2279 is a literal usage of the phrase "lose one's touch," but it is labeled as `1`. Even though the labels for these samples are incorrect, `translated_sentences.txt` has the correct translations for them. According to the paper, "These labels are done by automatic systems with high accuracy." The paper does not mention the source of the translated sentences. The original sentences are taken from StringNet, which is a knowledge base containing n-grams extracted from the British National Corpus (http://nav.stringnet.org/about.php). From looking at the StringNet Navigator, it's unclear if the translated sentences could have come from there.
- For some of the samples, it's hard to tell if a particular phrase is used idiomatically or literally due to the lack of context. For example, the sample at index 2711 is simply "Stefan ties the knot." This could be referring to getting married, which would be the idiomatic sense of the phrase, or it could be referring to literally tying a knot (in a piece of rope or similar). Unfortunately, the dataset does not provide the context in which a sentence occurs, and going to the original source of the sentences (StringNet) does not provide any additional help in determining the context of a phrase, as mentioned above.

Due to these challenges, as well as the fact that the dataset contains a relatively small number of sentences with literal uses of phrases (375 negative samples of sentences with formal idioms; 2761 positive samples of sentences with formal idioms; 21891 samples of static idioms, all of which are positive), we will focus on sequence labeling for the remainder of this report. The dataset has more data that is suitable for sequence labeling than for idiom classification. Additionally, the original study that published the data discusses its potential utility in sequence labeling and trains a sequence labeling on the dataset, which can be used as a benchmark for our sequence labeling model.