In [4]:
from glob import glob #gets list of files from directory
import os
import re

# Collect every .txt path under the training corpus directory (searched
# recursively), then sort the list in place so iteration order is stable.
train = glob('nyt_archive_data/train/**/*.txt', recursive=True)
train.sort()

In [2]:
from collections import Counter
def words(txt):
    """Return the tokens found in ``txt`` after lowercasing it.

    A token is a maximal run of the characters a-z, 0-9 and the apostrophe;
    every other character acts as a separator.
    """
    lowered = txt.lower()
    return re.findall("[a-z0-9']+", lowered)

# Per-author document frequency: word_count[author][word] is the number of
# that author's files in which `word` occurs at least once.
word_count = {}

for f in train:
    # Context manager so each file handle is closed as soon as it is read.
    with open(f, 'r', encoding='utf-8') as file:
        # Raw string: '\s' inside a plain literal is an invalid escape sequence.
        txt = re.sub(r'\s+', ' ', file.read().lower())
    # The author is the name of the directory that directly contains the file.
    author = os.path.normpath(f).split(os.sep)[-2]
    # A set makes every word count once per file, however often it repeats.
    unique_words = set(words(txt))
    word_count.setdefault(author, Counter()).update(unique_words)

In [3]:
# Fold the per-author counters into one corpus-wide counter.
total_vocab = Counter()
for author, author_counts in word_count.items():
    total_vocab += author_counts
# Displayed value: the number of distinct words across all authors.
len(total_vocab)

31895

In [6]:
import pandas as pd
import numpy as np
# Load the two CSV resources used by the rest of the notebook.
pet_df = pd.read_csv('PETS-284-variations.csv', encoding="utf-8")
euph_df = pd.read_csv('Euphemism_Corpus.csv', encoding="utf-8")
# The chained assignments this replaces left `df` bound to the corpus frame;
# keep that alias so anything still referring to `df` sees the same object.
df = euph_df

In [5]:
pet_df

Unnamed: 0,euphemism,type,real_meaning,category,source
0,a certain age,a certain age,old,physical/mental attributes,"Kapron-King, A., OED"
1,able-bodied,able-bodied,non disable,physical/mental attributes,-
2,accident,accident,peeing your pants,bodily functions,-
3,accident,accident,tragic event,misc.,-
4,addict,addict,drug user,substances,-
...,...,...,...,...,...
279,well-fed,well-fed,fat,physical/mental attributes,-
280,went to a better place,went to a better place,died,death,-
281,went to be with the lord,went to be with the lord,died,death,"Heerema, Esther"
282,went to heaven,go to heaven,died,death,"Heerema, Esther"


In [6]:
euph_df

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status
0,tinkle,We're just getting back what was TAKEN from us...,1,bodily functions,tinkle,always_euph
1,tinkle,I think AB390 will pass next year now that the...,1,bodily functions,tinkle,always_euph
2,undocumented immigrants,"Singled Out Think Like a Man, the new movie ba...",1,politics,undocumented immigrant,always_euph
3,undocumented immigrants,"Not to be outdone, Sen. Rand Paul (R-Ky. ), so...",1,politics,undocumented immigrant,always_euph
4,undocumented immigrants,The law has also galvanized the growing immigr...,1,politics,undocumented immigrant,always_euph
...,...,...,...,...,...,...
1960,sleep with,There were other photos she wanted me to see: ...,0,sexual activity,sleep with,sometimes_euph
1961,sleep with,I am relieved to see two pup tents marked STAF...,0,sexual activity,sleep with,sometimes_euph
1962,sleep around,"Nothing serious, just long nights of me hackin...",0,sexual activity,sleep around,sometimes_euph
1963,with child,sounds more like Jonestown. They cant leave @ ...,0,physical/mental attributes,with child,sometimes_euph


In [7]:
# One record per training file: author, title and whitespace-normalised,
# lowercased text.
data = []

for f in train:
    # Read the file, lowercase it and collapse every whitespace run to a
    # single space. Raw string: '\s' in a plain literal is an invalid escape.
    with open(f, 'r', encoding='utf-8') as file:
        txt = re.sub(r'\s+', ' ', file.read().lower())
    # Author = name of the directory that directly contains the file.
    author = os.path.normpath(f).split(os.sep)[-2]
    # Title = file name without its extension.
    title = os.path.splitext(os.path.basename(f))[0]
    data.append({'author': author, 'title': title, 'text': txt})

# Name the columns explicitly so the frame keeps its schema even when no
# files were found (an empty list would otherwise give a column-less frame).
train_df = pd.DataFrame(data, columns=['author', 'title', 'text'])

In [8]:
import pandas as pd
import re
# Distinct values of the 'euphemism' column, in order of first appearance.
euphemisms = pet_df['euphemism'].drop_duplicates().tolist()

def find_euphemism_sentences(text, author, title, euphemism_terms=None):
    """Find the sentences of ``text`` that contain a known euphemism.

    The text is split into sentences on runs of '.', '!' and '?'. Each
    sentence is checked against the terms in list order, as a whole-word,
    case-insensitive match; only the first matching term is recorded for a
    sentence.

    Parameters
    ----------
    text : str
        Document text to scan.
    author, title :
        Values copied unchanged into every returned record.
    euphemism_terms : iterable of str, optional
        Terms to search for. Defaults to the notebook-level ``euphemisms``
        list, which keeps existing three-argument calls working.

    Returns
    -------
    list of dict
        One dict per matching sentence with the keys 'sentence' (stripped),
        'author', 'title' and 'pet' (the matched term).
    """
    if euphemism_terms is None:
        euphemism_terms = euphemisms

    # Compile each pattern once per call instead of once per sentence.
    # Non-string entries (e.g. NaN from a CSV) and empty strings are skipped:
    # the former cannot be escaped, the latter would match almost anything.
    compiled_terms = [
        (term, re.compile(r'\b' + re.escape(term) + r'\b', flags=re.IGNORECASE))
        for term in euphemism_terms
        if isinstance(term, str) and term
    ]

    matches = []
    for sentence in re.split(r'[.!?]+', text):  # Split text into sentences
        for term, pattern in compiled_terms:
            if pattern.search(sentence):
                matches.append({
                    'sentence': sentence.strip(),
                    'author': author,
                    'title': title,
                    'pet': term
                })
                break  # Record only the first matching term per sentence
    return matches

# Run the sentence finder over every article and pool the matches.
results = []
for article_text, article_author, article_title in zip(
    train_df['text'], train_df['author'], train_df['title']
):
    results.extend(
        find_euphemism_sentences(article_text, article_author, article_title)
    )

# One row per matched sentence.
results_df = pd.DataFrame(results)

In [9]:
results_df

Unnamed: 0,sentence,author,title,pet
0,"in the late 1960's, the army used to solicit f...",ac,the-russians-coming-in-a-nonexistent-tank,late
1,"the t-64 and t-72 tanks, revealed to western e...",ac,the-russians-coming-in-a-nonexistent-tank,late
2,"on the other hand, it did look very much like ...",ac,the-russians-coming-in-a-nonexistent-tank,troubled
3,he admitted that united states intelligence ha...,ac,the-russians-coming-in-a-nonexistent-tank,did it
4,"* since january, thousands of east germans hav...",ah,east-gemrman-dissent,outspoken
...,...,...,...,...
315,", after a more experienced guide has guided th...",ws,essay-the-nsc-after-clark,experienced
316,richard burt has his hands full getting conser...,ws,essay-the-nsc-after-clark,outspoken
317,workers who priced themselves and their indust...,ws,essay-the-recession-speaks,laid off
318,"then let him go to jerusalem, without precondi...",ws,essay-the-ultimate-settlement,let him go


In [10]:
# Save the matched sentences to CSV. No `index` argument is given, so the
# DataFrame index is written as an extra leading column (pandas default).
results_df.to_csv('archive_euphemism_sentences.csv')

In [15]:
# NOTE(review): this rebinds results_df to the contents of
# 'train_euphemism_sentences.csv', not the 'archive_euphemism_sentences.csv'
# file written in the previous cell. Nothing visible in this notebook creates
# that file -- confirm it exists and is the intended input for the cells below.
results_df= pd.read_csv('train_euphemism_sentences.csv')

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Learn a TF-IDF vocabulary (English stop words removed, at most 1000
# features) from the labelled corpus and vectorise it in the same step.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train = vectorizer.fit_transform(euph_df['edited_text'])

# Two separate targets are learned from the same feature matrix.
y_train_category = euph_df['category']
y_train_status = euph_df['euph_status']

# One logistic-regression classifier per target; fit() returns the estimator.
model_category = LogisticRegression(random_state=42).fit(X_train, y_train_category)
model_status = LogisticRegression(random_state=42).fit(X_train, y_train_status)

# Vectorise the extracted sentences with the already-fitted vocabulary.
X_results = vectorizer.transform(results_df['sentence'])

# Attach both predictions to results_df as new columns.
for column_name, classifier in (
    ('predicted_category', model_category),
    ('predicted_status', model_status),
):
    results_df[column_name] = classifier.predict(X_results)

In [17]:
import nltk
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources the code below needs: the perceptron tagger used
# by nltk.pos_tag and the Punkt models that word_tokenize relies on. Without
# Punkt, tokenising raises LookupError on a machine that has never
# downloaded it.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead -- confirm against the installed
# version.
for nltk_resource in ('averaged_perceptron_tagger', 'punkt'):
    nltk.download(nltk_resource)

# Assuming you have a DataFrame 'results_df' with a column 'sentence'
def pos_tag_sentences(sentence):
    """Tokenize ``sentence`` with ``word_tokenize`` and return the result of
    ``nltk.pos_tag`` on those tokens (a sequence of (token, tag) pairs)."""
    return nltk.pos_tag(word_tokenize(sentence))

# Tag every sentence; each 'pos_tags' cell holds that sentence's
# (token, tag) pairs.
results_df['pos_tags'] = results_df['sentence'].apply(pos_tag_sentences)

# Expand the pos_tags into one row per token, carrying the sentence-level
# fields along.
all_tags = [
    {
        'author': row['author'],
        'sentence': row['sentence'],
        'pet': row['pet'],  # the euphemism matched in this sentence
        'word': word,
        'pos_tag': tag,
    }
    for _, row in results_df.iterrows()
    for word, tag in row['pos_tags']
]

# Explicit columns keep the schema intact even when there are no tokens.
tags_df = pd.DataFrame(
    all_tags, columns=['author', 'sentence', 'pet', 'word', 'pos_tag']
)

# Flag tokens equal to the sentence's euphemism, ignoring case. The
# comparison is vectorised rather than a row-wise apply.
# NOTE(review): a single token can never equal a multi-word euphemism
# (e.g. 'laid off'), so sentences matched on such terms yield no flagged
# rows -- confirm whether that is intended.
tags_df['is_euphemism'] = tags_df['word'].str.lower() == tags_df['pet'].str.lower()
euphemism_tags = tags_df[tags_df['is_euphemism']]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\yasmi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [18]:
# Keep only the tokens flagged as the euphemism and drop the helper columns.
# Deriving from euphemism_tags (the same filter, computed in the previous
# cell) makes this cell safe to re-run: filtering tags_df in place would
# raise KeyError the second time, because 'is_euphemism' is dropped here.
tags_df = euphemism_tags.drop(columns=['is_euphemism', 'word'])

In [21]:
tags_df

Unnamed: 0,author,sentence,pet,pos_tag
14,bs,if she winds up as her party's default nominee...,late,RB
38,bs,""" some readers may be inclined to dismiss this...",troubled,JJ
82,bs,because of the delta variant's vigorous attack...,experienced,VBN
128,bs,"the florida governor did, in fact, do well in ...",elderly,JJ
186,bs,""" of these, nearly two-thirds come from econom...",disadvantaged,VBN
...,...,...,...,...
21791,zt,protocols should allow the elderly to interact...,elderly,JJ
21822,zt,it is too late to fix everything for this elec...,late,JJ
21839,zt,"i never do, maybe because i discovered it so late",late,RB
21860,zt,i had expected some of what i encountered -- i...,expecting,VBG


In [22]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model.
model = SentenceTransformer('stsb-roberta-base')

# Generate one embedding per corpus text. encode() is documented for a string
# or a list of strings, so pass a plain list instead of a pandas Series
# (a Series only works while its index happens to be 0..n-1).
embeddings = model.encode(euph_df['edited_text'].tolist())

from sklearn.manifold import TSNE

# Project the sentence embeddings down to two dimensions for plotting;
# the fixed random_state seeds t-SNE.
tsne = TSNE(random_state=0, n_components=2)
reduced_embeddings = tsne.fit_transform(embeddings)

import matplotlib.pyplot as plt

# One label per corpus row, taken from the 'is_euph' column (truthy values
# are drawn red, falsy values blue).
labels = euph_df['is_euph'].tolist()
colors = ['red' if label else 'blue' for label in labels]

# Scatter the 2-D projection, coloured by label, on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=colors)
ax.set_title('Semantic Space of Sentences')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
# NOTE(review): every point is annotated with its full text; with a large
# corpus this is slow to draw and hard to read -- consider annotating only a
# sample or truncating the text.
for i, text in enumerate(euph_df['edited_text']):
    ax.annotate(text, (reduced_embeddings[i, 0], reduced_embeddings[i, 1]))
plt.show()


ModuleNotFoundError: No module named 'sentence_transformers'