<a href="https://colab.research.google.com/github/aliang9/nlpfa23/blob/main/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import csv
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import random
import re
import string
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def remove_punctuation(input_string):
    punc = '''‘’“”!()-[]{};:'"\,<>./?@#$%^&*_~！？……。…～「⋯⋯⋯，（）：」『』．'''
    for ele in input_string:
        if ele in punc:
            input_string = input_string.replace(ele, "")
    return input_string

def read_csv(file_path):
    setups = []
    punchlines = []

    # read csv file
    with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
        csv_reader = csv.reader(csvfile)

        for row in csv_reader:
            if len(row) >= 2:
                punchline = re.sub(r'\s+', ' ', row[0].replace('\n', ' ').replace('\t', ''))
                punchlines.append(remove_punctuation(punchline))
                setup = re.sub(r'\s+', ' ', row[1].replace('\n', ' ').replace('\t', ''))
                setups.append(remove_punctuation(setup))

    return punchlines, setups

# Word2Vec Baseline Model for Chinese Jokes

In [None]:
dataset_path = '/content/drive/MyDrive/NLPProject/Data Preprocessing/updated-chinese-jokes.csv'
joke_punchlines, joke_setups = read_csv(dataset_path)
joke_punchlines = joke_punchlines[1:]
joke_setups = joke_setups[1:]
print(joke_setups[:10])
print(joke_punchlines[:10])

['員工自來水公司台電天然氣公司', '正好一個賣菜的阿伯從此經過順口而出查無此人', '老闆我咧娘', '他回答說證據不足', '頓時世界安靜了', '那乞丐笑笑道丫不知怎麼滴最近生意特別好所以開了家分公司', '求職者走到門外對其他等待面試的人說你們可以回去了我已經得到了這份工作沒你們的事了', '他想了想說就是高利貸亂收費和拉皮條老闆聽後豁然開朗', '老闆我不給你加薪你也別走', '沒關係你可以去下面找他啊']
['員工老闆您必須幫我加薪已經有三家公司在找我了 老闆哪三家', '某市政府辦公大樓落成門口缺副對聯 副市長揮毫 上聯說實話辦實事一身正氣 下聯不貪污不受賄兩袖清風 各個局處首長看後齊聲喝采 考慮到民主副市長讓各個局處首長一起出個橫批大家你看我我看你都不開口', '中午老闆視察自己的建築工地時發現有個人在角落玩手機 老闆你月薪多少 那人答二萬二 老闆掏出錢包數出二萬二再加遣散費共三萬元給他並大聲吼道拿著這個月的薪水馬上離開 那人走後餘怒未消的老闆問旁邊工人他是哪個部門的 工人小小聲回答他他是來送便當的', '一天一位法官的女友看見兩個蚊子便叫法官打死 只見法官只把那個肚子飽飽的蚊子打死了卻對那隻肚子乾癟的蚊子遲遲不下手 女友問他為什麼不把那隻蚊子也打死', '辦公室中兩位女同事吵起來了 經理忍無可忍太不像話了現在是什麼情況你們把原因給我說清楚 兩人一聽又爭先恐後各執一詞吵成一團 經理大吼一聲夠了胖的先講', '某條街有個乞丐每天都在街旁向路人乞討 某日乞丐身邊多了一個碗可是卻又沒人看著 小明感到好奇便上前問了那乞丐為什麼你放兩個碗', '一位求職者在專長一欄中填上造謠 面試官不信任地說你造一次謠給我們看看', '一位銀行經理去洗車 洗車店老闆非常好奇地向他問到你們銀行是怎麼賺錢的 他立即回答主要是靠授信類業務中間業務和資產類業務三大板塊實現的 老闆一臉疑惑要求他通俗的解釋一下', '員工老闆我要加薪不然我就辭職 老闆有話好好說你看我們倆都各退一步行不行 員工怎麼退', '小王在10樓人事部門工作一個月前被調到9樓行政部門去了 今天小王同學打電話到人事部門找他小王在麼 接電話同事說小王已不在人事了 小王同學啊啊什麼時候的事啊我怎麼不知道啊還沒來得及送他呢']


In [None]:
# train Word2Vec model

def train_word2vec_model(joke_setups, joke_punchlines):
    corpus = joke_setups + joke_punchlines
    tokenized_corpus = [list(sentence) for sentence in corpus]
    print(tokenized_corpus)
    model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)
    return model

# evaluate Word2Vec model
def evaluate_word2vec_model(model, joke_setups, joke_punchlines):
    correct_labels = []
    predicted_labels = []
    i = 0

    for setup, correct_punchline in zip(joke_setups, joke_punchlines):
        choices = random.sample(joke_punchlines, 3)
        choices.append(remove_punctuation(correct_punchline))
        random.shuffle(choices)
        similarity_scores = [np.mean([model.wv.similarity(word_setup, word_punchline) for word_setup in list(setup) for word_punchline in list(punchline)]) for punchline in choices]
        predicted_punchline = choices[np.argmax(similarity_scores)]
        correct_labels.append(remove_punctuation(correct_punchline))
        predicted_labels.append(remove_punctuation(predicted_punchline))

        #print examples
        if i <= 100:
          print("Example:")
          print("Joke Setup:", setup)
          print("Correct Punchline:", correct_punchline)
          print("Predicted Punchline:", predicted_punchline)
          print(similarity_scores, choices)
          print("--------------------")
          i += 1

    # calculate evaluation metrics
    accuracy = accuracy_score(correct_labels, predicted_labels)
    precision = precision_score(correct_labels, predicted_labels, average='weighted')
    recall = recall_score(correct_labels, predicted_labels, average='weighted')
    f1 = f1_score(correct_labels, predicted_labels, average='weighted')

    return accuracy, precision, recall, f1

In [None]:
word2vec_model = train_word2vec_model(joke_setups, joke_punchlines)

accuracy, precision, recall, f1 = evaluate_word2vec_model(word2vec_model, joke_setups, joke_punchlines)

# Print evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

[['員', '工', '自', '來', '水', '公', '司', '台', '電', '天', '然', '氣', '公', '司'], ['正', '好', '一', '個', '賣', '菜', '的', '阿', '伯', '從', '此', '經', '過', '順', '口', '而', '出', '查', '無', '此', '人'], ['老', '闆', '我', '咧', '娘'], ['他', '回', '答', '說', '證', '據', '不', '足'], ['頓', '時', '世', '界', '安', '靜', '了'], ['那', '乞', '丐', '笑', '笑', '道', '丫', '不', '知', '怎', '麼', '滴', '最', '近', '生', '意', '特', '別', '好', '所', '以', '開', '了', '家', '分', '公', '司'], ['求', '職', '者', '走', '到', '門', '外', '對', '其', '他', '等', '待', '面', '試', '的', '人', '說', '你', '們', '可', '以', '回', '去', '了', '我', '已', '經', '得', '到', '了', '這', '份', '工', '作', '沒', '你', '們', '的', '事', '了'], ['他', '想', '了', '想', '說', '就', '是', '高', '利', '貸', '亂', '收', '費', '和', '拉', '皮', '條', '老', '闆', '聽', '後', '豁', '然', '開', '朗'], ['老', '闆', '我', '不', '給', '你', '加', '薪', '你', '也', '別', '走'], ['沒', '關', '係', '你', '可', '以', '去', '下', '面', '找', '他', '啊'], ['經', '理', '遇', '到', '了', '壞', '人', '你', '照', '一', '下', '自', '己', '的', '臉'], ['到', '了', '月', '尾', '就', '發', '愁', '不', '知', '

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Example:
Joke Setup: 維埃裡傷心的對兒子說不兒子你長大了還是當一名裁判吧因為再精彩的進球如果被判成越位的話也只能認倒霉
Correct Punchline: 維埃裡的兒子對維埃裡說爸爸我以後也要當你那樣的足球明星
Predicted Punchline: 一位婦人抱著BABY到一間婦產科 醫生問婦人說BABY是吃母乳還是牛乳啊 婦人吃母乳 醫生那請你把衣服脫下來 婦人啊為什麼 醫生請你不用緊張這裡是婦產科絕不會對你有任何侵犯的 婦人半信半疑的脫去了上衣醫生用他的手在婦人的胸部上摸摸下摸摸左搓搓右揉揉對這婦人說難怪BABY會營養不良妳根本就沒有母乳嘛
[0.48596573, 0.47203234, 0.46627292, 0.36866555] ['一位婦人抱著BABY到一間婦產科 醫生問婦人說BABY是吃母乳還是牛乳啊 婦人吃母乳 醫生那請你把衣服脫下來 婦人啊為什麼 醫生請你不用緊張這裡是婦產科絕不會對你有任何侵犯的 婦人半信半疑的脫去了上衣醫生用他的手在婦人的胸部上摸摸下摸摸左搓搓右揉揉對這婦人說難怪BABY會營養不良妳根本就沒有母乳嘛', '維埃裡的兒子對維埃裡說爸爸我以後也要當你那樣的足球明星', '有11人遇到船難直到船沉了才緊急被直升機用繩索救離 11人都一起掛在一台直升機的繩索上共有十個男人和一個女人 由於繩子無法支撐所有人的重量直昇機快摔機了因此他們討論一定要有一個人放開手否則所有人都將完蛋 這些男人無法決定誰該放手 直到那女人感人肺腑的說著我當然會自願放手因為我們女人都習慣了為了小孩和先生放棄所有一切把所有的都給男人而不求回報真的 ', '某男初通英文至使館有表要填有一欄是sex 該男思之久已毅然下筆once a week 簽證官觀後大笑曰this item should be filed in with male or female 該男頓時赧顏思之填下female 官愣之曰Shouldnt be male']
--------------------
Example:
Joke Setup: 母雞回答炸雞咖裡雞白斬雞燒雞烤雞香菇雞土窯雞
Correct Punchline: 小雞問母雞為什麼人類都有名字而我們全都叫做雞 母雞回答人活著的時候都有名字但死了也全就叫鬼呀我們雞活著時雖沒有名字但死了就有很多名字了 小雞開心的問叫

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.3807461692205197
Precision: 0.2515199621036683
Recall: 0.3807461692205197
F1 Score: 0.28505870218125196


# Word2Vec Baseline Model for English Reddit Jokes

In [None]:
dataset_path = '/content/drive/MyDrive/updated-reddit-jokes.csv'
joke_punchlines, joke_setups = read_csv(dataset_path)
joke_punchlines = joke_punchlines[1:]
joke_setups = joke_setups[1:]
print(joke_setups[:10])
print(joke_punchlines[:10])

['My wife offered me a blowjob today', 'A man who lived by the sea grew a cucumber so large he was able to turn it into his house One day a bad storm flooded the area with seawater and damged his home', 'I love my job exclaimed the farmer All you do is boss me around all day complained one of his sheep What did you say challenged the farmer The sheep glared back and growled', 'I was doing a pretend job interview with my 6 year old daughter and I asked her where do you see yourself in 5 years', 'How do you grab the attention of a pervert', 'My wife is fed up of my constant Dad jokes so I asked her How can I stop my addiction', '\u200c\u200cI p\u200c\u200croposed t\u200c\u200co m\u200c\u200cy e\u200c\u200cxwife t\u200c\u200coday', 'The wifes leaving me because of my sexual fetishes', 'I asked my North Korean friend whats it like to live in North Korea', 'Why was Han Solo so suspicious when he put his penis inside Princess Leia for the first time']
['Really I said No April fooaarrrrglegar

In [None]:
def train_word2vec_model(joke_setups, joke_punchlines):
    corpus = joke_setups + joke_punchlines
    tokenized_corpus = [sentence.split() for sentence in corpus]
    print(tokenized_corpus)
    model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)
    return model

In [None]:
word2vec_model = train_word2vec_model(joke_setups, joke_punchlines)

accuracy, precision, recall, f1 = evaluate_word2vec_model(word2vec_model, joke_setups, joke_punchlines)

# Print evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Example:
Joke Setup: My wife offered me a blowjob today
Correct Punchline: Really I said No April fooaarrrrglegargle Thatll teach her to be funny
Predicted Punchline: She told me shes been having sex with an asshole for years
[0.91137373, 0.9218623, 0.93819565, 0.91051686] ['I find that it just ruins the pineapple', 'Really I said No April fooaarrrrglegargle Thatll teach her to be funny', 'She told me shes been having sex with an asshole for years', 'Her Fuck that shit Me Thats the spirit']
--------------------
Example:
Joke Setup: A man who lived by the sea grew a cucumber so large he was able to turn it into his house One day a bad storm flooded the area with seawater and damged his home
Correct Punchline: Now hes in a pickle
Predicted Punchline: He sold his soul to Santa
[0.7790531, 0.9206175, 0.93620145, 0.9453728] ['I didnt want to interrupt her', 'I politely declined I cant deal with high maintenance women', 'Now hes in a pickle', 'He sold his soul to Santa']
--------------------

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.29651162790697677
Precision: 0.1788534223794921
Recall: 0.29651162790697677
F1 Score: 0.2079358487229532


# Code for generating human matching task CSVs

In [None]:
# Load your pre-trained Word2Vec model
# Assuming you've trained your model on joke setups and named it 'model'
# model = Word2Vec.load("your_model_path")

# Load your CSV file containing joke setups and punchlines
csv_file_path = '/content/drive/MyDrive/new_updated_redditjokes.csv'
df = pd.read_csv(csv_file_path)

tops = []
for setup, correct_punchline in zip(joke_setups, joke_punchlines):
        choices = []
        for punchline in joke_punchlines:
            if punchline != correct_punchline:
                choices.append(punchline)
        tops.append(random.sample(choices, 3))

with open(csv_file_path, 'r') as infile, open('output.csv', 'w', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Write header (if needed)
    header = next(reader, None)
    if header:
        writer.writerow(header + ['Choices'])  # Replace 'New_Column_Name' with your desired column name

    # Write data with values in the third column
    for row, value in zip(reader, tops):
        row.append(value)
        writer.writerow(row)
