In [12]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize

In [2]:
# Load the preprocessed Yelp review file; it is decoded as Latin-1.
data = pd.read_csv(
    "../00_source_data/preprocessed_data_yelp.csv",
    encoding="latin-1",
)

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,label,text
0,2,contrari review zero complaint servic price ge...
1,1,last summer appoint get new tire wait super lo...
2,2,friendli staff starbuck fair get anywher els s...
3,1,food good unfortun servic hit miss main issu s...
4,2,even didnt car filen basement worth bu trip wa...


In [4]:
# Tokenize every review and frame it with sentence-boundary markers:
# "#S" marks the start and "#E" marks the end of the token list.
def _tokenize_with_markers(raw_text):
    """Return the word tokens of ``raw_text`` wrapped in "#S" ... "#E"."""
    return ["#S", *word_tokenize(str(raw_text)), "#E"]


data["split_data"] = data["text"].apply(_tokenize_with_markers)

In [5]:
# Preview the first five rows again, now including the split_data column.
data.head(n=5)

Unnamed: 0,label,text,split_data
0,2,contrari review zero complaint servic price ge...,"[#S, contrari, review, zero, complaint, servic..."
1,1,last summer appoint get new tire wait super lo...,"[#S, last, summer, appoint, get, new, tire, wa..."
2,2,friendli staff starbuck fair get anywher els s...,"[#S, friendli, staff, starbuck, fair, get, any..."
3,1,food good unfortun servic hit miss main issu s...,"[#S, food, good, unfortun, servic, hit, miss, ..."
4,2,even didnt car filen basement worth bu trip wa...,"[#S, even, didnt, car, filen, basement, worth,..."


In [6]:
# Count, for every word, how many reviews of each label (1 or 2) contain it.
# Tokens are deduplicated per review, so a review contributes at most once per word.
dictionary = {}
for review_label, tokens in zip(data["label"], data["split_data"]):
    for word in set(tokens):
        per_label_counts = dictionary.setdefault(word, {1: 0, 2: 0})
        per_label_counts[review_label] += 1

In [7]:
# Build, for each of the two labels, a position index over its tokenized reviews:
#   sentences[i]  -> array of token lists for label i + 1
#   tempdict[i]   -> {sentence index: {token: [positions of that token in the sentence]}}
sentences = [data[data["label"] == label_value]["split_data"].values for label_value in (1, 2)]

tempdict = []
for label_sentences in sentences:
    positions_by_sentence = {}
    for sentence_index, tokens in enumerate(label_sentences):
        token_positions = {}
        for position, token in enumerate(tokens):
            token_positions.setdefault(token, []).append(position)
        positions_by_sentence[sentence_index] = token_positions
    tempdict.append(positions_by_sentence)

In [8]:
def getConditionalProbability(word, tempdict, sentences):
    """Estimate the distribution of the token that directly follows ``word``.

    Parameters
    ----------
    word : str
        Token whose successors are counted.
    tempdict : dict
        Maps a sentence index to ``{token: [positions]}`` for that sentence.
    sentences : sequence
        Tokenized sentences, indexable by the keys of ``tempdict``.

    Returns
    -------
    dict
        ``{next_token: relative frequency}`` over all occurrences of ``word``
        that have a successor; empty when there are none.
    """
    nextWordCounts = {}
    for sentenceIndex, positions in tempdict.items():
        tokens = sentences[sentenceIndex]
        lastIndex = len(tokens) - 1
        for wordIndex in positions.get(word, ()):
            # The final token of a sentence has no successor. Skip it with an
            # explicit bounds check instead of swallowing every exception.
            if wordIndex >= lastIndex:
                continue
            nextWord = tokens[wordIndex + 1]
            nextWordCounts[nextWord] = nextWordCounts.get(nextWord, 0) + 1

    allNextWords = sum(nextWordCounts.values())
    # The comprehension body never runs for an empty dict, so there is no division by zero.
    return {nextWord: count / allNextWords for nextWord, count in nextWordCounts.items()}

In [9]:
# Calculate the conditional next-word probability for each word, separately per label.
# Bigrams are counted in one pass over the sentences of a label. This yields the same
# result as calling getConditionalProbability for every vocabulary word, but avoids
# re-scanning every sentence once per word.
conditionalProbDictLabel = [{}, {}]
for i in range(2):
    # Every vocabulary word gets an entry, even if it never occurs under this label.
    nextWordCounts = {word: {} for word in dictionary}
    for tokens in sentences[i]:
        for currentWord, followingWord in zip(tokens, tokens[1:]):
            counts = nextWordCounts.get(currentWord)
            if counts is None:
                # Token is not part of the vocabulary; it gets no entry.
                continue
            counts[followingWord] = counts.get(followingWord, 0) + 1
    for word, counts in nextWordCounts.items():
        allNextWords = sum(counts.values())
        conditionalProbDictLabel[i][word] = {
            nextWord: count / allNextWords for nextWord, count in counts.items()
        }

In [10]:
def getRandomWord(conditionalProbDictLabel, word, rng=None):
    """Sample the token that follows ``word`` from its next-word distribution.

    Parameters
    ----------
    conditionalProbDictLabel : dict
        Maps a token to ``{next_token: probability}``.
    word : str
        Current token; must be a key of ``conditionalProbDictLabel``.
    rng : object with a ``choice(a, p=...)`` method, optional
        Random source, e.g. ``np.random.default_rng(seed)`` or
        ``np.random.RandomState(seed)``. When omitted, the global
        ``np.random`` state is used.

    Returns
    -------
    str
        The sampled next token, or ``"#E"`` when ``word`` has no successors.

    Raises
    ------
    KeyError
        If ``word`` is not a key of ``conditionalProbDictLabel``.
    """
    nextWordProbs = conditionalProbDictLabel[word]
    if not nextWordProbs:
        # No known successor: end the sentence.
        return "#E"
    candidates = list(nextWordProbs.keys())
    probabilities = np.array(list(nextWordProbs.values()))
    sampler = np.random if rng is None else rng
    # Draw one candidate, weighted by its conditional probability.
    return sampler.choice(candidates, p=probabilities)

In [13]:
# Generate synthetic sentences for each label by walking the bigram model from "#S"
# until "#E" is drawn or the length cap is reached.
N_SENTENCES_PER_LABEL = 5000  # number of synthetic sentences per label
MAX_SENTENCE_LENGTH = 500     # cap on the words drawn after the first word
RANDOM_SEED = 42

# Seed the global NumPy RNG so that a fresh run reproduces the same sentences.
np.random.seed(RANDOM_SEED)

generateLabel = [[], []]
for i in range(2):
    for j in range(N_SENTENCES_PER_LABEL):
        text = ["#S"]
        nextWord = "#S"
        # One draw for the first word plus up to MAX_SENTENCE_LENGTH more.
        # Checking for "#E" after every draw, including the first, stops an
        # immediately-ended sentence from receiving a second "#E".
        for t in range(MAX_SENTENCE_LENGTH + 1):
            nextWord = getRandomWord(conditionalProbDictLabel[i], nextWord)
            text.append(nextWord)
            if nextWord == "#E":
                break
        generateLabel[i].append(text)

In [14]:
# Turn the generated token lists into labelled text rows, dropping the
# leading "#S" and trailing "#E" markers before joining with spaces.
label = [1, 2]
generateLabelText = []
for labelValue, generatedSentences in zip(label, generateLabel):
    for tokens in generatedSentences:
        start = 1 if tokens[0] == "#S" else 0
        trimmed = tokens[start:]
        if trimmed[-1] == "#E":
            trimmed = trimmed[:-1]
        generateLabelText.append({"label": labelValue, "text": " ".join(trimmed)})
generateLabelTextDf = pd.DataFrame(generateLabelText)

In [15]:
# Inspect ten randomly chosen synthetic rows.
generateLabelTextDf.sample(n=10)

Unnamed: 0,label,text
442,1,went le ramado brufblu et il aussi fallu quon ...
2486,1,updat everyon els eat
2,1,pro friendli lot disgust woman entir time cold...
3164,1,fyi place guess order chili chees omelett medi...
9890,2,im give tip meal like outstand though girl ano...
7338,2,wow talk food pretti order uni blue crab leg s...
5043,2,stop nnfor lunch menu littl weird think check ...
5737,2,king bed tast decor barbershop certainli hit c...
4912,1,took minut late nnfinal decisionni doubt think...
8924,2,wow amaz nthe jalapeufo cornbread stuf potato ...


In [16]:
# Write the synthetic reviews to CSV (Latin-1 encoded, without the index column).
generateLabelTextDf.to_csv(
    "../00_source_data/synthetic_data_yelp.csv",
    index=False,
    encoding="latin-1",
)