## Task 1

1. Implement a function that takes 2-, 3-, or 4-grams to generate text, 
    if an initial N−1 gram is given, by sampling a next token according 
    to the conditional distribution $P(w_N|w_1, ..., w_{N−1})$.
2. If the initial N−1 gram is unknown, there is no distribution to sample
    from  and the generation process cannot start. A way out is to “reverse”
    the backoff-idea for N-grams: i.e. if $P(w_N|w_1, ..., w_{N−1})$ is
    unknown, I can try $P(w_N|w_2, ..., w_{N−1})$, or 
    $P(w_N|w_3, ..., w_{N−1})$, ... or $P(w_N|w_{N−1})$, or at last $P(w_N)$.

    
These distributions have already been estimated in subtask 1.
Update your code such that it implements this idea!

Does it help only for the initial N−1 gram or also later in the generation process?

Ans. Backoff also helps during the generation process as well as during initialization.
When higher order n-grams are not available, we try lower order n-grams. Whenever a prefix is unknown during the generation process, we try a smaller prefix.

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv('imdb_reviews.csv')
unigram_vectorizer = CountVectorizer(ngram_range=(1,1))
bigram_vectorizer =  CountVectorizer(ngram_range=(2,2))
trigram_vectorizer = CountVectorizer(ngram_range=(3,3))
fourgram_vectorizer = CountVectorizer(ngram_range=(4,4))

X_unigram = unigram_vectorizer.fit_transform(df['review'])
X_bigram  = bigram_vectorizer.fit_transform(df['review'])
X_trigram = trigram_vectorizer.fit_transform(df['review'])
X_fourgram = fourgram_vectorizer.fit_transform(df['review'])

In [2]:

# maps the value of n -> frequency map.
ngram_freq_maps = {}

ngram_freq_maps[1] = dict(zip(
    unigram_vectorizer.get_feature_names_out(),
    X_unigram.sum(axis=0).A1
))

ngram_freq_maps[2] = dict(zip(
    bigram_vectorizer.get_feature_names_out(),
    X_bigram.sum(axis=0).A1
))

ngram_freq_maps[3] = dict(zip(
    trigram_vectorizer.get_feature_names_out(),
    X_trigram.sum(axis=0).A1
))

ngram_freq_maps[4] = dict(zip(
    fourgram_vectorizer.get_feature_names_out(),
    X_fourgram.sum(axis=0).A1
))

In [3]:
# compute conditional prob dictionaries from freq maps
ngram_prob_maps = {}

# for 2,3,4-grams
for n in range(2, 5):
    prob_dict = {}

    for ngram, count in ngram_freq_maps[n].items():
        # suppose the tokens are `cat is cute`
        tokens = ngram.split()

        # context is `cat is` and next_word is `cute`
        context = " ".join(tokens[:-1])
        next_word = tokens[-1]

        # Count(`cat is`)
        context_count = ngram_freq_maps[n-1].get(context, 0)

        if context_count > 0:
            # P(`cute` | `cat is`) = Count(`cat is cute`) / Count(`cat is`)
            prob = count / context_count

            if context not in prob_dict:
                prob_dict[context] = {}

            # ngram_prob_maps[3][`cat is`] = { `cute`: number_found_above }
            prob_dict[context][next_word] = prob

    ngram_prob_maps[n] = prob_dict

# For unigrams (fallback), normalize to probs
total_unigrams = sum(ngram_freq_maps[1].values())
unigram_probs = {word: freq / total_unigrams for word, freq in ngram_freq_maps[1].items()}

In [38]:
import random

def generate_text_bigram(initial_prefix: str, max_length: int=50):
    """
        Generate text using bigram model. Whenever, a bigram is not available
        the generation process halts.
    """
    generated = initial_prefix.split()
    current_prefix = generated[-1]  # last word

    for _ in range(max_length - len(generated)):
        next_word = None

        if current_prefix in ngram_prob_maps[2]:
            probs = ngram_prob_maps[2][current_prefix]
            next_word = random.choices(list(probs.keys()), weights=list(probs.values()))[0]
        else:
            break

        generated.append(next_word)
        current_prefix = next_word

    return " ".join(generated)

In [None]:
def generate_text_trigram(initial_prefix: str, max_length: int=50):
    """
        Generate text using trigram model.
    """
    generated = initial_prefix.split()
    current_prefix = " ".join( generated[ len(generated)-2: ] ) # last 2 words

    for _ in range(max_length - len(generated)):
        next_word = None

        if current_prefix in ngram_prob_maps[3]:
            probs = ngram_prob_maps[3][current_prefix]
            next_word = random.choices(list(probs.keys()), weights=list(probs.values()))[0]
        else:
            break

        generated.append(next_word)
        current_prefix = " ".join(generated[-2:]) # last 2 words

    return " ".join(generated)

In [42]:
def generate_text_fourgram(initial_prefix: str, max_length: int=50):
    """
        Generate text using fourgram model.
    """
    generated = initial_prefix.split()
    current_prefix = " ".join( generated[ len(generated)-3: ] ) # last 3 words

    for _ in range(max_length - len(generated)):
        next_word = None

        if current_prefix in ngram_prob_maps[4]:
            probs = ngram_prob_maps[4][current_prefix]
            next_word = random.choices(list(probs.keys()), weights=list(probs.values()))[0]
        else:
            break

        generated.append(next_word)
        current_prefix = " ".join(generated[-3:]) # last 3 words

    return " ".join(generated)

In [45]:
test_prefixes_bigram = [ "the", "this" ]
test_prefixes_trigram = [ "the movie", "this movie" ]
test_prefix_fourgram = [ "this movie is", "the movie is" ]

print( "\n", "*" * 10, " Text generated by Bigram Model ", "*" * 10, "\n")

for p in test_prefixes_bigram:
    print(f'\nFor prefix: {p},\nText Generated by bigram model: {generate_text_bigram(p)}\n')

print( "\n", "*" * 10, " Text generated by Trigram Model ", "*" * 10, "\n")

for p in test_prefixes_trigram:
    print(f'\nFor prefix: {p},\nText Generated by trigram model: {generate_text_trigram(p)}\n')

print( "\n", "*" * 10, " Text generated by 4-Gram Model ", "*" * 10, "\n")

for p in test_prefix_fourgram:
    print(f'\nFor prefix: {p},\nText Generated by fourgram model: {generate_text_fourgram(p)}\n')



 **********  Text generated by Bigram Model  ********** 


For prefix: the,
Text Generated by bigram model: the point of the same reason why he just let go to the bellicose sergeant forced at his own didn have ever there is no grand canyon because it isn long distance though the two become the confession br enter into an adult cinema br br br will without being


For prefix: this,
Text Generated by bigram model: this movie goers who controlled by cheh was excellent widmark is very enjoyable and blond american history because he is especially for the acting but full nudity and gretchen mol nude swimming in the town of disaster has funny movie timon timon and you would have borrowed from the rope


 **********  Text generated by Trigram Model  ********** 


For prefix: the movie,
Text Generated by trigram model: the movie that exemplifies hope love and happiness freud might have thought that an artist profound vision his art is namely the french first dubbed noir was without any intenti

In [32]:
def generate_text_ngrams_model(n: int, initial_prefix: str, max_length: int=50):
    """
        Generate text using n-grams model. Assume that initial n-1 words are
        given as prefix. Uses backoff strategy to shorter n-grams if larger
        n-grams are not found.
    """
    generated = initial_prefix.split()
    current_prefix = initial_prefix

    for _ in range(max_length - len(generated)):
        next_word = None

        prefix_tokens = current_prefix.split()

        for backoff_level in range(len(prefix_tokens) + 1):  # 0: full, 1: remove first

            # suppose we have current prefix "the movie"
            test_prefix = " ".join(prefix_tokens[backoff_level:])

            # check if "the movie" is in ngrams_prob_maps[3]
            # if no -> increase backoff, make the prefix "movie" next time

            if not test_prefix:
                # the test prefix is empty, maybe because we do not have enough tokens left after backoff
                # in this case we randomly select from unigram probabilities
                next_word = random.choices(list(unigram_probs.keys()), weights=list(unigram_probs.values()))[0]
                break

            elif test_prefix in ngram_prob_maps[n]:
                # "the movie" is found in ngram_prob_maps[3]
                probs = ngram_prob_maps[n][test_prefix]
                # we will have probs like: {  "was": 0.6, "is": 0.4 }
                next_word = random.choices(list(probs.keys()), weights=list(probs.values()))[0]
                # we choose one of these words
                break

        if not next_word:
            break  # No options, halt generation

        generated.append(next_word)

        # Update prefix to last N-1 words
        current_prefix = " ".join(generated[-(n-1):])

    return " ".join(generated)

In [5]:
test_prefixes = [
    "the movie was",
    "i really enjoyed",
    "the plot of",
    "the acting was",
    "i would recommend",
]

for n in range(2,5):
    print(f"\n--- Generating text with {n}-grams model ---\n")
    for prefix in test_prefixes:
        generated_text = generate_text_ngrams_model(n, prefix, max_length=20)
        print(f"Prefix: {prefix}\nGenerated: {generated_text}\n")


--- Generating text with 2-grams model ---

Prefix: the movie was
Generated: the movie was spitfire of the liquefied colors aid in this was ignored by alfio contini find out of main character development

Prefix: i really enjoyed
Generated: i really enjoyed mostly forgotten today are all male to me lot from quick lover júlio ismael de palma direction is master

Prefix: the plot of
Generated: the plot of africa and you see this movie in mere sport and involves lust temptation of them thinking fondly through the

Prefix: the acting was
Generated: the acting was great national spirit of 10 10 br br episode is the prez prado and closing credits br this matter

Prefix: i would recommend
Generated: i would recommend it came on youtube but dialogue focusing on tap ii movies my life is perfectly balanced writing this may


--- Generating text with 3-grams model ---

Prefix: the movie was
Generated: the movie was just too amazing to see it twice about making things drag on and on about how the

## Sampling initial prefixes for 3-grams model

We also need to generate 5000 samples for naive bayes classifier. We generate these reviews
using 3-gram model. But for that, we need 2 words for initial generation for each review. 
Hence, we need 5000 initial prefixes of size 2. For example: "the movie", "it is", "i like" etc.

We simply sample initial prefixes from our `imdb_reviews.csv` dataset. 

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# keep the initial 2 words for each review
df = pd.read_csv('imdb_reviews.csv')
df['initial_prefix'] = df['review'].apply(lambda x: ' '.join(x.split()[:2]))

df.head()

Unnamed: 0,review,initial_prefix
0,This is a great movie for the true romantics a...,This is
1,I saw this film when I was a young child on te...,I saw
2,I consider myself a great admirer of David Lyn...,I consider
3,Cat Soup at first seems to be a very random an...,Cat Soup
4,"Back in 1994, I had a really lengthy vacation ...",Back in


In [7]:
# compute bigram frequencies, and choose 500 prefiex based on weights = frequencies

bigram_vectorizer_initial_prefix = CountVectorizer(ngram_range=(2,2))
X_bigram_initial_prefix  = bigram_vectorizer_initial_prefix.fit_transform(df['initial_prefix'])

freq_initial_prefix = dict(zip(
    bigram_vectorizer_initial_prefix.get_feature_names_out(),
    X_bigram_initial_prefix.sum(axis=0).A1
))

## check top 10 most frequent initial prefixes
sorted(freq_initial_prefix.items(), key=lambda x: x[1], reverse=True)[:10], len(freq_initial_prefix)

([('this is', np.int64(706)),
  ('this movie', np.int64(418)),
  ('this film', np.int64(195)),
  ('this was', np.int64(117)),
  ('if you', np.int64(115)),
  ('one of', np.int64(92)),
  ('the movie', np.int64(59)),
  ('br br', np.int64(56)),
  ('in the', np.int64(50)),
  ('the first', np.int64(45))],
 4711)

>##### We only have `4711` unique initial prefixes in this dataset. But we will still end up generating `5000` reviews, since we can generate multiple unique reviews from same initial biagrams. Also, lets only keep top 2000 initial prefixes, because there is some noise in the dataset

In [10]:
top_2000_initial_prefixes = sorted(freq_initial_prefix.items(), key=lambda x: x[1], reverse=True)[:2000]

initial_prefixes = pd.DataFrame(
    {
        "initial_prefix": [x[0] for x in top_2000_initial_prefixes ],
        "frequency":      [x[1] for x in top_2000_initial_prefixes]
    }
)
initial_prefixes.head()

Unnamed: 0,initial_prefix,frequency
0,this is,706
1,this movie,418
2,this film,195
3,this was,117
4,if you,115


In [11]:
initial_prefixes.to_csv('initial_prefixes.csv', index=False)

## Generating 5000 artificial reviews for Naive Bayes classifier

In [None]:
artificial_reviews = {
    "artificial_reviews": []
}

# note that the probability fo selecting an initial prefix depends on its
# frequency in original dataset

for ip in random.choices(initial_prefixes["initial_prefix"], initial_prefixes["frequency"], k=5000 ):
    # review of random length in range [30,50]
    review = generate_text_ngrams_model(3, ip, random.randint(30,50) )
    artificial_reviews["artificial_reviews"].append(review)

In [14]:
# save the artificial reviews
pd.DataFrame(artificial_reviews).to_csv("artificial_reviews.csv", index=False)

---