# Preprocessing

In [None]:
# import libraries

import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw dataset from "A2_dataset.csv" (path relative to the working
# directory) into `df`, then preview the first rows.

df = pd.read_csv("A2_dataset.csv")
df.head()

Unnamed: 0,LABEL,DATE_TIME,TEXT
0,0,Fri Jun 05 14:26:50 2009,About to get threaded and scared
1,1,Thu May 14 10:13:55 2009,@awaisnaseer I like Shezan Mangooo too!!! I ha...
2,1,Fri Jun 05 21:02:20 2009,worked on my car after work. showering then go...
3,1,Sun Jun 14 22:25:52 2009,@Marama Actually we start this afternoon! I w...
4,1,Sun May 31 00:42:12 2009,@gfalcone601 Aww Gi.don't worry.we'll vote for...


In [None]:
# (number of rows, number of columns) of the raw dataset
df.shape

(4287, 3)

In [None]:
# Number of rows per LABEL value (class balance check)
df.LABEL.value_counts()

1    2287
0    2000
Name: LABEL, dtype: int64

In [None]:
# install libraries
import nltk

!pip install autocorrect
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting autocorrect
  Downloading autocorrect-2.6.1.tar.gz (622 kB)
[K     |████████████████████████████████| 622 kB 27.9 MB/s 
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25l[?25hdone
  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622382 sha256=0d260eb39768b8376cb8b515a926e6f41a7b938651c55fb50e82f256dcbfc32e
  Stored in directory: /root/.cache/pip/wheels/54/d4/37/8244101ad50b0f7d9bffd93ce58ed7991ee1753b290923934b
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# import libraries
from nltk.stem.snowball import stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from autocorrect import Speller


# Preprocessing pipeline.
#
# Every stage is stored in its own column of `df1` so the effect of each step
# can be inspected:
#   TEXT -> white_space_removed -> tokenized_data -> stopword_removed_data
#        -> punct_removed_data -> url_removed_data -> spelling_checked_data
#        -> lemmetized_data
#
# Columns are assigned as whole Series (no `df1[col][i] = ...` chained
# indexing), which avoids SettingWithCopyWarning and keeps working when pandas
# copy-on-write is enabled.

# create dataframe df1 with the same columns as the original dataset;
# a copy keeps the raw frame `df` unchanged for later cells
df1 = df.copy()


# create objects for lemmatization and spelling checking
lemmatizer = WordNetLemmatizer()
spell = Speller(lang='en')

# loop-invariant resources, built once instead of once per row
stop_word_set = set(stopwords.words('english'))
# http(s) URLs and HTML tags. They have to be stripped from the text BEFORE
# tokenization: once tokens have passed the `isalnum` filter none of them can
# contain "://" any more, so a token-level URL filter would never match.
url_or_tag = re.compile(r"https?://\S+|</?[a-zA-Z][^<>]*>")


# white space removal: collapse every run of whitespace into a single space
df1['white_space_removed'] = df1['TEXT'].apply(
    lambda text: re.sub(r"\s+", " ", text))


# tokenization: lower-case, blank out URLs / HTML tags, then split into tokens
# (replacing with a space keeps the neighbouring words separated)
df1['tokenized_data'] = df1['white_space_removed'].apply(
    lambda text: word_tokenize(url_or_tag.sub(" ", text.lower())))


# remove stopwords: drop tokens that ARE stop words (exact match); tokens that
# merely contain one, such as "n't" or "gi.do", are left intact
df1['stopword_removed_data'] = df1['tokenized_data'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stop_word_set])


# punctuation removal: keep only purely alphanumeric tokens
df1['punct_removed_data'] = df1['stopword_removed_data'].apply(
    lambda tokens: [tok for tok in tokens if tok.isalnum()])


# remove urls and html tags: already done before tokenization (see above);
# the column is kept so the frame has the same columns as before
df1['url_removed_data'] = df1['punct_removed_data'].apply(list)


# spelling checking: correct each token independently
df1['spelling_checked_data'] = df1['url_removed_data'].apply(
    lambda tokens: [spell(tok) for tok in tokens])


# lemmatization: every token is lemmatized as a verb
df1['lemmetized_data'] = df1['spelling_checked_data'].apply(
    lambda tokens: [lemmatizer.lemmatize(tok, wordnet.VERB) for tok in tokens])

In [None]:
# Preview the first rows together with the intermediate preprocessing columns
df1.head()

Unnamed: 0,LABEL,DATE_TIME,TEXT,white_space_removed,tokenized_data,stopword_removed_data,punct_removed_data,url_removed_data,spelling_checked_data,lemmetized_data
0,0,Fri Jun 05 14:26:50 2009,About to get threaded and scared,About to get threaded and scared,"[about, to, get, threaded, and, scared]","[get, threaded, scared]","[get, threaded, scared]","[get, threaded, scared]","[get, threaded, scared]","[get, thread, scar]"
1,1,Thu May 14 10:13:55 2009,@awaisnaseer I like Shezan Mangooo too!!! I ha...,@awaisnaseer I like Shezan Mangooo too!!! I ha...,"[@, awaisnaseer, i, like, shezan, mangooo, too...","[@, awaisnaseer, like, shezan, mangooo, !, !, ...","[awaisnaseer, like, shezan, mangooo, one, yest...","[awaisnaseer, like, shezan, mangooo, one, yest...","[awaisnaseer, like, sedan, mango, one, yesterday]","[awaisnaseer, like, sedan, mango, one, yesterday]"
2,1,Fri Jun 05 21:02:20 2009,worked on my car after work. showering then go...,worked on my car after work. showering then go...,"[worked, on, my, car, after, work, ., showerin...","[worked, car, work, ., showering, going, bed, ...","[worked, car, work, showering, going, bed, soo...","[worked, car, work, showering, going, bed, soo...","[worked, car, work, showing, going, bed, soooo...","[work, car, work, show, go, bed, sooooooooooo,..."
3,1,Sun Jun 14 22:25:52 2009,@Marama Actually we start this afternoon! I w...,@Marama Actually we start this afternoon! I wi...,"[@, marama, actually, we, start, this, afterno...","[@, marama, actually, start, afternoon, !, try...","[marama, actually, start, afternoon, try, some...","[marama, actually, start, afternoon, try, some...","[drama, actually, start, afternoon, try, somet...","[drama, actually, start, afternoon, try, somet..."
4,1,Sun May 31 00:42:12 2009,@gfalcone601 Aww Gi.don't worry.we'll vote for...,@gfalcone601 Aww Gi.don't worry.we'll vote for...,"[@, gfalcone601, aww, gi.do, n't, worry.we, 'l...","[@, gfalcone601, aww, gi., n', worry., ', vote...","[gfalcone601, aww, vote, coz, love, much]","[gfalcone601, aww, vote, coz, love, much]","[falcon601, www, vote, col, love, much]","[falcon601, www, vote, col, love, much]"


In [None]:
# add start (<s>) and end (</s>) of sentence markers around the lemmatized
# tokens of every row. The column is assigned as a whole Series instead of
# element-wise through `df1[col][i] = ...`, which is chained indexing and is
# not guaranteed to write back into `df1`.
df1['preprocessed_text'] = df1['lemmetized_data'].apply(
    lambda tokens: "<s> " + " ".join(tokens) + " </s>")

In [None]:
# Preview the first rows including the new `preprocessed_text` column
df1.head()

Unnamed: 0,LABEL,DATE_TIME,TEXT,white_space_removed,tokenized_data,stopword_removed_data,punct_removed_data,url_removed_data,spelling_checked_data,lemmetized_data,preprocessed_text
0,0,Fri Jun 05 14:26:50 2009,About to get threaded and scared,About to get threaded and scared,"[about, to, get, threaded, and, scared]","[get, threaded, scared]","[get, threaded, scared]","[get, threaded, scared]","[get, threaded, scared]","[get, thread, scar]",<s> get thread scar </s>
1,1,Thu May 14 10:13:55 2009,@awaisnaseer I like Shezan Mangooo too!!! I ha...,@awaisnaseer I like Shezan Mangooo too!!! I ha...,"[@, awaisnaseer, i, like, shezan, mangooo, too...","[@, awaisnaseer, like, shezan, mangooo, !, !, ...","[awaisnaseer, like, shezan, mangooo, one, yest...","[awaisnaseer, like, shezan, mangooo, one, yest...","[awaisnaseer, like, sedan, mango, one, yesterday]","[awaisnaseer, like, sedan, mango, one, yesterday]",<s> awaisnaseer like sedan mango one yesterday...
2,1,Fri Jun 05 21:02:20 2009,worked on my car after work. showering then go...,worked on my car after work. showering then go...,"[worked, on, my, car, after, work, ., showerin...","[worked, car, work, ., showering, going, bed, ...","[worked, car, work, showering, going, bed, soo...","[worked, car, work, showering, going, bed, soo...","[worked, car, work, showing, going, bed, soooo...","[work, car, work, show, go, bed, sooooooooooo,...",<s> work car work show go bed sooooooooooo tir...
3,1,Sun Jun 14 22:25:52 2009,@Marama Actually we start this afternoon! I w...,@Marama Actually we start this afternoon! I wi...,"[@, marama, actually, we, start, this, afterno...","[@, marama, actually, start, afternoon, !, try...","[marama, actually, start, afternoon, try, some...","[marama, actually, start, afternoon, try, some...","[drama, actually, start, afternoon, try, somet...","[drama, actually, start, afternoon, try, somet...",<s> drama actually start afternoon try somethi...
4,1,Sun May 31 00:42:12 2009,@gfalcone601 Aww Gi.don't worry.we'll vote for...,@gfalcone601 Aww Gi.don't worry.we'll vote for...,"[@, gfalcone601, aww, gi.do, n't, worry.we, 'l...","[@, gfalcone601, aww, gi., n', worry., ', vote...","[gfalcone601, aww, vote, coz, love, much]","[gfalcone601, aww, vote, coz, love, much]","[falcon601, www, vote, col, love, much]","[falcon601, www, vote, col, love, much]",<s> falcon601 www vote col love much </s>


In [None]:
# Final frame: the original columns of `df` with only the finished
# `preprocessed_text` column of `df1` attached (aligned on the row index).
data = pd.concat(
    [df, df1[['preprocessed_text']]],
    axis=1,
)
data.head()

Unnamed: 0,LABEL,DATE_TIME,TEXT,preprocessed_text
0,0,Fri Jun 05 14:26:50 2009,About to get threaded and scared,<s> get thread scar </s>
1,1,Thu May 14 10:13:55 2009,@awaisnaseer I like Shezan Mangooo too!!! I ha...,<s> awaisnaseer like sedan mango one yesterday...
2,1,Fri Jun 05 21:02:20 2009,worked on my car after work. showering then go...,<s> work car work show go bed sooooooooooo tir...
3,1,Sun Jun 14 22:25:52 2009,@Marama Actually we start this afternoon! I w...,<s> drama actually start afternoon try somethi...
4,1,Sun May 31 00:42:12 2009,@gfalcone601 Aww Gi.don't worry.we'll vote for...,<s> falcon601 www vote col love much </s>


In [None]:
# Persist the preprocessed data; index=False keeps the row index out of the file
data.to_csv('preprocessed_data.csv', index=False)

In [None]:
# Reload the saved file so the following cells work from 'preprocessed_data.csv'
data = pd.read_csv('preprocessed_data.csv')
data.head()

Unnamed: 0,LABEL,DATE_TIME,TEXT,preprocessed_text
0,0,Fri Jun 05 14:26:50 2009,About to get threaded and scared,<s> get thread scar </s>
1,1,Thu May 14 10:13:55 2009,@awaisnaseer I like Shezan Mangooo too!!! I ha...,<s> awaisnaseer like sedan mango one yesterday...
2,1,Fri Jun 05 21:02:20 2009,worked on my car after work. showering then go...,<s> work car work show go bed sooooooooooo tir...
3,1,Sun Jun 14 22:25:52 2009,@Marama Actually we start this afternoon! I w...,<s> drama actually start afternoon try somethi...
4,1,Sun May 31 00:42:12 2009,@gfalcone601 Aww Gi.don't worry.we'll vote for...,<s> falcon601 www vote col love much </s>


In [None]:
# Sanity check: the text column is accessed as a pandas Series
type(data.preprocessed_text)

pandas.core.series.Series

## find vocabulary set and unigram count

In [None]:
# find vocabulary
# find unigram count for each unique word
#
# `unigram_count` maps every token (including the <s> / </s> markers) to the
# number of times it occurs in the corpus; `vocab` is the list of its keys.

unigram_count = {}

for text in data.preprocessed_text.to_list():
  # Values read back from the CSV are not guaranteed to be strings, so coerce
  # anything else. (`type(text) != 'str'` compared a type with a string
  # literal and was therefore always True.)
  if not isinstance(text, str):
    text = str(text)
  for word in text.split():
    unigram_count[word] = unigram_count.get(word, 0) + 1

# dict keys keep first-seen order, so the vocabulary order is the same on
# every run (iteration order of a set of strings is not)
vocab = list(unigram_count)
print(len(vocab))
# show only a sample instead of dumping the count of every word
print(dict(list(unigram_count.items())[:20]))

7348
{'<s>': 4287, 'get': 531, 'thread': 1, 'scar': 11, '</s>': 4287, 'awaisnaseer': 1, 'like': 202, 'sedan': 1, 'mango': 3, 'one': 122, 'yesterday': 20, 'work': 244, 'car': 19, 'show': 56, 'go': 479, 'bed': 59, 'sooooooooooo': 1, 'tire': 45, 'sparrow': 1, 'sign': 9, 'lt': 45, 'cowboy': 2, 'gt': 24, 'drama': 4, 'actually': 29, 'start': 52, 'afternoon': 18, 'try': 81, 'something': 52, 'slow': 8, 'process': 3, 'collect': 2, 'many': 16, 'vac': 2, 'falcon601': 2, 'www': 55, 'vote': 15, 'col': 1, 'love': 219, 'much': 101, 'mrstessyman': 1, 'ever': 31, 'good': 268, 'day': 213, 'knitpicks': 1, 'getmevideo': 1, 'sorry': 67, 'forte': 1, 'ask': 18, 'trade': 2, 'scale': 2, 'answer': 20, 'ready': 37, 'church': 17, 'sum': 5, 'watch': 130, 'afa': 3, 'whoop': 4, 'djokovic': 1, 'live': 46, 'settle': 7, 'dvr': 1, 'suppose': 10, 'early': 35, 'tomorrow': 114, 'last': 101, 'open': 20, 'home': 113, 'goodnight': 13, 'need': 127, 'shake': 5, 'gloomy': 4, 'feel': 155, 'maybe': 26, 'rain': 45, 'minecraft': 1, 

## find bigram count


In [None]:
# Count every pair of adjacent tokens (w1, w2) over all preprocessed sentences.
# `bigram_count` maps the tuple (w1, w2) to its number of occurrences.
bigram_count = {}

for text in data.preprocessed_text.to_list():
  # Coerce non-string values; the previous `type(text) != 'str'` check compared
  # a type with a string literal and was always True.
  if not isinstance(text, str):
    text = str(text)
  tokens = text.split()
  # zip pairs each token with its successor; nothing is produced for
  # sentences with fewer than two tokens
  for pair in zip(tokens, tokens[1:]):
    bigram_count[pair] = bigram_count.get(pair, 0) + 1

# show the first five bigrams and their counts
for bigram, count in list(bigram_count.items())[:5]:
  print(bigram, ":", count)

('<s>', 'get') : 67
('get', 'thread') : 1
('thread', 'scar') : 1
('scar', '</s>') : 2
('<s>', 'awaisnaseer') : 1


## find probability of bigram

In [None]:
# Unsmoothed bigram probability:
#   P(w2 | w1) = count(w1, w2) / count(w1)
p_bigram = {
  pair: pair_count / unigram_count[pair[0]]
  for pair, pair_count in bigram_count.items()
}

# show the first five bigram probabilities
for pair, prob in list(p_bigram.items())[:5]:
  print(pair, ":", prob)

('<s>', 'get') : 0.01562864473991136
('get', 'thread') : 0.0018832391713747645
('thread', 'scar') : 1.0
('scar', '</s>') : 0.18181818181818182
('<s>', 'awaisnaseer') : 0.00023326335432703523


In [None]:
# Laplace (add-one) smoothed bigram probability for EVERY ordered pair of
# vocabulary words:
#   P(j | i) = (count(i, j) + 1) / (count(i) + V),  V = len(vocab)
#
# NOTE(review): the table holds V * V entries (one per ordered word pair), so
# its memory use grows quadratically with the vocabulary size.
laplace_prob = {}
x = len(vocab)

for first in vocab:
  # the denominator only depends on the first word of the pair
  denominator = unigram_count[first] + x
  for second in vocab:
    # unseen pairs count as 0, which gives 1 / denominator
    laplace_prob[(first, second)] = (bigram_count.get((first, second), 0) + 1) / denominator

# show the first five smoothed probabilities
for pair, prob in list(laplace_prob.items())[:5]:
  print(pair, ":", prob)

NameError: ignored

## Generate text

In [None]:
import random

# generate next word based on previous contenxt
def generate_next_word(prev_context):
    """Randomly sample the word that follows `prev_context`.

    Candidates are the words observed after `prev_context` in `bigram_count`.
    One of them is drawn with probability proportional to its Laplace-smoothed
    bigram probability `laplace_prob[(prev_context, word)]`.

    The smoothed probabilities of the *observed* successors do not sum to 1
    (part of the mass belongs to unseen pairs), so they are used as relative
    weights. Comparing their running total against a number drawn from [0, 1)
    could run past the last candidate and return None.

    Parameters:
        prev_context: the previous word, used as the bigram context.

    Returns:
        The sampled next word, or '</s>' when `prev_context` has no observed
        successor, so the caller always receives a string.
    """
    # words seen directly after the context
    candidates = [second for (first, second) in bigram_count if first == prev_context]
    if not candidates:
        return '</s>'

    weights = [laplace_prob[(prev_context, token)] for token in candidates]
    # random.choices normalises the weights itself
    return random.choices(candidates, weights=weights, k=1)[0]
        


# generate sentance based on no. of words
def generate_sentence():
  """Generate one sentence with the bigram model.

  Starting from '<s>', the next word is requested from `generate_next_word`
  at most `maximum` times. A generated '</s>' is never stored: during the
  first `minimum` attempts it is ignored (the context stays unchanged),
  afterwards it stops the generation. Any other word is appended and becomes
  the new context.

  Returns:
      The generated words joined by single spaces, wrapped in '<s>' and '</s>'.
  """
  minimum = 7
  maximum = 25

  words = ['<s>']    # sentence built so far
  context = '<s>'    # last accepted word

  for attempt in range(1, maximum + 1):
    candidate = generate_next_word(context)
    if candidate != '</s>':
      words.append(candidate)
      context = candidate
    elif attempt > minimum:
      # end marker after enough attempts: stop generating
      break
    # otherwise the end marker came too early and is ignored

  words.append('</s>')
  return ' '.join(words)
    

# Generate and display one sentence (output varies: the sampling is not seeded)
generate_sentence()

NameError: ignored

## Laplace Smoothing

In [None]:
# NOTE: disabled copy of the Laplace smoothing computation; an active version
# of the same code appears in an earlier cell of this notebook.
# laplace_prob = {}
# x = len(vocab)

# for i in vocab:
#   for j in vocab:
#     if (i,j) in bigram_count:
#       laplace_prob[(i,j)] = (bigram_count[(i,j)]+1)/(unigram_count[i] + x) 
#     else:
#       laplace_prob[(i,j)] = 1/(unigram_count[i] + x)

In [None]:
# Number of (word, word) pairs stored in the Laplace probability table
len(laplace_prob)

In [None]:
# Assumes `x` still holds len(vocab) from the Laplace cell: x**2 is then the
# number of ordered word pairs, to compare with len(laplace_prob)
x**2