<a href="https://colab.research.google.com/github/balyashukla1/NLP/blob/master/tweet-pre-processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import re
from collections import defaultdict
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

## PROBLEM 1: 

First, I will import the dataset and do the pre-processing:

In [0]:
# Import the data 
tweets_corpus = pd.read_csv("tweets_corpus1.txt", sep = "	", header = None)
tweets_corpus.columns = ['tweet_id', 'tweet']

            
# Pre-processes the data
tweets_corpus["processed_tweet"] = tweets_corpus['tweet'].str.replace('[^\w\s]','')
tweets_corpus.loc[:,"processed_tweet"] = tweets_corpus.processed_tweet.apply(lambda x : str.lower(x))
tweets_corpus['processed_tweet'] = tweets_corpus.apply(lambda row: nltk.word_tokenize(row['processed_tweet']), axis=1)

#print(tweets_corpus["processed_tweet"])
tweets_corpus.insert(0, 'Doc_ID', range(1, len(tweets_corpus)+1))
tweets_corpus.drop('tweet_id', axis=1, inplace=True)
print(tweets_corpus)

    Doc_ID                                              tweet  \
0        1  Bandaging up my paper-cuts , having cheesecake...   
1        2  I haven't had any krispy kremes or strawberry ...   
2        3  Bacon/cheddar slider topped w/fried egg & Blue...   
3        4              Nacho w/ cheese on my shirt ! Uggghhh   
4        5  you aint nuffin but a piece of cheese without ...   
5        6  TAG_USERNAME TAG_USERNAME Mmmm ... cheese ... ...   
6        7                                        Mmmm cheese   
7        8  TAG_USERNAME 1st off I'm like 1 year younger t...   
8        9  RT TAG_USERNAME : I want a steak and cheese eg...   
9       10  think imma eat some cheesecake befor i lay dow...   
10      11  A mixed one mostly strawberry peach little whi...   
11      12  My stomach was yelling at me telling me to get...   
12      13  chocolate mint , cookies & cream , very berry ...   
13      14  I think I want some cheese eggs and pancakes ....   
14      15  TAG_USERNAME 

In [0]:
## Creating a list of list of words in each document
processed_tweets_list = tweets_corpus["processed_tweet"].tolist()
remove_words = ["tag_hashtags","rt", "tag_username", "tag_final_hashtags"]
for lists in processed_tweets_list:
    for word in lists:
        if word in remove_words:
            lists.remove(word)
print(processed_tweets_list)
#dct = {tuple(key): idx for idx, key in enumerate(processed_tweets_list)}
#print(dct)

[['bandaging', 'up', 'my', 'papercuts', 'having', 'cheesecake', 'for', 'dinner', 'and', 'calling', 'it', 'a', 'night', 'were', 'doin', 'it', 'big', 'here', 'in', 'nyc'], ['i', 'havent', 'had', 'any', 'krispy', 'kremes', 'or', 'strawberry', 'trifles', 'since', 'i', 'started', 'gym', 'cries'], ['baconcheddar', 'slider', 'topped', 'wfried', 'egg', 'blue', 'cheese', 'slider', 'topped', 'wavocado', 'purple', 'cherokee', 'tomato'], ['nacho', 'w', 'cheese', 'on', 'my', 'shirt', 'uggghhh'], ['you', 'aint', 'nuffin', 'but', 'a', 'piece', 'of', 'cheese', 'without', 'the', 'corners', 'in', 'other', 'words', 'you', 'will', 'never', 'be', 'a', 'slice', 'bitch'], ['tag_username', 'mmmm', 'cheese', 'dreaming', 'of', 'a', 'squirrel', 'burger', 'with', 'cheese'], ['mmmm', 'cheese'], ['1st', 'off', 'im', 'like', '1', 'year', 'younger', 'than', 'u', '2nd', 'age', 'is', 'just', 'a', 'number', '3rd', 'ima', 'cater', 'ur', 'wedding', 'wit', 'patty', 'n', 'cheese'], ['tag_username', 'i', 'want', 'a', 'steak'

In [0]:
#dictionary = {k: v for v, k in enumerate(processed_tweets_list)}
#dictionary = dict(zip(docID, processed_tweets_list))
#print(dictionary)    

In [0]:
## Creating an inverse index
inv_indx = defaultdict(list)
for idx, text in enumerate(processed_tweets_list):
    for word in text:
        inv_indx[word].append(idx)
inv_indx

defaultdict(list,
            {'bandaging': [0],
             'up': [0, 11],
             'my': [0, 3, 11, 11, 12, 13],
             'papercuts': [0],
             'having': [0],
             'cheesecake': [0, 9, 15],
             'for': [0, 14, 15],
             'dinner': [0],
             'and': [0, 8, 12, 13, 14, 15, 16],
             'calling': [0],
             'it': [0, 0, 9, 14, 17, 20, 20],
             'a': [0, 4, 4, 5, 7, 8, 9, 10, 11, 13, 14],
             'night': [0, 20],
             'were': [0],
             'doin': [0],
             'big': [0],
             'here': [0],
             'in': [0, 4, 9, 12],
             'nyc': [0],
             'i': [1, 1, 8, 9, 11, 13, 13, 13, 15, 17, 20, 23],
             'havent': [1, 9],
             'had': [1, 9, 19],
             'any': [1],
             'krispy': [1],
             'kremes': [1],
             'or': [1, 10, 22],
             'strawberry': [1, 10, 12, 14, 15, 17, 18, 19, 22],
             'trifles': [1],
             's

In [0]:
## Creating a dataframe from the set of inverted index

new_dict = {a:list(set(b)) for a, b in inv_indx.items()}


keys, values = zip(*new_dict.items()) 
lists = [keys, values]

df = pd.concat([pd.Series(x) for x in lists], axis=1)
df['doc_freq'] = df[1].str.len()
df = df.rename({0:'term', 1:'docs'}, axis=1)
print(df.to_string())

             term                                           docs  doc_freq
0       bandaging                                            [0]         1
1              up                                        [0, 11]         2
2              my                             [0, 3, 11, 12, 13]         5
3       papercuts                                            [0]         1
4          having                                            [0]         1
5      cheesecake                                     [0, 9, 15]         3
6             for                                    [0, 14, 15]         3
7          dinner                                            [0]         1
8             and                     [0, 8, 12, 13, 14, 15, 16]         7
9         calling                                            [0]         1
10             it                             [0, 9, 14, 17, 20]         5
11              a             [0, 4, 5, 7, 8, 9, 10, 11, 13, 14]        10
12          night        

## PROBLEM 2: 

In [0]:
## creating a dataframe for boolean operations 

s = pd.Series(new_dict)
df_bo =  pd.get_dummies(s.apply(pd.Series).stack()).sum(level=0)

df_bo.columns = df_bo.columns.astype(int)
df_bo

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
bandaging,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
up,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
my,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
papercuts,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
having,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
casserole,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
deck,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
sum,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
ok,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [0]:
#(df_boo["term"] == "bandaging").iloc[1]
#print(df_boo.loc[df_boo["term"] == "my"])


def merge(term1, term2):
    global intersect
    global union
    global diff
    #term1 =! ""
    #term2 =! ""
    #t1 = df_bo.loc[[term1]].to_dict()
    #t2 = df_bo.loc[[term2]]
    #intersect = get_intersect1ds(t1, t2)
    sum_ = df_bo.loc[[term1, term2]].sum().to_frame().transpose()
    intersect = [col for col in sum_ if (sum_[col] == 2).any()]
    union = [col for col in sum_ if (sum_[col] >= 1).any()]  
    diff = [col for col in sum_ if (sum_[col] == 1).any()]
    #return sum_
    #return "Doc_ID:", res
    return ("AND OPERATOR: Doc_ID:", intersect,
          "OR OPERATOR: Doc_ID:", union,
         "AND NOT OPERATOR: Doc_ID:", diff,)


In [0]:
## Testing the merge function on terms "it" and "up"
merge("it", "up")

('AND OPERATOR: Doc_ID:',
 [0],
 'OR OPERATOR: Doc_ID:',
 [0, 9, 11, 14, 17, 20],
 'AND NOT OPERATOR: Doc_ID:',
 [9, 11, 14, 17, 20])

## PROBLEM 3: 

In [0]:
## Expanding existing function

def merge_tfidf(term1, term2):
    global intersect2
    global union2
    global diff2
    sum_ = df_bo.loc[[term1, term2]].sum().to_frame().transpose()
    intersect2 = [col for col in sum_ if (sum_[col] == 2).any()]
    tweettt = tweets_corpus.iloc[intersect2]
    ##print(tweettt)
    cor = tweettt["tweet"].to_list()
    ## settings up the tfid vectorizer using the cor list
    tfidf_vectorizer=TfidfVectorizer(use_idf=True)
    tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(cor)
    first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]
    ## creating a pandas dataframe for the terms and TF_IDF
    df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["TF_IDF_SCORE"])
    df2 = df.sort_values(by=["TF_IDF_SCORE"],ascending=False)
    df2 = df2.reset_index()
    df2 = df2.rename(columns={"index": "TERM"})
    return (df2)

In [0]:
## Testing the merge_tfidf function on terms "it" and "up"
merge_tfidf("it", "up")

Unnamed: 0,TERM,TF_IDF_SCORE
0,it,0.417029
1,and,0.208514
2,bandaging,0.208514
3,up,0.208514
4,re,0.208514
5,paper,0.208514
6,nyc,0.208514
7,night,0.208514
8,my,0.208514
9,in,0.208514
