In [1]:
from ipynb.fs.full.NeededModules import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/blakedickerson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/blakedickerson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/blakedickerson/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Comment Model Functions

## Section 2 Functions

In [2]:
def preprocess_columns(dataframe):
    """
        Remove every space character from the column names of a dataframe.

        The dataframe is changed in place and nothing is returned.

        args:
            dataframe: pandas dataframe whose column names are strings
    """
    cleaned_names = dataframe.columns.str.replace(' ', '')
    dataframe.columns = cleaned_names

In [3]:
def clean_comment_number_column(comment_number):
    """
        Strip the whitespace to the left of a single comment number value

        args:
            comment_number: string holding the comment number, possibly
                            preceded by whitespace

        return:
            comment_number: the same string without its leading whitespace
    """
    # One lstrip() call removes all leading whitespace; looping over the
    # characters and stripping again each time gave the same result.
    return comment_number.lstrip()

In [4]:
def clean_null(df, row_index=19999, comment_number='4'):
    """
        Repair, in place, a row whose comment text is stored inside the
        'comment_number' column.

        The text is recovered by dropping every digit character from that
        cell and is written to 'comment'; 'comment_number' is then set to the
        supplied value. Nothing is returned.

        args:
            df: pandas dataframe with 'comment_number' and 'comment' columns
            row_index (int): position (as used by iloc) of the row to repair
            comment_number (object): value stored in 'comment_number' afterwards
    """
    number_col = df.columns.get_loc('comment_number')
    comment_col = df.columns.get_loc('comment')

    # This cell holds the comment number and the comment text together
    merged_cell = df.iloc[row_index, number_col]
    # Keep every non-digit character; surrounding whitespace is left untouched
    recovered_comment = ''.join(char for char in merged_cell if not char.isdigit())

    # Write by position directly into the frame rather than editing a row
    # taken out of it, so the update never depends on a temporary copy
    df.iloc[row_index, comment_col] = recovered_comment
    df.iloc[row_index, number_col] = comment_number

In [5]:
def expand_contractions(comment):
    """
        Expand the contractions found in a single comment.

        args:
            comment (object): text of one comment

        return:
            comment: the text produced by contractions.fix for that comment
    """
    return contractions.fix(comment)

In [6]:
def remove_punctuations(comment):
    """
        Delete every character listed in string.punctuation from a comment.

        args:
            comment (object): text of one comment

        return:
            comment: the text with all punctuation characters removed
    """
    # A translation table whose third argument lists the characters to delete
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return comment.translate(drop_punctuation)

In [7]:
def to_lower(comment):
    """
        This function will convert a given text to lower case

        args:
            comment: string to convert

        return:
            comment: the lower-cased string
    """
    # lower() converts the whole string at once; the former loop repeated the
    # same call once per character without changing the result.
    return comment.lower()

In [8]:
def strip_whitespace(df, column_name):
    """
        Remove all whitespace from the values of a given column.

        Every run of whitespace is deleted, including whitespace between
        characters, not only leading and trailing whitespace. The column is
        overwritten in place.

        args:
            df: pandas dataframe
            column_name: name of a column holding strings

        return:
            df: the same pandas dataframe with the column updated
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence
    df[column_name] = df[column_name].str.replace(r'\s+', '', regex=True)
    return df

In [9]:
def classify_images(dataframe, column_name, new_column_name):
    """
        Assign an integer label to every distinct value of a column.

        Labels are handed out in order of first appearance, starting at 0,
        and written to `new_column_name`. The dataframe is changed in place
        and nothing is returned.

        args:
            dataframe: pandas dataframe
            column_name (object): column holding the values to label
                                  (for example the image names)
            new_column_name (object): column that receives the integer labels
    """
    label_by_value = {}  # distinct value -> integer label
    # setdefault returns the label already stored for a value, or stores and
    # returns the next free one (the current dict size) on first sight
    labels = [
        label_by_value.setdefault(value, len(label_by_value))
        for value in dataframe[column_name].tolist()
    ]
    # Only the requested column is written; no hard-coded 'label' column is
    # created as a side effect
    dataframe[new_column_name] = labels

In [10]:
def count_words(comment):
    """
        Count the whitespace-separated words of a single comment.

        args:
            comment (object): text of one comment

        returns:
            int: number of words in the comment
    """
    words = comment.split()  # split() without arguments breaks on any whitespace run
    return len(words)

In [11]:
def count_unique_words(comment):
    """
        Count the distinct whitespace-separated words of a single comment.

        Words are compared exactly, so different capitalisations count as
        different words.

        args:
            comment (object): text of one comment

        returns:
            int: number of distinct words in the comment
    """
    distinct_words = set()
    distinct_words.update(comment.split())
    return len(distinct_words)

In [12]:
def count_unique_words_vs_word_count(num_words, num_unique):
    """
        Compute the share of unique words among all words of a sentence.

        args:
            num_words (int): number of words per sentence
            num_unique (int): number of unique words per sentence

        returns:
            Float: num_unique divided by num_words
    """
    unique_ratio = num_unique / num_words
    return unique_ratio

## Section 3 Functions

In [13]:
def generate_Xids(num_samples, seq_len):
    """
        Create a zero-filled 2d numpy array with one row per sample.

        args:
            num_samples (int): number of rows
            seq_len (int): number of columns

        return:
            Xids: numpy array of zeros with shape (num_samples, seq_len)
    """
    shape = (num_samples, seq_len)
    return np.zeros(shape)

In [14]:
def generate_attn_msks(num_samples, seq_len):
    """
        Create a zero-filled 2d numpy array with one row per sample.

        args:
            num_samples (int): number of rows
            seq_len (int): number of columns

        return:
            attn_masks: numpy array of zeros with shape (num_samples, seq_len)
    """
    return np.zeros(shape=(num_samples, seq_len))

# Fix function below to take in any column instead of comment

In [15]:
def generate_encoded_values(df, ids, masks, tokenizer, sequence_len, column):
    """
        Encode every entry of one dataframe column, one at a time, and store
        the result row by row in the supplied arrays.

        args:
            df: pandas dataframe
            ids: 2d array that receives the input ids, one row per entry
            masks: 2d array that receives the attention masks, one row per entry
            tokenizer: huggingface AutoTokenizer
            sequence_len (int): length every entry is padded or truncated to
            column (object): name of the column in df holding the text to encode

        return:
            ids: input ids to pass to BERT model
            masks: attention masks to pass to BERT model
    """
    # Read the column the caller asked for (it used to be hard-coded to
    # 'comment'). Wrapping the column itself lets tqdm see its length.
    for i, text in enumerate(tqdm(df[column])):
        tokenized_text = tokenizer.encode_plus(
                              text,
                              max_length=sequence_len,      #max length of each entry so we can truncate if needed
                              truncation=True,              #if the entry is longer than max_length only take up to max_length values
                              padding='max_length',         #make all elements the same size of max_length
                              add_special_tokens=True,      #add [cls] [pad] [sep]
                              return_tensors='tf'           #return the data as tensorflow tensors
        )
        ids[i, :] = tokenized_text.input_ids          #fill row i with the input ids
        masks[i, :] = tokenized_text.attention_mask   #fill row i with the attention mask
    return ids, masks

In [16]:
def generate_encoded_values_dynamically(df, ids, masks, tokenizer, sequence_len, column):
    """
        Encode every entry of the chosen dataframe column, one at a time, and
        store the result row by row in the supplied arrays.

        args:
            df: pandas dataframe
            ids: 2d array that receives the input ids, one row per entry
            masks: 2d array that receives the attention masks, one row per entry
            tokenizer: huggingface AutoTokenizer
            sequence_len (int): length every entry is padded or truncated to
            column (object): name of the column in df holding the text to encode

        return:
            ids: input ids to pass to BERT model
            masks: attention masks to pass to BERT model
    """
    encode_options = dict(
        max_length=sequence_len,   #longest allowed entry, used for truncation and padding
        truncation=True,           #cut entries that are longer than max_length
        padding='max_length',      #pad shorter entries up to max_length
        add_special_tokens=True,   #add [cls] [pad] [sep]
        return_tensors='tf',       #return the data as tensorflow tensors
    )
    for row, entry in tqdm(enumerate(df[column])):
        encoding = tokenizer.encode_plus(entry, **encode_options)
        ids[row, :] = encoding.input_ids          #fill this row with the input ids
        masks[row, :] = encoding.attention_mask   #fill this row with the attention mask
    return ids, masks

## Section 5 Functions

In [17]:
def generate_train_loss_plots(num_epochs, model_name):
    """
        Plot the recorded metrics over a fixed amount of epochs, one subplot
        per metric pair.

        The keys of `model_name.history` are split into two halves and the
        i-th key of the first half is drawn together with the i-th key of the
        second half (presumably the training metric and its validation
        counterpart - verify against how the history was recorded).

        args:
            num_epochs (int): number of epochs trained for; must match the
                              number of values stored per metric
            model_name (object): object exposing a `history` dict that maps
                                 metric names to one value per epoch

        raises:
            ValueError: if the history holds fewer than two entries, so no
                        pair can be formed
    """
    epochs = list(range(1, num_epochs + 1))
    history = model_name.history
    metric_list = list(history.keys())
    num_metrics = len(metric_list) // 2

    if num_metrics == 0:
        raise ValueError("history needs at least two entries to plot a metric pair")

    # squeeze=False always returns a 2d grid of axes. Without it a single
    # subplot comes back as a bare Axes object that cannot be indexed.
    _, axes = plt.subplots(nrows=1, ncols=num_metrics, figsize=(30, 5), squeeze=False)

    for i, axis in enumerate(axes[0]):
        first_name = metric_list[i]
        second_name = metric_list[i + num_metrics]
        axis.plot(epochs, history[first_name], marker="o", label=first_name.replace("_", " "))
        axis.plot(epochs, history[second_name], marker="o", label=second_name.replace("_", " "))
        axis.set_xlabel("epochs", fontsize=14)
        axis.set_title(first_name.replace("_", " "), fontsize=20)
        axis.legend(loc="lower left")

## Section 6 Functions

In [18]:
def generate_num_classes(num_classes):
    """
        Build the list of class ids 0, 1, ..., num_classes - 1.

        args:
            num_classes (int): number of classes generated

        return:
            class_list (list of int): every class id in ascending order
    """
    return list(range(num_classes))

In [19]:
def prepare_data(text, tokenizer, seq_len):
    '''
        Encode one string and cast both encoder outputs to tf.float64.

        args:
            text: a string input
            tokenizer: tokenizer exposing encode_plus, e.g. the
                       'bert-base-cased' tokenizer that we used previously
            seq_len (int): length the text is padded or truncated to

        return:
            dict with the keys 'input_ids' and 'attention_mask', each holding
            the corresponding tokenizer output cast to tf.float64
    '''
    encoding = tokenizer.encode_plus(
        text,
        max_length=seq_len,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf',
    )
    # Same two fields, same order, each cast to float64
    return {
        field: tf.cast(getattr(encoding, field), tf.float64)
        for field in ('input_ids', 'attention_mask')
    }

In [20]:
def make_prediction(model, processed_data, classes):
    '''
        Pick the class with the highest predicted score for one input.

        Only the first row returned by model.predict is used.

        args:
            model: object exposing predict, e.g. the BERT model that was created
            processed_data: tokenized data passed straight to model.predict
            classes: class list, indexed by position

        return:
            the entry of classes at the index of the largest predicted value
    '''
    first_row_scores = model.predict(processed_data)[0]
    best_index = np.argmax(first_row_scores)
    return classes[best_index]

## Noisy Tag Model Functions

In [21]:
def extract_keywords(df, 
                     column_name, 
                     language='en', 
                     max_ngram_size=3, 
                     dedupLim=0.9, 
                     dedupFunc='seqm', 
                     num_keywords=5):
    '''
        This function will use the yake library to grab the highest weighted tags from each comment
        and store them in a new 'keywords' column
        
        args:
            df: pandas dataframe
            column_name: the name of the column needed to extract information from
            language: language code handed to yake (default english)
            max_ngram_size: maximum number of n-grams to consider
            dedupLim: deduplication threshold cutoff handed to yake
            dedupFunc: deduplication algorithm specification handed to yake
            num_keywords: number of keywords to extract
            
        return:
            the same pandas dataframe, with the 'keywords' column added
    '''

    # dedupLim and dedupFunc are now forwarded; before they were accepted but
    # silently ignored. The keyword names are the ones yake used when this
    # notebook was written - confirm them if yake is upgraded.
    kw_extractor = yake.KeywordExtractor(lan=language,
                                         n=max_ngram_size,
                                         dedupLim=dedupLim,
                                         dedupFunc=dedupFunc,
                                         top=num_keywords)
    # One list of extracted keywords (the 'noisy tags') per sentence
    df['keywords'] = [kw_extractor.extract_keywords(sentence) for sentence in df[column_name]]
    return df

In [22]:
def extract_nouns_verbs(df, column_name):
    '''
        Tag every sentence of a column with NLTK and keep the words whose
        part-of-speech tag starts with 'N' or 'V' (the nouns and verbs).
        The result is stored in a new 'noisy_tags' column.

        args:
            df: pandas dataframe
            column_name: the name of the column needed to extract information from

        return:
            the same pandas dataframe, with the 'noisy_tags' column added
    '''

    def _nouns_and_verbs(sentence):
        # tokenize, tag, then filter on the first letter of the tag
        tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))
        return [word for word, tag in tagged_words if tag.startswith(('N', 'V'))]

    df['noisy_tags'] = [_nouns_and_verbs(sentence) for sentence in df[column_name]]
    return df

In [23]:
def process_noisy_tags(df, column_name):
    '''
        Turn each list of tags in a column into one space-separated string.

        The entries of every list are first joined with commas, then every
        comma in the resulting string is replaced by a space. The column is
        overwritten in place.

        args:
            df: pandas dataframe
            column_name: the name of the column holding the lists of tags

        return:
            the same pandas dataframe with the column updated
    '''
    comma_joined = []
    for tags in df[column_name]:
        comma_joined.append(','.join(str(tag) for tag in tags))
    df[column_name] = comma_joined

    space_separated = df[column_name].str.replace(',', ' ')
    df[column_name] = space_separated
    return df