In [3]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from nltk.tokenize import sent_tokenize, word_tokenize

In [4]:
def remove_between_square_brackets(text):
  return re.sub('\[[^]]*\]', '', text)

def remove_special_characters(text, remove_digits=True):
  pattern=r'[^a-zA-z0-9\s]'
  text=re.sub(pattern,'',text) 
  return text

def get_word_count(text):
    stopwords_list = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    word_count = {}
    for sent in sent_tokenize(text):
        for word in word_tokenize(sent):
            if word in word_count and word not in stopwords_list and word not in string.punctuation:
                word_count[word] += 1
            elif word not in word_count and word not in stopwords_list and word not in string.punctuation:
                word_count[word] = 1
    return word_count

def tokenize_text(text):
    return [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]

def replace_text_stemming(text):
    stemmer = nltk.stem.PorterStemmer()
    stems = [stemmer.stem(word) for word in tokenize_text(text)]
    return " ".join(stems)

def get_vector(text, vocabulary):
    tokens = tokenize_text(text)
    vector = np.zeros(len(vocabulary))
    for token in tokens:
        if token in vocabulary:
            vector[list(vocabulary.keys()).index(token)]+= 1
    return vector
    
def get_features_labels(df, label_column_name):
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', 1, inplace=True)
    labels = df[label_column_name]
    train_features = df.drop(label_column_name, 1)
    return train_features, labels.to_numpy()

def build_vocab(dataset):
    text_corpus = ""
    for i in range(0, dataset.shape[0]):
        text_corpus += " " + dataset.iloc[i]['Text']
    text_corpus = text_corpus.lower()
    text_corpus = remove_between_square_brackets(text_corpus)
    text_corpus = remove_special_characters(text_corpus, True)
    
    stemmed_corpus = replace_text_stemming(text_corpus)
    vocabulary = get_word_count(stemmed_corpus)
    updated_vocab = {}
    for key in vocabulary.keys():
        if(vocabulary[key]>0.01*dataset.shape[0] and vocabulary[key]<2*dataset.shape[0]):
            updated_vocab[key] = vocabulary[key]
    return updated_vocab

In [96]:
def generate_N_grams(ngram, dataset):
  # text_corpus = "Firstly, word tokenization is done[89] where the 1100 stop words are ignored and the remaining words are retained."
  text_corpus = ""
  for i in range(0, dataset.shape[0]):
      text_corpus += " " + dataset.iloc[i]['Text']
  text_corpus = text_corpus.lower()
  text_corpus = remove_between_square_brackets(text_corpus)
  text_corpus = remove_special_characters(text_corpus, True)
  # print("COrpus now:", text_corpus)
  stemmed_corpus = replace_text_stemming(text_corpus)
  # print("COrpus now:", stemmed_corpus)
  # words=[word for word in text.split(" ") if word not in set(stopwords.words('english'))]  
  # print("Sentence after removing stopwords:",words)
  token=word_tokenize(stemmed_corpus)
  # print(type(token))
  stopwords_list = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
  token = [tok for tok in token if tok not in stopwords_list] 
  token = [tok for tok in token if tok not in string.punctuation]
  # for word in word_tokenize(sent):
  temp=zip(*[token[i:] for i in range(0,ngram)])
  ans=[' '.join(ngram) for ngram in temp]
  return ans

In [39]:
def split_dataset(all_data):
    test_data = all_data.sample(random_state = 47, frac = 0.2)
    train_data = all_data.drop(test_data.index)
    
    """
    This function will take in as input the whole dataset and you will have to program how to split the dataset into
    training and test datasets. These are the following requirements:
        -The function must take only one parameter which is all_data as a pandas dataframe of the raw dataset.
        -It must return 2 outputs in the specified order: train and test datasets
        
    It is up to you how you want to do the splitting of the data.
    """
    return train_data, test_data

all_data = pd.read_csv('data.csv', index_col=0)
train_data, test_data = split_dataset(all_data)

# ans = generate_N_grams(4)
# print(len(ans))
# ans

In [87]:
ans = generate_N_grams(3, train_data)
print(len(ans))

<class 'list'>
503027


In [56]:
ans = generate_N_grams(1, train_data)
print(len(ans))

COrpus now: firstli word token is done where the 1100 stop word are ignor and the remain word are retain
<class 'list'>
11


In [64]:
def get_word_count_n_gram(n_gram_token_list):
    # stopwords_list = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    word_count = {}
    for token in n_gram_token_list:
        if token in word_count:
            word_count[token]+=1
        elif token not in word_count:
            word_count[token]=1
    # for sent in sent_tokenize(text):
    #     for word in word_tokenize(sent):
    #         if word in word_count and word not in stopwords_list and word not in string.punctuation:
    #             word_count[word] += 1
    #         elif word not in word_count and word not in stopwords_list and word not in string.punctuation:
    #             word_count[word] = 1
    return word_count

In [109]:
# vocab_n_gram = get_word_count_n_gram(ans)
# print(len(vocab_n_gram))
# updated_vocab_n_gram = {}
# for key in vocab_n_gram.keys():
#     if(vocab_n_gram[key]>6 and vocab_n_gram[key]<200):
#         updated_vocab_n_gram[key] = vocab_n_gram[key]
# print(len(updated_vocab_n_gram))
n_gram_value=1
dataset = train_data
def build_vocab_n_gram(dataset, n_gram_value):
    n_gram_tokens = generate_N_grams(n_gram_value, dataset)
    vocab_n_gram_value = get_word_count_n_gram(n_gram_tokens)
    print(len(vocab_n_gram_value))
    updated_vocab_n_gram_value = {}
    for key in vocab_n_gram_value.keys():
        if(vocab_n_gram_value[key]>17 and vocab_n_gram_value[key]<20000):
            updated_vocab_n_gram_value[key] = vocab_n_gram_value[key]
    # print(len(updated_vocab_n_gram_value))
    return updated_vocab_n_gram_value

final_vocab = build_vocab_n_gram(dataset, n_gram_value)
print(len(final_vocab))

26949
3879


In [98]:
def generate_N_grams_for_sent(ngram, sent):
  # text_corpus = "Firstly, word tokenization is done[89] where the 1100 stop words are ignored and the remaining words are retained."
  text_corpus = sent
#   for i in range(0, dataset.shape[0]):
#       text_corpus += " " + dataset.iloc[i]['Text']
  text_corpus = text_corpus.lower()
  text_corpus = remove_between_square_brackets(text_corpus)
  text_corpus = remove_special_characters(text_corpus, True)
  # print("COrpus now:", text_corpus)
  stemmed_corpus = replace_text_stemming(text_corpus)
  # print("COrpus now:", stemmed_corpus)
  # words=[word for word in text.split(" ") if word not in set(stopwords.words('english'))]  
  # print("Sentence after removing stopwords:",words)
  token=word_tokenize(stemmed_corpus)
  # print(type(token))
  stopwords_list = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
  token = [tok for tok in token if tok not in stopwords_list] 
  token = [tok for tok in token if tok not in string.punctuation]
  # for word in word_tokenize(sent):
  temp=zip(*[token[i:] for i in range(0,ngram)])
  ans=[' '.join(ngram) for ngram in temp]
  return ans

def get_vector_n_gram(text, vocabulary, n_gram_value):
    tokens = generate_N_grams_for_sent(n_gram_value, text)
    vector = np.zeros(len(vocabulary))
    for token in tokens:
        if token in vocabulary:
            vector[list(vocabulary.keys()).index(token)]+= 1
    return vector

In [85]:
text = "spoiled rich kid kelley morse ( chris klein ) receives a new mercedes for a graduation present"
vec = get_vector_n_gram(text, updated_vocab_n_gram, 2)
vec

<class 'list'>


array([0., 0., 0., ..., 0., 0., 0.])

In [92]:
class Logistic():
    def __init__(self):
        """
        The __init__ function initializes the instance attributes for the class. There should be no inputs to this
        function at all. However, you can setup whatever instance attributes you would like to initialize for this
        class. Below, I have just placed as an example the weights and bias of the logistic function as instance
        attributes.
        """
        self.vocabulary = None
        self.weights = None
        self.bias = None
        self.n_gram_value = 2

    def get_vocab(self, dataset):
        self.vocabulary = build_vocab_n_gram(dataset, self.n_gram_value)
    
    def sigmoid(self, z):
	    return(1 / (1 + np.exp(-z))) 
    
    def feature_extraction(self, data):
        """
        Optional helper method to code the feature extraction function to transform the raw dataset into a processed
        dataset to be used in training.
        """
        df = []
        for i in range(0, data.shape[0]):
            df.append(get_vector_n_gram(data.iloc[i]['Text'], self.vocabulary, self.n_gram_value))
        return np.array(df)

    def logistic_loss(self, predicted_label, true_label):
        """
        Optional helper method to code the loss function.
        """

        return -np.sum(np.dot(true_label, np.log(predicted_label)), np.dot(1-true_label, np.log(1-predicted_label)))

    def stochastic_gradient_descent(self, data, error):
        """
        Optional helper method to compute a gradient update for a single point.
        """

        return np.dot(data, error)

    def update_weights(self, learning_rate, gradient):
        """
        Optional helper method to update the weights during stochastic gradient descent.
        """
        new_weights = learning_rate*gradient
        self.weights -= new_weights

    def update_bias(self, learning_rate, error):
        """
        Optional helper method to update the bias during stochastic gradient descent.
        """
        new_bias = np.dot(learning_rate,error)
        self.bias -= new_bias

    def predict_labels(self, data_point):
        """
        Optional helper method to produce predictions for a single data point
        """
        return np.round(self.sigmoid(np.dot(data_point, self.weights)))

    def train(self, labeled_data, learning_rate=0.001, max_epochs=20):
        """
        You must implement this function and it must take in as input data in the form of a pandas dataframe. This
        dataframe must have the label of the data points stored in a column called 'Label'. For example, the column
        labeled_data['Label'] must return the labels of every data point in the dataset. Additionally, this function
        should not return anything.

        The hyperparameters for training will be the learning rate and maximum number of epochs. Once you find the
        optimal values, update the default values for both the learning rate and max epochs keyword argument.

        The goal of this function is to train the logistic function on the labeled data. Feel free to code this
        however you want.
        """
        self.get_vocab(labeled_data)
        X, y = get_features_labels(labeled_data, 'Label')
        X = self.feature_extraction(X)
     
        self.weights = np.zeros(X.shape[1])
        self.bias = 0

        for step in range(0, max_epochs):
            for idx, x_feature in enumerate(X):
                scores = np.dot(x_feature, self.weights) + self.bias
                prediction = self.sigmoid(scores)

                output_error_signal = prediction - y[idx]
                gradient = self.stochastic_gradient_descent(x_feature, output_error_signal)
                self.update_weights(learning_rate, gradient)
                self.update_bias(learning_rate, output_error_signal)

    def predict(self, data):
        predicted_labels = []
        """
        This function is designed to produce labels on some data input. The only input is the data in the form of a 
        pandas dataframe. 

        Finally, you must return the variable predicted_labels which should contain a list of all the 
        predicted labels on the input dataset. This list should only contain integers  that are either 0 (negative) or 1
        (positive) for each data point.

        The rest of the implementation can be fully customized.
        """
        X, y = get_features_labels(data, 'Label')
        
        X = self.feature_extraction(X)

        for feature in X:
            predicted_labels.append(self.predict_labels(feature))
        return predicted_labels

In [None]:
logistic = Logistic()
logistic.train(train_data)
# predicted_train_labels_logistic = logistic.predict(train_data)

In [None]:
predicted_train_labels_logistic = logistic.predict(train_data)

In [100]:
predicted_test_labels_logistic = logistic.predict(test_data)

In [101]:
def accuracy(orig, pred):
    num = len(orig)
    if (num != len(pred)):
        print('Error!! Num of labels are not equal.')
        return
    match = 0
    for i in range(len(orig)):
        o_label = orig[i]
        p_label = pred[i]
        if (o_label == p_label):
            match += 1
    print('***************\nAccuracy: '+str(float(match) / num)+'\n***************')

def eval(o_train, p_train, o_val, p_val):
    print('\nTraining Accuracy Result!')
    accuracy(o_train, p_train)
    print('\nTesting Accuracy Result!')
    accuracy(o_val, p_val)

print('\n\n-------------Logistic Function Performance-------------\n')
    # This command also runs the evaluation on the unseen test
eval(train_data['Label'].tolist(), predicted_train_labels_logistic, test_data['Label'].tolist(),
         predicted_test_labels_logistic)



-------------Logistic Function Performance-------------


Training Accuracy Result!
***************
Accuracy: 0.9485294117647058
***************

Testing Accuracy Result!
***************
Accuracy: 0.7235294117647059
***************


In [15]:
def generate_ngrams(s, n):
    # Convert to lowercases
    s = s.lower()
    
    # Replace all none alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    
    # Break sentence in the token, remove empty tokens
    token = [token for token in s.split(" ") if token != ""]
    
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[token[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]
sent = "Firstly, word tokenization is done[89] where the 1100 stop words are ignored and the remaining words are retained."
ans = generate_ngrams(sent, 2)
ans

['firstly word',
 'word tokenization',
 'tokenization is',
 'is done',
 'done 89',
 '89 where',
 'where the',
 'the 1100',
 '1100 stop',
 'stop words',
 'words are',
 'are ignored',
 'ignored and',
 'and the',
 'the remaining',
 'remaining words',
 'words are',
 'are retained']