In [51]:
import pandas as pd
import numpy as np
import nltk
import string
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\garga\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [44]:
data = pd.read_csv('./data.csv')
data.drop('Unnamed: 0', 1, inplace=True)
data.head(5)

Unnamed: 0,Text,Label
0,spoiled rich kid kelley morse ( chris klein ) ...,0
1,the bond series is an island in the film world...,1
2,"tarzan chad'z = good ) 1999 , g , 90 minutes [...",1
3,a frequent error is the categorization of a te...,1
4,"part one of "" the strangest movies ever made ""...",1


In [54]:
def tokenize_text(text):
  return [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]

In [77]:
# data['Label'].value_counts()

stopwords_list = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
def remove_stopwords(text):
    words = [word for word in word_tokenize(text) if word.lower() not in stopwords_list]
    return " ".join(words)

def get_word_count(text):
    word_count = {}
    for sent in sent_tokenize(text):
        for word in word_tokenize(sent):
            if word in word_count and word not in stopwords_list and word not in string.punctuation:
                word_count[word] += 1
            elif word not in word_count and word not in stopwords_list and word not in string.punctuation:
                word_count[word] = 1
    return word_count 

In [78]:
text_corpus = ""
for i in range(0, data.shape[0]):
    text_corpus += " " + data.iloc[i]['Text']
text_corpus = text_corpus.lower()


In [79]:
stemmer = nltk.stem.PorterStemmer()
def replace_text_stemming(text):
  stems = [stemmer.stem(word) for word in tokenize_text(text)]
  return " ".join(stems)

In [80]:
stemmed_corpus = replace_text_stemming(text_corpus)
len(stemmed_corpus)
# stemmed_corpus

6018307

In [81]:
# stemmed_corpus_without_stopwords = remove_stopwords(stemmed_corpus)
# len(stemmed_corpus)

In [82]:
vocabulary = get_word_count(stemmed_corpus)
len(vocabulary)

30332

In [86]:
def get_vector(text, vocabulary):
    tokens = tokenize_text(text)
    vector = np.zeros(len(vocabulary))
    for token in tokens:
        if token in vocabulary:
            vector[list(vocabulary.keys()).index(token)]+= 1
    return vector

In [87]:
def get_features_labels(df, label_column_name):
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', 1, inplace=True)
    labels = df[label_column_name]
    train_features = df.drop(label_column_name, 1)
    return train_features, labels.to_numpy()

In [90]:
vector_first = get_vector(data.iloc[1]['Text'], vocabulary)
vector_first

array([1., 0., 0., ..., 0., 0., 0.])

In [95]:
class Perceptron():
    def __init__(self):
        """
        The __init__ function initializes the instance attributes for the class. There should be no inputs to this
        function at all. However, you can setup whatever instance attributes you would like to initialize for this
        class. Below, I have just placed as an example the weights and bias of the perceptron as instance attributes.
        """
        self.vocabulary = None
        self.weights = None
        self.bias = None

    def build_vocab(self, dataset):
        text_corpus = ""
        for i in range(0, dataset.shape[0]):
            text_corpus += " " + dataset.iloc[i]['Text']
        text_corpus = text_corpus.lower()
        
        stemmed_corpus = replace_text_stemming(text_corpus)
        vocabulary = get_word_count(stemmed_corpus)
        self.vocabulary = vocabulary

    def feature_extraction(self, data):
        """
        Optional helper method to code the feature extraction function to transform the raw dataset into a processed
        dataset to be used in perceptron training.
        """
        df = []
        for i in range(0, data.shape[0]):
            df.append(get_vector(data.iloc[i]['Text'], self.vocabulary))
        return np.array(df)

    def sgn_function(self, perceptron_input):
        """
        Optional helper method to code the sign function for the perceptron.
        """
        if perceptron_input <=0: return 0
        return 1

    def update_weights(self, update, feature):
        """
        Optional helper method to update the weights of the perceptron.
        """
        new_weights = update * feature
        self.weights += new_weights

    def update_bias(self, update):
        """
        Optional helper method to update the bias of the perceptron.
        """
        self.bias += update

    def predict_labels(self, data_point):
        """
        Optional helper method to produce predictions for a single data point.
        """
        prediction = np.dot(data_point, self.weights) + self.bias
        return self.sgn_function(prediction)

    def train(self, labeled_data, learning_rate=None, max_iter=None):
        """
        You must implement this function and it must take in as input data in the form of a pandas dataframe. This
        dataframe must have the label of the data points stored in a column called 'Label'. For example, the column
        labeled_data['Label'] must return the labels of every data point in the dataset. Additionally, this function
        should not return anything.

        The hyperparameters for training will be the learning rate and max number of iterations. Once you find the
        optimal values of the hyperparameters, update the default values for each keyword argument to reflect those
        values.

        The goal of this function is to train the perceptron on the labeled data. Feel free to code this however you
        want.
        """
        self.build_vocab(labeled_data)
        X, y = get_features_labels(labeled_data, 'Label')
        X = self.feature_extraction(X)

        # initialize weights
        self.weights = np.zeros(X.shape[1])
        self.bias = 0

        for _ in range(max_iter):
            
            for idx, x_feature in enumerate(X):
                output = np.dot(x_feature, self.weights) + self.bias
                y_predicted = self.sgn_function(output)

                # Perceptron update rule
                update = learning_rate * (y[idx] - y_predicted)
                self.update_weights(update, x_feature)
                self.update_bias(update)
        return

    def predict(self, data):
        predicted_labels = []
        """
        This function is designed to produce labels on some data input. The first input is the data in the form of a 
        pandas dataframe. 
        
        Finally, you must return the variable predicted_labels which should contain a list of all the 
        predicted labels on the input dataset. This list should only contain integers that are either 0 (negative) or 1
        (positive) for each data point.
        
        The rest of the implementation can be fully customized.
        """
        X, y = get_features_labels(data, 'Label')
        X = self.feature_extraction(X)

        for feat in X:
            predicted_labels.append(self.predict_labels(feat))
        return predicted_labels


In [96]:
model_perceptron  = Perceptron()

In [97]:
dataset = pd.read_csv('data.csv')

In [98]:
model_perceptron.train(dataset, 0.1, 50)

In [99]:
prediction_perceptron = model_perceptron.predict(dataset)

In [101]:
true_labels = dataset['Label']
predictions_correct = np.array(prediction_perceptron) == true_labels
predictions_correct.mean()

1.0

In [103]:
test_data = pd.read_csv('./test_data.csv')
test_pred = model_perceptron.predict(test_data)
test_labels = test_data['Label']
predictions_correct = np.array(test_pred) == test_labels
predictions_correct.mean()

1.0

In [104]:
def get_word_count(text):
    stopwords_list = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    word_count = {}
    for sent in sent_tokenize(text):
        for word in word_tokenize(sent):
            if word in word_count and word not in stopwords_list and word not in string.punctuation:
                word_count[word] += 1
            elif word not in word_count and word not in stopwords_list and word not in string.punctuation:
                word_count[word] = 1
    return word_count

def tokenize_text(text):
    return [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]

def replace_text_stemming(text):
    stemmer = nltk.stem.PorterStemmer()
    stems = [stemmer.stem(word) for word in tokenize_text(text)]
    return " ".join(stems)

def get_vector(text, vocabulary):
    tokens = tokenize_text(text)
    vector = np.zeros(len(vocabulary))
    for token in tokens:
        if token in vocabulary:
            vector[list(vocabulary.keys()).index(token)]+= 1
    return vector
    
def get_features_labels(df, label_column_name):
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', 1, inplace=True)
    labels = df[label_column_name]
    train_features = df.drop(label_column_name, 1)
    return train_features, labels.to_numpy()

def build_vocab(dataset):
    text_corpus = ""
    for i in range(0, dataset.shape[0]):
        text_corpus += " " + dataset.iloc[i]['Text']
    text_corpus = text_corpus.lower()
    
    stemmed_corpus = replace_text_stemming(text_corpus)
    vocabulary = get_word_count(stemmed_corpus)
    return vocabulary

In [108]:
class Logistic():
    def __init__(self):
        """
        The __init__ function initializes the instance attributes for the class. There should be no inputs to this
        function at all. However, you can setup whatever instance attributes you would like to initialize for this
        class. Below, I have just placed as an example the weights and bias of the logistic function as instance
        attributes.
        """
        self.vocabulary = None
        self.weights = None
        self.bias = None

    def get_vocab(self, dataset):
        self.vocabulary = Pre.build_vocab(dataset)
    
    def sigmoid(self, z):
	    return(1 / (1 + np.exp(-self.z))) 
    
    def feature_extraction(self, data):
        """
        Optional helper method to code the feature extraction function to transform the raw dataset into a processed
        dataset to be used in training.
        """
        df = []
        for i in range(0, data.shape[0]):
            df.append(Pre.get_vector(data.iloc[i]['Text'], self.vocabulary))
        return np.array(df)

    def logistic_loss(self, predicted_label, true_label):
        """
        Optional helper method to code the loss function.
        """

        return - np.sum(np.dot(true_label, np.log(predicted_label)), np.dot(1-true_label, np.log(1-predicted_label)))

    def stochastic_gradient_descent(self, data, error):
        """
        Optional helper method to compute a gradient update for a single point.
        """

        return np.dot(data, error)

    def update_weights(self, learning_rate, gradient):
        """
        Optional helper method to update the weights during stochastic gradient descent.
        """
        new_weights = learning_rate*gradient
        self.weights -= new_weights

    def update_bias(self, learning_rate, error):
        """
        Optional helper method to update the bias during stochastic gradient descent.
        """
        new_bias = np.dot(learning_rate,error)
        self.bias -= new_bias

    def predict_labels(self, data_point):
        """
        Optional helper method to produce predictions for a single data point
        """
        return np.round(self.sigmoid(np.dot(data_point, self.weights)))

    def train(self, labeled_data, learning_rate=0.1, max_epochs=50):
        """
        You must implement this function and it must take in as input data in the form of a pandas dataframe. This
        dataframe must have the label of the data points stored in a column called 'Label'. For example, the column
        labeled_data['Label'] must return the labels of every data point in the dataset. Additionally, this function
        should not return anything.

        The hyperparameters for training will be the learning rate and maximum number of epochs. Once you find the
        optimal values, update the default values for both the learning rate and max epochs keyword argument.

        The goal of this function is to train the logistic function on the labeled data. Feel free to code this
        however you want.
        """
        self.get_vocab(labeled_data)
        X, y = Pre.get_features_labels(labeled_data, 'Label')
        X = self.feature_extraction(X)

        bias = np.ones((X.shape[0], 1))
        X = np.concatenate((bias, X), axis=1)

        # initialize weights
        self.weights = np.zeros(X.shape[1])
        self.bias = 0

        for step in range(0, max_epochs):
            for idx, x_feature in enumerate(X):
                scores = np.dot(x_feature, self.weights)
                prediction = self.sigmoid(scores)

                # Update weights with gradient
                output_error_signal = y[idx] - prediction
                gradient = self.stochastic_gradient_descent(x_feature, output_error_signal)
                self.update_weights(learning_rate, gradient)
                self.update_bias(learning_rate, output_error_signal)
        return

    def predict(self, data):
        predicted_labels = []
        """
        This function is designed to produce labels on some data input. The only input is the data in the form of a 
        pandas dataframe. 

        Finally, you must return the variable predicted_labels which should contain a list of all the 
        predicted labels on the input dataset. This list should only contain integers  that are either 0 (negative) or 1
        (positive) for each data point.

        The rest of the implementation can be fully customized.
        """
        X, y = Pre.get_features_labels(data, 'Label')
        X = self.feature_extraction(X)

        for feature in X:
            predicted_labels.append(self.predict_labels(feature))
        return predicted_labels

In [109]:
model_logistic = Logistic()
dataset = pd.read_csv('data.csv')

In [110]:
model_perceptron.train(dataset, 0.1, 50)
# prediction_perceptron = model_perceptron.predict(dataset)
# true_labels = dataset['Label']
# predictions_correct = np.array(prediction_perceptron) == true_labels
# predictions_correct.mean()

In [111]:
prediction_perceptron = model_perceptron.predict(dataset)
true_labels = dataset['Label']
predictions_correct = np.array(prediction_perceptron) == true_labels
predictions_correct.mean()

1.0

In [116]:
def get_kfold_split(train_set, fold_number):
    # train_set = pd.read_csv(input_filename)
    # train_set = train_set.sample(random_state=18, frac=1)
    # fold_number = 10
    fold_size=len(train_set)/fold_number
    fold_data_list=[]
    for i in range(fold_number):
        new_fold=train_set.iloc[int(i*fold_size):int((i+1)*fold_size),:]
        fold_data_list.append(new_fold)
    return fold_data_list

def get_train_test_data(fold_data_list, ind):
    test_set = fold_data_list[ind]
    rem_set = []
    for k, data in enumerate(fold_data_list):
        if(k!=ind):
            rem_set.append(fold_data_list[k])
    # new_train_set = pd.concat(rem_set)
    train_set = pd.concat(rem_set)
    # new_train_set = new_train_set.sample(random_state=32, frac=fraction)
    # new_X_train, new_y_train = q2.get_features_labels(new_train_set)
    # new_X_test, new_y_test = q2.get_features_labels(test_set)
    return train_set, test_set

In [127]:
"""
You may need to import necessary modules like numpy and pandas. However, you can't use any external
libraries such as sci-kit learn, etc. to implement the perceptron and the training of the perceptron.
The implementation must be done completely by yourself.

We are allowing you to use two packages from nltk for text processing: nltk.stem and nltk.tokenize. You cannot import
nltk in general, but we are allowing the use of these two packages only. We will check the code in your programs to
make sure this is the case and if other packages in nltk are used then we will deduct points from your assignment.
"""

"""
This is a Python class meant to represent the perceptron model and any sort of feature processing that you may do. You 
have a lot of flexibility on how you want to implement the training of the perceptron but below I have listed 
functionality that should not change:
    - Arguments to the __init__ function 
    - Arguments and return statement of the train function
    - Arguments and return statement of the predict function 


When you want the program (perceptron) to train on a dataset, the train function will only take one input which is the 
raw copy of the data file as a pandas dataframe. Below, is example code of how this is done:

    data = pd.read_csv('data.csv', index_col=0)
    model = Perceptron()
    model.train(data) # Train the model on data.csv


It is assumed when this program is evaluated, the predict function takes one input which is the raw copy of the
data file as a pandas dataframe and produce as output the list of predicted labels. Below is example code of how this 
is done:

    data = pd.read_csv('data.csv', index_col=0)
    model = Perceptron()
    predicted_labels = model.predict(data) # Produce predictions using model on data.csv

I have added several optional helper methods for you to use in building the pipeline of training the perceptron. It is
up to your discretion on if you want to use them or add your own methods.
"""
import numpy as np
import Preprocess as Pre

class Perceptron():
    def __init__(self):
        """
        The __init__ function initializes the instance attributes for the class. There should be no inputs to this
        function at all. However, you can setup whatever instance attributes you would like to initialize for this
        class. Below, I have just placed as an example the weights and bias of the perceptron as instance attributes.
        """
        self.vocabulary = None
        self.weights = None
        self.bias = None
    
    def get_vocab(self, dataset):
        self.vocabulary = Pre.build_vocab(dataset)

    def feature_extraction(self, data):
        """
        Optional helper method to code the feature extraction function to transform the raw dataset into a processed
        dataset to be used in perceptron training.
        """
        df = []
        for i in range(0, data.shape[0]):
            df.append(Pre.get_vector(data.iloc[i]['Text'], self.vocabulary))
        return np.array(df)

    def sgn_function(self, perceptron_input):
        """
        Optional helper method to code the sign function for the perceptron.
        """
        if perceptron_input <=0: return 0
        return 1

    def update_weights(self, update, feature):
        """
        Optional helper method to update the weights of the perceptron.
        """
        new_weights = update * feature
        self.weights += new_weights

    def update_bias(self, update):
        """
        Optional helper method to update the bias of the perceptron.
        """
        self.bias += update

    def predict_labels(self, data_point):
        """
        Optional helper method to produce predictions for a single data point.
        """
        prediction = np.dot(data_point, self.weights) + self.bias
        return self.sgn_function(prediction)

    def train(self, labeled_data, learning_rate=0.1, max_iter=50):
        """
        You must implement this function and it must take in as input data in the form of a pandas dataframe. This
        dataframe must have the label of the data points stored in a column called 'Label'. For example, the column
        labeled_data['Label'] must return the labels of every data point in the dataset. Additionally, this function
        should not return anything.

        The hyperparameters for training will be the learning rate and max number of iterations. Once you find the
        optimal values of the hyperparameters, update the default values for each keyword argument to reflect those
        values.

        The goal of this function is to train the perceptron on the labeled data. Feel free to code this however you
        want.
        """
        self.get_vocab(labeled_data)
        X, y = Pre.get_features_labels(labeled_data, 'Label')
        X = self.feature_extraction(X)

        # initialize weights
        self.weights = np.zeros(X.shape[1])
        self.bias = 0

        for _ in range(max_iter):
            
            for idx, x_feature in enumerate(X):
                output = np.dot(x_feature, self.weights) + self.bias
                y_predicted = self.sgn_function(output)

                # Perceptron update rule
                update = learning_rate * (y[idx] - y_predicted)
                self.update_weights(update, x_feature)
                self.update_bias(update)
        return

    def predict(self, data):
        predicted_labels = []
        """
        This function is designed to produce labels on some data input. The first input is the data in the form of a 
        pandas dataframe. 
        
        Finally, you must return the variable predicted_labels which should contain a list of all the 
        predicted labels on the input dataset. This list should only contain integers that are either 0 (negative) or 1
        (positive) for each data point.
        
        The rest of the implementation can be fully customized.
        """
        X, y = Pre.get_features_labels(data, 'Label')
        X = self.feature_extraction(X)

        for feat in X:
            predicted_labels.append(self.predict_labels(feat))
        return predicted_labels


In [128]:
all_data = pd.read_csv('data.csv', index_col=0)

In [132]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from nltk.tokenize import sent_tokenize, word_tokenize

def remove_between_square_brackets(text):
  return re.sub('\[[^]]*\]', '', text)

def remove_special_characters(text, remove_digits=True):
  pattern=r'[^a-zA-z0-9\s]'
  text=re.sub(pattern,'',text) 
  return text

def get_word_count(text):
    stopwords_list = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    word_count = {}
    for sent in sent_tokenize(text):
        for word in word_tokenize(sent):
            if word in word_count and word not in stopwords_list and word not in string.punctuation:
                word_count[word] += 1
            elif word not in word_count and word not in stopwords_list and word not in string.punctuation:
                word_count[word] = 1
    return word_count

def tokenize_text(text):
    return [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]

def replace_text_stemming(text):
    stemmer = nltk.stem.PorterStemmer()
    stems = [stemmer.stem(word) for word in tokenize_text(text)]
    return " ".join(stems)

def get_vector(text, vocabulary):
    tokens = tokenize_text(text)
    vector = np.zeros(len(vocabulary))
    for token in tokens:
        if token in vocabulary:
            vector[list(vocabulary.keys()).index(token)]+= 1
    return vector
    
def get_features_labels(df, label_column_name):
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', 1, inplace=True)
    labels = df[label_column_name]
    train_features = df.drop(label_column_name, 1)
    return train_features, labels.to_numpy()

def build_vocab(dataset):
    text_corpus = ""
    for i in range(0, dataset.shape[0]):
        text_corpus += " " + dataset.iloc[i]['Text']
    text_corpus = text_corpus.lower()
    text_corpus = remove_between_square_brackets(text_corpus)
    text_corpus = remove_special_characters(text_corpus, True)
    
    stemmed_corpus = replace_text_stemming(text_corpus)
    vocabulary = get_word_count(stemmed_corpus)
    updated_vocab = {}
    for key in vocabulary.keys():
        if(vocabulary[key]>0.01*dataset.shape[0] and vocabulary[key]<2*dataset.shape[0]):
            updated_vocab[key] = vocabulary[key]
    return updated_vocab

In [133]:
perceptron = Perceptron()
max_iter_list = [1,5,10,20,50,100]
learning_rate_list = [0.01, 0.05, 0.1, 0.2, 0.8]
# learning_rate_list = [0.1]
fold_data_list = get_kfold_split(all_data, 5)
# create a for loop here
# for iter in max_iter_list:
train_iter_list = []
val_iter_list = []
for l in learning_rate_list:
    print("Current learning rate:", l)
    train_learn_list = []
    val_learn_list = []
    for ind in range(5):
        print("Current index :", ind)
        train_set, test_set = get_train_test_data(fold_data_list,ind)
        perceptron.train(train_set, l, 5)
        predicted_train_labels_perceptron = perceptron.predict(train_set)
        predicted_test_labels_perceptron = perceptron.predict(test_set)
        train_accuracy, val_accuracy = eval(train_set['Label'].tolist(), predicted_train_labels_perceptron, test_set['Label'].tolist(),predicted_test_labels_perceptron)
        train_learn_list.append(train_accuracy)
        val_learn_list.append(val_accuracy)
    train_mean = np.mean(train_learn_list)
    val_mean = np.mean(val_learn_list)
train_iter_list.append(train_mean)
val_iter_list.append(val_mean)
plt.scatter(learning_rate_list,val_iter_list)
plt.title("accuracy vs learning rate curve")
plt.xlabel("Learning Rate")
plt.ylabel("Accuracy")
plt.show()

Current learning rate: 0.1
Current index : 0

Training Accuracy Result!
***************
Accuracy: 0.8985294117647059
***************

Testing Accuracy Result!
***************
Accuracy: 0.7558823529411764
***************


TypeError: cannot unpack non-iterable NoneType object

In [123]:
def eval(o_train, p_train, o_val, p_val):
    # print('\nTraining Accuracy Result!')
    train_accuracy = accuracy(o_train, p_train)
    # print('\nTesting Accuracy Result!')
    val_accuracy = accuracy(o_val, p_val)
    return train_accuracy, val_accuracy

In [124]:
def accuracy(orig, pred):
    num = len(orig)
    if (num != len(pred)):
        print('Error!! Num of labels are not equal.')
        return
    match = 0
    for i in range(len(orig)):
        o_label = orig[i]
        p_label = pred[i]
        if (o_label == p_label):
            match += 1
    print('***************\nAccuracy: '+str(float(match) / num)+'\n***************')



-------------Perceptron Performance-------------



NameError: name 'train_data' is not defined

In [125]:
print('\n\n-------------Perceptron Performance-------------\n')
# This command also runs the evaluation on the unseen test set
eval(train_set['Label'].tolist(), predicted_train_labels_perceptron, test_set['Label'].tolist(),
        predicted_test_labels_perceptron)



-------------Perceptron Performance-------------


Training Accuracy Result!
***************
Accuracy: 1.0
***************

Testing Accuracy Result!
***************
Accuracy: 0.7794117647058824
***************
