## Pre-Process Data ##

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sklearn as sk
import sklearn.datasets
import sklearn.metrics
import os
from LinearRegression import LinearRegression
#from sklearn.linear_model import LinearRegression
from LogisticRegression import LogisticRegression as LR
from sklearn.linear_model import LogisticRegression
from MultiClassRegression import MultiClassRegression
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder


### Extact, Transform, Load (ETL) ###

In [22]:
RATINGS = [1, 2, 3, 4, 7, 8, 9, 10]

def extract_word_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read().splitlines()

def parse_features_and_labels(feature_path='../aclImdb/test/labeledBow.feat'):
    """
    Parse the feature file to extract word frequencies and labels for each review.
    
    Args:
    - feature_path (str): Path to the feature file.
    
    Returns:
    - List of dictionaries with 'label' and 'features' keys.
    """
    reviews_data = []
    with open(feature_path, 'r', encoding='utf-8') as file:
        for line in file:
            label, *features = line.strip().split()
            review_data = {
                'label': int(label),
                'features': {int(f.split(':')[0]): int(f.split(':')[1]) for f in features}
            }
            reviews_data.append(review_data)
    return reviews_data

def filter_features(reviews_data, min_feature_frequency=0.01, max_feature_frequency=0.5):
    """
    Filters out features (words) that occur in less than 1% of the reviews or more than 50% of the reviews.
    
    Args:
    - reviews_data (List[Dict]): List of dictionaries, each representing a review with word frequencies.
    - min_feature_frequency (float): Minimum frequency threshold as a fraction of total reviews.
    - max_feature_frequency (float): Maximum frequency threshold as a fraction of total reviews.
    
    Returns:
    - A filtered list of dictionaries, with each dictionary representing a review and containing only the features that meet the frequency criteria.
    """
    total_reviews = len(reviews_data)
    word_occurrences = defaultdict(int)

    # Count the number of reviews each word appears in
    for review in reviews_data:
        for word_index in review['features']:
            word_occurrences[word_index] += 1

    # Calculate frequency thresholds
    min_reviews_threshold = total_reviews * min_feature_frequency
    max_reviews_threshold = total_reviews * max_feature_frequency

    # Identify words that meet the frequency criteria
    valid_words = {word_index for word_index, count in word_occurrences.items()
                   if min_reviews_threshold <= count <= max_reviews_threshold}

    # Filter reviews to only include valid words
    filtered_reviews_data = []
    for review in reviews_data:
        filtered_features = {word_index: count for word_index, count in review['features'].items()
                             if word_index in valid_words}
        filtered_review = {'label': review['label'], 'features': filtered_features}
        filtered_reviews_data.append(filtered_review)

    return filtered_reviews_data

def create_frequency_dataframe(reviews_data):
    """
    Create a DataFrame from review data with word frequencies for each label.
    
    Args:
    - reviews_data (List[Dict]): Parsed review data including labels and features.
    
    Returns:
    - A pandas DataFrame with word indices as rows, labels as columns, and frequencies as cell values.
    """
    word_label_freq = defaultdict(lambda: defaultdict(int))
    
    for review in reviews_data:
        label = review['label']
        for word_index, count in review['features'].items():
            word_label_freq[word_index][label] += count
            
    data = []
    for word_index, label_freqs in word_label_freq.items():
        for label, freq in label_freqs.items():
            data.append({'word_index': word_index, 'label': label, 'frequency': freq})
    
    df = pd.DataFrame(data)
    df_pivoted = df.pivot(index='word_index', columns='label', values='frequency').fillna(0)

    for rating in RATINGS:
        if rating not in df_pivoted.columns:
            df_pivoted[rating] = 0.0
    df_pivoted = df_pivoted[RATINGS]

    return df_pivoted

def add_words_to_dataframe(df, word_data_path='../aclImdb/imdb.vocab', debug=False):
    """
    Add words to the DataFrame as a new column, matching the index of each word in the vocab file
    to the word index in the DataFrame.
    If a word occurs in the vocab file but not the dataframe, don't add it to the DataFrame.
    
    Args:
    - df (pd.DataFrame): DataFrame with word frequencies for each label.
    - word_data_path (str): Path to the vocab file.
    
    Returns:
    - A DataFrame with a new 'word' column.
    """
    words = extract_word_data(word_data_path)
    index_to_word = {index+1: word for index, word in enumerate(words)}
    df['word'] = df.index.map(index_to_word)
    
    if debug:
        # Correct debug check: Verify all DataFrame indices have corresponding words in vocab
        vocab_indices_set = set(index_to_word.keys())
        dataframe_indices_set = set(df.index.astype(int))  # Ensure type alignment for comparison
        
        missing_indices_in_vocab = dataframe_indices_set - vocab_indices_set
        if missing_indices_in_vocab:
            print(f"Warning: {len(missing_indices_in_vocab)} indices in DataFrame not found in vocab file.")
            
        missing_indices_in_df = vocab_indices_set - dataframe_indices_set
        if missing_indices_in_df:
            print(f"Note: {len(missing_indices_in_df)} indices in vocab file not represented in DataFrame. This may be expected due to filtering.")
        
    return df


def evaluate_binary_classification(df, encoder, model):
    """Evaluate binary classification using ROC curve and AUROC."""
    X = encoder.transform(df[['feature_index']])
    y = df['sentiment']
    
    model.fit(X, y)
    
    # Assume get_test_data() is defined to return test features and labels
    test_features, test_labels = get_test_data(encoder)  # Define this function based on your needs
    predictions = model.predict(test_features)
    
    fpr, tpr, _ = roc_curve(test_labels, predictions)
    auroc = auc(fpr, tpr)
    return fpr, tpr, auroc

In [23]:
reviews_data = parse_features_and_labels()
filtered_reviews_data = filter_features(reviews_data)
df = create_frequency_dataframe(filtered_reviews_data)
df = add_words_to_dataframe(df, debug=True)
print(len(df))
print(df.head())


Note: 87806 indices in vocab file not represented in DataFrame. This may be expected due to filtering.
1721
label          1     2     3     4     7     8     9    10 word
word_index                                                     
21          4562  2367  2899  3250  3425  3886  2931  5066  you
23          3572  2042  2733  3323  3794  4128  3357  5211  are
27          6765  2272  2014  1873  1435  2038  1830  6375  one
30          3449  1844  2228  2600  2462  2925  2252  4064   at
31          3504  1863  2364  2497  2352  2832  2215  3764   by


# Evaluation and Plotting Functions #

### Evaluation Functions ###

In [9]:

def evaluate_multiclass_classification(df):
    """
    compute classification accuracy
    """
    pass

### Plotting Functions ###

In [10]:
# Helper functions
def plot_word_distribution(df, column='frequency'):
    """
    Plot the distribution of word frequencies using Seaborn.

    Args:
    - df (DataFrame): DataFrame containing word frequencies.
    - column (str): The column name in df that contains the word frequencies.
    """
    # Setting the style of seaborn
    sns.set(style='whitegrid')

    # Creating the plot
    plt.figure(figsize=(10, 6))
    sns.histplot(df[column], bins=30, kde=True)

    # Adding plot title and labels
    plt.title('Distribution of Word Frequencies')
    plt.xlabel('Word Frequency')
    plt.ylabel('Count')

    # Show plot
    plt.show()

In [11]:


def plot_training_progress(logistic_regression_model):
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(logistic_regression_model.loss_history, label='Loss', color='b', ls='-',linewidth=1)
    plt.title('Loss Progress over Iterations')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(logistic_regression_model.gradient_norm_history, label='Gradient Norm')
    plt.title('Gradient Norm Progress over Iterations')
    plt.xlabel('Iteration')
    plt.ylabel('Gradient Norm')
    plt.legend()

    plt.tight_layout()
    plt.show()

def plot_binary_classification(results):
    """
    plot ROC curve
    compare to DT from sklearn
    
    Args:
    - results: tuple of fpr, tpr, auroc
    
    Return:
    - None, plot the ROC curve
    
    """
    fpr, tpr, auroc = results
    print(f"Area under the ROC curve: {auroc}")
    print(f"True positive rate: {tpr}")
    print(f"False positive rate: {fpr}")
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auroc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()
    

def plot_multiclass_classification(results):
    """
    compare accuracy to DT from sklearn
    
    """
    pass

def plot_top_20_features_from_imdb_simple(top_positive_features, top_negative_features):
    """
    A horizontal bar plot showing the top 20 features from the Simple linear regression on the IMDB data
    
    Characteristics:
    - 10 most positive and negative coefficients as the x-axis 
    - Feature names (i.e., words) as the y-axis  
    """
    # Plot the top 20 positive and negative features on the same plot
    plt.figure(figsize=(10, 10))
    # assign each a different color and make negatives positive
    top_negative_features['Impact'] = top_negative_features['Impact'].abs()
    plt.barh(top_positive_features['Word'].head(10), top_positive_features['Impact'].head(10), color='b', label='Positive Impact')
    plt.barh(top_negative_features['Word'].head(10), top_negative_features['Impact'].head(10), color='r', label='Negative Impact')
    plt.xlabel('Regression Coefficients (Absolute Value)')
    plt.ylabel('Word')
    plt.title('Top 20 Features from Simple Linear Regression')
    plt.legend()
    #plt.savefig('top_20_features_from_imdb_simple.png')
    plt.show()
    
    
def plot_model_convergence(results, learning_rate):
    """
    Convergence plot on how the logistic and multiclass regression converge given a reason- ably chosen learning rate.
    """
    pass

def plot_imdb_data_roc_curve(results):
    """
    A single plot containing two ROC curves of logistic regression and sklearn-DT (Decision Trees) on the IMDB test data.
    """
    pass

def plot_imdb_data_auroc(results):
    """
    A bar plot that shows the AUROC of logistic regression and DT on the test data (y-axis) 
    as a function of the 20%, 40%, 60%, 80%, and 100% training data (x-axis)
    """
    pass

def plot_news_data_classification_accuracy(results):
    """
    A bar plot that shows the classification accuracies of multiclass regression and DT 
    on the test data (y-axis) as a function of the 20%, 40%, 60%, 80%, and 100% training data (x- axis)
    """
    pass

def plot_top_20_features_from_imdb_logistic(results):
    """
    A horizontal bar plot showing the top 20 features (10 most positive and 10 most negative) 
    from the logistic regression on the IMDB data with the coefficient as the x-axis and the feature names (i.e., words) as the y-axis
    """
    pass

def plot_heatmap_of_multi_classification(results):
    """
    A heatmap showing the top 5 most positive features as rows for each class as columns in the multi-class classification 
    on 4 the chosen classes from the 20-news group datasets. Therefore, your heatmap should be a 20-by-4 dimension.

    """
    pass

In [12]:
plot_training_progress(model)
plot_binary_classification(results)
    

NameError: name 'model' is not defined

# Experiments #

### Experiment 1 ###

In [None]:
def experiment_one():
    """
    Report the:
    - top 10 features with the most positive coefficients
    - top 10 features with the most negative coefficients 
    
    on the IMDB data using simple linear regression on the movie rating scores
    """
    pass

### Experiment 2 ###

In [None]:
def experiment_two():
    """
    Implement and conduct:
    - Binary classification on the IMDb Reviews
    - Multi-class classification on the 20 news group dataset
    """
    pass

### Experiment 3 ###

In [None]:
def experiment_three():
    """
    On the same plot as 2, draw ROC curves and report the AUROC 
    values of logistic regression and Decision Trees on the 
    IMDB data binary classification task
    
    """
    pass

### Experiment 4 ###

In [None]:
def experiment_four():
    """
    Report the:
    - Multiclass classification accuracy of multiclass linear regression 
    - Decision Trees on the 5 chosen classes from the 20-news-group data
    """
    pass

### Experiment 5 ###

In [None]:
def experiment_five():
    """
    Plot and compare the accuracy of the two models as a function of the
    size of dataset by controlling the training size
    
    For example, you can randomly select 20%, 40%, 60% and 80% of the available 
    training data and train your model on this subset and evaluate the trained 
    model on the held-out test set.
    """
    pass

### Experiment 6 ###

In [None]:
def experiment_six():
    """
    Compare and evaluate the performance of different learning rates
    """
    pass

### Experiment 7 ###

In [None]:
def experiment_seven():
    """
    Evaluate the performance of the multiclass regression on more than 5 classes
    
    Compare the top k (e.g. k =3) predicted classes. 
    A correct prediction is determined by whether the true label is within the top k predicted labels. 
    The scoring mechanism involves assigning a score of 1 if the correct label is among the top k predictions and 0 otherwise.
    """
    pass