## Pre-Process Data ##

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sklearn as sk
import sklearn.datasets
import sklearn.metrics
import os
from LinearRegression import LinearRegression
from sklearn.linear_model import LinearRegression as LR
#from LogisticRegression import LogisticRegression as LR
from sklearn.linear_model import LogisticRegression
from MultiClassRegression import MultiClassRegression
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder


### Extract, Transform, Load (ETL) ###

In [None]:
# Rating labels used as the columns (and the column order) of the word-frequency table.
# NOTE(review): 5 and 6 are absent - presumably the dataset has no neutral ratings; confirm.
RATINGS = [1, 2, 3, 4, 7, 8, 9, 10]

def extract_word_data(file_path):
    """
    Read a UTF-8 text file and return its content split into lines.

    Args:
    - file_path (str): Path to the file to read.

    Returns:
    - List of strings, one per line, with the line terminators removed.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents.splitlines()

def parse_features_and_labels(feature_path='../aclImdb/test/labeledBow.feat'):
    """
    Parse the feature file to extract word frequencies and labels for each review.

    Every non-blank line is expected to have the form
    ``<label> <word_index>:<count> <word_index>:<count> ...`` with
    whitespace-separated tokens. Blank lines (for example a trailing empty
    line at the end of the file) are skipped instead of raising.

    Args:
    - feature_path (str): Path to the feature file.

    Returns:
    - List of dictionaries with 'label' (int) and 'features'
      (dict mapping int word index to int count) keys, one per review.

    Raises:
    - ValueError: If a label, word index or count is not an integer.
    - IndexError: If a feature token does not contain a ':' separator.
    """
    reviews_data = []
    with open(feature_path, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = line.split()  # rating followed by (word : count) pairs
            if not tokens:
                # Nothing to parse on a blank line; unpacking it would fail.
                continue
            label, *features = tokens
            word_counts = {}
            for feature in features:
                # Split each pair only once instead of once per field.
                parts = feature.split(':')
                word_counts[int(parts[0])] = int(parts[1])
            reviews_data.append({'label': int(label), 'features': word_counts})
    return reviews_data

def create_frequency_dataframe(reviews_data):
    """
    Create a DataFrame from review data with word frequencies for each label.

    Counts are summed over all reviews sharing a label. The result always has
    exactly the columns listed in RATINGS, in that order; a rating that never
    occurs gets a column of zeros, and labels outside RATINGS are dropped.

    Args:
    - reviews_data (List[Dict]): Parsed review data including labels and features.

    Returns:
    - A pandas DataFrame with word indices as rows, labels as columns, and frequencies as cell values.
      When there are no features at all (empty input, or every review has an
      empty feature dict) an empty DataFrame with the RATINGS columns is returned.
    """
    # Accumulate the total count of every word under every label.
    word_label_freq = defaultdict(lambda: defaultdict(int))

    for review in reviews_data:
        label = review['label']
        for word_index, count in review['features'].items():
            word_label_freq[word_index][label] += count

    if not word_label_freq:
        # A frame built from zero records has no columns, so pivoting it would
        # raise KeyError. Return an empty table with the expected layout instead.
        empty = pd.DataFrame(columns=RATINGS, dtype=float)
        empty.index.name = 'word_index'
        empty.columns.name = 'label'
        return empty

    # Long format: one record per (word, label) pair that actually occurred.
    data = []
    for word_index, label_freqs in word_label_freq.items():
        for label, freq in label_freqs.items():
            data.append({'word_index': word_index, 'label': label, 'frequency': freq})

    df = pd.DataFrame(data)
    # Wide format; (word, label) pairs that never occurred become 0.
    df_pivoted = df.pivot(index='word_index', columns='label', values='frequency').fillna(0)

    # Guarantee a fixed column set and order regardless of which labels were seen.
    for rating in RATINGS:
        if rating not in df_pivoted.columns:
            df_pivoted[rating] = 0.0
    df_pivoted = df_pivoted[RATINGS]

    return df_pivoted

def add_words_to_dataframe(df, word_data_path='../aclImdb/imdb.vocab', debug=False):
    """
    Attach the vocabulary word for every row of a word-frequency DataFrame.

    The vocab file is read line by line and the word on line i (counting from
    zero) is assigned to the row whose index equals i. The 'word' column is
    added to `df` in place, and the same object is returned.

    Args:
    - df (pd.DataFrame): DataFrame with word frequencies for each label, indexed by word index.
    - word_data_path (str): Path to the vocab file.
    - debug (bool): When True, print how many indices are missing on either side.

    Returns:
    - The DataFrame with a new 'word' column.
    """
    vocabulary = extract_word_data(word_data_path)

    # Zero-based position in the vocab file -> word; no offset is applied.
    df['word'] = df.index.map(dict(enumerate(vocabulary)))

    if not debug:
        return df

    # Compare the set of known vocab positions with the indices present in df.
    known_indices = set(range(len(vocabulary)))
    present_indices = set(df.index.astype(int))

    unknown_indices = present_indices - known_indices
    if unknown_indices:
        print(f"Warning: {len(unknown_indices)} indices in DataFrame not found in vocab file.")

    unused_indices = known_indices - present_indices
    if unused_indices:
        print(f"Note: {len(unused_indices)} indices in vocab file not represented in DataFrame. This may be expected due to filtering.")

    return df


def filter_features(reviews_data, min_feature_frequency=0.01, max_feature_frequency=0.5, use_quantiles=False, quantiles=(0.05, 0.95)):
    """
    Filters out features (words) that occur in less than the specified minimum frequency of reviews or more than the specified maximum frequency of reviews.
    Optionally, filters based on quantiles.

    A word's "occurrence" is the number of reviews it appears in, not its total count.
    In quantile mode the thresholds are picked from the sorted list of per-word
    occurrences, so the quantile positions are computed from the number of
    distinct words (not from the number of reviews) and clamped to valid positions.

    Args:
    - reviews_data (List[Dict]): List of dictionaries, each representing a review with word frequencies.
    - min_feature_frequency (float): Minimum frequency threshold as a fraction of total reviews if not using quantiles.
    - max_feature_frequency (float): Maximum frequency threshold as a fraction of total reviews if not using quantiles.
    - use_quantiles (bool): Whether to use quantile-based filtering.
    - quantiles (tuple): A tuple representing the lower and upper quantile thresholds for filtering.

    Returns:
    - A filtered list of dictionaries, with each dictionary representing a review and containing only the features that meet the frequency criteria.
      The input reviews are not modified; an empty input yields an empty list.
    """
    total_reviews = len(reviews_data)
    word_occurrences = defaultdict(int)

    # Count the number of reviews each word appears in
    for review in reviews_data:
        for word_index in review['features']:
            word_occurrences[word_index] += 1

    if use_quantiles:
        # Use quantile-based filtering over the per-word occurrence counts
        sorted_occurrences = sorted(word_occurrences.values())
        total_words = len(sorted_occurrences)
        if total_words == 0:
            # No words at all: the bounds are irrelevant, nothing can be kept.
            lower_bound = upper_bound = 0
        else:
            last_position = total_words - 1
            # Positions refer to the sorted word list, so scale by the number of words.
            lower_position = min(max(int(quantiles[0] * total_words), 0), last_position)
            upper_position = min(max(int(quantiles[1] * total_words) - 1, 0), last_position)
            lower_bound = sorted_occurrences[lower_position]
            upper_bound = sorted_occurrences[upper_position]
    else:
        # Use fixed frequency thresholds
        lower_bound = total_reviews * min_feature_frequency
        upper_bound = total_reviews * max_feature_frequency

    # Identify words that meet the frequency criteria (bounds are inclusive)
    valid_words = {word_index for word_index, count in word_occurrences.items()
                   if lower_bound <= count <= upper_bound}

    # Filter reviews to only include valid words
    filtered_reviews_data = []
    for review in reviews_data:
        filtered_features = {word_index: count for word_index, count in review['features'].items()
                             if word_index in valid_words}
        filtered_reviews_data.append({'label': review['label'], 'features': filtered_features})

    return filtered_reviews_data

def select_top_features_by_regression_coefficients(df, n=500):
    """
    Rank words by how strongly their per-rating frequencies relate to the rating.

    For every row, a one-variable linear regression is fitted with the row's
    frequencies as the input and RATINGS as the target; the slope and the
    Pearson correlation between the two are recorded. Both lists are ranked
    independently by absolute value.

    Args:
    - df (pd.DataFrame): Word-frequency table with a 'word' column; every other
      column is treated as a frequency (assumes these are exactly the RATINGS
      columns, in RATINGS order - confirm against the caller).
    - n (int): Number of top entries to keep.

    Returns:
    - Tuple of three DataFrames: the rows of `df` whose word is among the n
      largest absolute coefficients, the top n (word, coefficient) pairs, and
      the top n (word, correlation) pairs.
    """
    frequency_table = df.drop(columns=['word'])
    # The regression target is the same for every word.
    targets = np.array(RATINGS).reshape(-1, 1)

    slopes = []
    pearson_scores = []
    for word_index, frequencies in frequency_table.iterrows():
        term = df.loc[word_index, 'word']
        predictor = frequencies.values.reshape(-1, 1)

        regressor = LR()
        regressor.fit(predictor, targets)
        # Single input and single target, so coef_ holds exactly one value.
        slopes.append((term, regressor.coef_[0][0]))

        pearson = np.corrcoef(predictor.flatten(), targets.flatten())[0, 1]
        pearson_scores.append((term, pearson))

    def magnitude(pair):
        return abs(pair[1])

    slopes.sort(key=magnitude, reverse=True)
    pearson_scores.sort(key=magnitude, reverse=True)

    strongest = slopes[:n]
    selected_words = [term for term, _ in strongest]
    coefficient_table = pd.DataFrame(strongest, columns=['word', 'coefficient'])
    correlation_table = pd.DataFrame(pearson_scores[:n], columns=['word', 'correlation'])

    selected_rows = df[df['word'].isin(selected_words)]
    return selected_rows, coefficient_table, correlation_table

    

In [None]:
# Load the reviews (labels and per-review word counts) from the default feature file
reviews_data = parse_features_and_labels()
# Keep only words that appear in at least 1% and at most 50% of the reviews
filtered_reviews_data = filter_features(reviews_data, min_feature_frequency=0.01, max_feature_frequency=0.5)
# Create a DataFrame with word frequencies for each label (rating), once for the
# filtered vocabulary and once for the full vocabulary
imdb_feature_frequency_df = create_frequency_dataframe(filtered_reviews_data)
unfiltered_imdb_feature_frequency_df = create_frequency_dataframe(reviews_data)
# Attach the actual words; debug=True prints how many indices are missing on either side
unfiltered_imdb_feature_frequency_df = add_words_to_dataframe(unfiltered_imdb_feature_frequency_df, debug=True)
# Preview the filtered table before the 'word' column is added
print(imdb_feature_frequency_df.head())
imdb_feature_frequency_df = add_words_to_dataframe(imdb_feature_frequency_df, debug=True)
# Rank the filtered words by regression coefficient and keep the 500 strongest
top_features, features_and_coefficients, correlations = select_top_features_by_regression_coefficients(imdb_feature_frequency_df, n=500)
print(features_and_coefficients.head())
print(top_features.head())

# Evaluation and Plotting Functions #

### Evaluation Functions ###

In [None]:

def evaluate_multiclass_classification(df):
    """
    Compute the classification accuracy.

    Not implemented yet: the body is a placeholder and the function returns None.
    """


### Plotting Functions ###

In [None]:
def plot_word_frequency_distribution_with_regression(df, word_of_interest):
    """
    Scatter one word's frequency per rating (frequency on x, rating on y) and overlay a fitted regression line.

    The ratings are taken from the module-level RATINGS list and are placed on
    the y-axis by their position in that list (0, 1, 2, ...), with the tick
    labels showing the actual rating values. If the word is not present, a
    message is printed and nothing is plotted.

    Args:
    - df (pd.DataFrame): DataFrame with word frequencies for each label and a 'word' column.
    - word_of_interest (str): The word to plot the distribution for.
    """
    matches = df['word'] == word_of_interest
    if not matches.any():
        print(f"The word '{word_of_interest}' does not exist in the DataFrame.")
        return

    # Frequencies of the word, one value per rating column
    frequencies = df.loc[matches, RATINGS].values.flatten()

    # Evenly spaced positions stand in for the ratings on the y-axis
    rating_positions = np.arange(len(RATINGS))

    # Column vectors: frequency is the input, rating position is the target
    inputs = frequencies.reshape(-1, 1)
    targets = rating_positions.reshape(-1, 1)

    regression = LinearRegression()
    regression.fit(inputs, targets)
    fitted = regression.predict(inputs)

    plt.figure(figsize=(10, 6))
    plt.scatter(inputs, targets, color='blue', label=f'Frequencies of "{word_of_interest}"')
    plt.plot(inputs, fitted, color='red', label='Regression Line')
    plt.title(f'Distribution of Frequencies for "{word_of_interest}" Across Labels')
    plt.xlabel('Frequency')
    plt.ylabel('Labels (Ratings)')
    # Show the real rating values instead of the positions
    plt.yticks(rating_positions, RATINGS)
    plt.legend()
    plt.grid(True)
    plt.show()

def plot_word_distribution(df, column='frequency', include_percentiles=False, total_words=89526):
    """
    Plot the distribution of word frequencies using Seaborn and optionally mark specified percentiles.

    With `include_percentiles`, the 1st, 2nd, 5th, 25th, 50th and 75th
    percentiles are drawn as vertical lines, and for each one the number of
    words at or below it is printed together with its share of `total_words`.

    Args:
    - df (pd.DataFrame): DataFrame containing word frequencies.
    - column (str): The column name in df that contains the word frequencies.
    - include_percentiles (bool): Whether to include and annotate percentiles on the plot.
    - total_words (int): Denominator of the printed percentage. Defaults to 89526,
      the value that used to be hard-coded (presumably the size of the full
      vocabulary - confirm); pass len(df) to report shares of the plotted words.
    """
    # Setting the style of seaborn
    sns.set(style='whitegrid')

    # Creating the plot
    plt.figure(figsize=(10, 6))
    sns.histplot(df[column], bins=30, kde=True, color='skyblue')

    if include_percentiles:
        # Specified percentiles to include
        percentiles = [1, 2, 5, 25, 50, 75]
        percentile_values = np.percentile(df[column], percentiles)

        # Annotate each specified percentile on the plot
        for percentile, value in zip(percentiles, percentile_values):
            plt.axvline(x=value, color='red', linestyle='--')
            # Label sits at the current top of the y-axis, just left of the line
            plt.text(value, plt.gca().get_ylim()[1], f'{value:.2f}', color='red', ha='right', va='bottom')

            # Calculate and print the number of words below each percentile threshold
            num_words_below = (df[column] <= value).sum()
            print(f'Number of words below the {percentile}th percentile (frequency <= {value:.2f}): {num_words_below}')
            print(f'Percentage of total words below this percentile: {100 * num_words_below / total_words:.2f}%')

        plt.title('Distribution of Word Frequencies with Percentiles')
    else:
        plt.title('Distribution of Word Frequencies')

    plt.xlabel('Word Frequency')
    plt.ylabel('Count')

    # Show plot
    plt.show()


In [None]:
# Disabled step: sum frequencies across labels to get a total frequency for each word
#imdb_feature_frequency_df['total_frequency'] = imdb_feature_frequency_df.sum(axis=1)
# Plot the per-rating frequencies of the word 'avoid' together with a regression line
plot_word_frequency_distribution_with_regression(imdb_feature_frequency_df, 'avoid')
# Disabled step: plot the distribution of the total frequencies (needs the column from the disabled line above)
#plot_word_distribution(imdb_feature_frequency_df, column='total_frequency', include_percentiles=True)

In [None]:


def plot_training_progress(logistic_regression_model):
    """
    Show the training history of a model as two side-by-side line plots.

    The left panel plots the model's `loss_history`, the right panel its
    `gradient_norm_history`, both against the position in the history.

    Args:
    - logistic_regression_model: Object exposing `loss_history` and
      `gradient_norm_history` attributes that matplotlib can plot.
    """
    # (attribute to plot, legend/y-axis name, panel title, extra line styling)
    panels = (
        ('loss_history', 'Loss', 'Loss Progress over Iterations',
         {'color': 'b', 'ls': '-', 'linewidth': 1}),
        ('gradient_norm_history', 'Gradient Norm', 'Gradient Norm Progress over Iterations',
         {}),
    )

    plt.figure(figsize=(12, 5))
    for position, (attribute, name, title, line_style) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(getattr(logistic_regression_model, attribute), label=name, **line_style)
        plt.title(title)
        plt.xlabel('Iteration')
        plt.ylabel(name)
        plt.legend()

    plt.tight_layout()
    plt.show()

def plot_binary_classification(results):
    """
    Print the ROC statistics and draw the ROC curve.

    The curve is drawn against the dashed diagonal from (0, 0) to (1, 1).

    Args:
    - results: tuple of fpr, tpr, auroc

    Return:
    - None, plot the ROC curve
    """
    false_positive_rate, true_positive_rate, area = results

    print(f"Area under the ROC curve: {area}")
    print(f"True positive rate: {true_positive_rate}")
    print(f"False positive rate: {false_positive_rate}")

    curve_label = 'ROC curve (area = %0.2f)' % area

    plt.figure()
    plt.plot(false_positive_rate, true_positive_rate, label=curve_label)
    # Dashed diagonal as a visual baseline
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()
    

def plot_multiclass_classification(results):
    """
    Compare the multiclass accuracy to the Decision Tree (DT) from sklearn.

    Not implemented yet: the body is a placeholder and the function returns None.
    """

def plot_top_20_features_from_imdb_simple(top_positive_features, top_negative_features):
    """
    A horizontal bar plot showing the top 20 features from the Simple linear regression on the IMDB data

    Characteristics:
    - 10 most positive and negative coefficients as the x-axis
    - Feature names (i.e., words) as the y-axis

    The negative coefficients are drawn by absolute value so both groups share
    the same axis direction. Neither input DataFrame is modified.

    Args:
    - top_positive_features (pd.DataFrame): Has 'Word' and 'Impact' columns; only the
      first 10 rows are drawn (assumes the rows are already ranked - confirm against the caller).
    - top_negative_features (pd.DataFrame): Same layout, holding the negative coefficients.
    """
    positive = top_positive_features.head(10)
    negative = top_negative_features.head(10)
    # Take absolute values on a derived Series rather than writing them back
    # into the caller's DataFrame, which would destroy the original signs.
    negative_magnitude = negative['Impact'].abs()

    # Plot the top 20 positive and negative features on the same plot
    plt.figure(figsize=(10, 10))
    plt.barh(positive['Word'], positive['Impact'], color='b', label='Positive Impact')
    plt.barh(negative['Word'], negative_magnitude, color='r', label='Negative Impact')
    plt.xlabel('Regression Coefficients (Absolute Value)')
    plt.ylabel('Word')
    plt.title('Top 20 Features from Simple Linear Regression')
    plt.legend()
    #plt.savefig('top_20_features_from_imdb_simple.png')
    plt.show()
    
    
def plot_model_convergence(results, learning_rate):
    """
    Convergence plot showing how the logistic and multiclass regression converge
    given a reasonably chosen learning rate.

    Not implemented yet: the body is a placeholder and the function returns None.
    """

def plot_imdb_data_roc_curve(results):
    """
    A single plot with two ROC curves on the IMDB test data: one for logistic
    regression and one for sklearn-DT (Decision Trees).

    Not implemented yet: the body is a placeholder and the function returns None.
    """

def plot_imdb_data_auroc(results):
    """
    A bar plot of the AUROC of logistic regression and DT on the test data (y-axis)
    as a function of using 20%, 40%, 60%, 80%, and 100% of the training data (x-axis).

    Not implemented yet: the body is a placeholder and the function returns None.
    """

def plot_news_data_classification_accuracy(results):
    """
    A bar plot of the classification accuracies of multiclass regression and DT
    on the test data (y-axis) as a function of using 20%, 40%, 60%, 80%, and 100%
    of the training data (x-axis).

    Not implemented yet: the body is a placeholder and the function returns None.
    """

def plot_top_20_features_from_imdb_logistic(results):
    """
    A horizontal bar plot of the top 20 features (10 most positive and 10 most negative)
    from the logistic regression on the IMDB data, with the coefficient on the
    x-axis and the feature names (i.e., words) on the y-axis.

    Not implemented yet: the body is a placeholder and the function returns None.
    """

def plot_heatmap_of_multi_classification(results):
    """
    A heatmap of the top 5 most positive features of each class in the multi-class
    classification on the 4 chosen classes from the 20-news group datasets.
    Features are the rows and classes are the columns, so the heatmap should
    have a 20-by-4 dimension.

    Not implemented yet: the body is a placeholder and the function returns None.
    """

In [None]:
# Plot the training history of a fitted model and the ROC curve of its results.
# NOTE(review): `model` and `results` are not assigned in the code visible here -
# confirm that an earlier cell defines them before running this one.
plot_training_progress(model)
plot_binary_classification(results)
    

# Experiments #

### Experiment 1 ###

In [None]:
def experiment_one():
    """
    Report, for simple linear regression on the movie rating scores of the IMDB data:
    - the top 10 features with the most positive coefficients
    - the top 10 features with the most negative coefficients

    Not implemented yet: the body is a placeholder and the function returns None.
    """

### Experiment 2 ###

In [None]:
def experiment_two():
    """
    Implement and conduct:
    - binary classification on the IMDb Reviews
    - multi-class classification on the 20 news group dataset

    Not implemented yet: the body is a placeholder and the function returns None.
    """

### Experiment 3 ###

In [None]:
def experiment_three():
    """
    On the same plot as 2, draw the ROC curves and report the AUROC values of
    logistic regression and Decision Trees on the IMDB data binary
    classification task.

    Not implemented yet: the body is a placeholder and the function returns None.
    """

### Experiment 4 ###

In [None]:
def experiment_four():
    """
    Report the multiclass classification accuracy of:
    - multiclass linear regression
    - Decision Trees
    on the 5 chosen classes from the 20-news-group data.

    Not implemented yet: the body is a placeholder and the function returns None.
    """

### Experiment 5 ###

In [None]:
def experiment_five():
    """
    Plot and compare the accuracy of the two models as a function of the size of
    the dataset, by controlling the training size.

    For example, randomly select 20%, 40%, 60% and 80% of the available training
    data, train the model on that subset, and evaluate the trained model on the
    held-out test set.

    Not implemented yet: the body is a placeholder and the function returns None.
    """

### Experiment 6 ###

In [None]:
def experiment_six():
    """
    Compare and evaluate the performance of different learning rates.

    Not implemented yet: the body is a placeholder and the function returns None.
    """

### Experiment 7 ###

In [None]:
def experiment_seven():
    """
    Evaluate the performance of the multiclass regression on more than 5 classes.

    Compare the top k (e.g. k = 3) predicted classes: a prediction counts as
    correct when the true label is within the top k predicted labels. The score
    is 1 if the correct label is among the top k predictions and 0 otherwise.

    Not implemented yet: the body is a placeholder and the function returns None.
    """