## Pre-Process Data ##

In [189]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sklearn as sk
import sklearn.datasets
import os
from LinearRegression import LinearRegression
#from sklearn.linear_model import LinearRegression
from LogisticRegression import LogisticRegression
from MultiClassRegression import MultiClassRegression
from collections import defaultdict

### Extract, Transform, Load (ETL) ###

In [190]:
# Rating values used below both as DataFrame column labels and as the
# x-values for the per-word correlation and regression.
# NOTE(review): 5 and 6 are absent -- presumably the dataset contains no
# reviews with those ratings; confirm against the data before relying on it.
# Don't change RATINGS
RATINGS = [1,2,3,4,7,8,9,10]

def extract_word_data(file_path):
    """Read a .vocab file and return its lines as a list of words.

    Args:
        file_path: Path of the UTF-8 encoded vocabulary file to read.

    Returns:
        list[str]: The file's lines, in file order, without line terminators.
    """
    with open(file_path, mode='r', encoding='utf-8') as vocab_file:
        contents = vocab_file.read()
    return contents.splitlines()

def get_correlations(df):
    """Attach a per-word rating correlation column to ``df``.

    For every row, ``np.corrcoef`` is evaluated between the rating values
    listed in ``RATINGS`` and that row's entries in the ``RATINGS`` columns;
    the off-diagonal coefficient is kept.

    Args:
        df: DataFrame with one column per entry of ``RATINGS``.

    Returns:
        The same DataFrame object, modified in place with a new
        ``'correlation'`` column.
    """
    # The rating axis is identical for every row, so build it once.
    rating_axis = np.array(RATINGS).astype(float)

    def _row_correlation(record):
        per_rating_values = record[RATINGS].values.astype(float)
        return np.corrcoef(rating_axis, per_rating_values)[0, 1]

    df['correlation'] = [_row_correlation(record) for _, record in df.iterrows()]
    return df

def process_imdb_data(feat_file_path, vocab_file_path):
    """Build a filtered per-word table of rating-wise counts from bag-of-words data.

    Every non-blank line of the ``.feat`` file is parsed as an integer rating
    followed by whitespace-separated ``word_index:count`` pairs. For each
    word index the counts are summed per rating, and the number of documents
    (lines) containing the word is tracked. The resulting table is then
    passed through ``get_correlations`` and ``filter_words``.

    Args:
        feat_file_path: Path to the ``.feat`` file (one document per line).
        vocab_file_path: Path to the vocabulary file; the word on line ``i``
            (0-based) is the word for ``word_index == i``. Indices without a
            vocabulary entry are labelled ``"Unknown"``.

    Returns:
        pandas.DataFrame: the output of ``filter_words``. Before filtering the
        table has the columns ``'word'``, ``'word_doc_count'`` and one column
        per entry of ``RATINGS``; the helper functions add their own columns.

    Side effects:
        Prints the total number of documents read.
    """
    words = extract_word_data(vocab_file_path)
    index_word_dict = dict(enumerate(words))

    # Read in the .feat file. Blank lines carry no document, so they are
    # skipped instead of crashing on the rating lookup further down.
    with open(feat_file_path, 'r', encoding='utf-8') as f:
        lines = [line for line in f if line.strip()]

    total_docs = len(lines)
    print(f"Total documents: {total_docs}")

    rating_counts = {}  # word_index -> {rating: summed count over documents}
    doc_counts = {}     # word_index -> number of documents containing the word

    for line in lines:
        rating_token, *word_count_pairs = line.split()
        rating = int(rating_token)
        document_words = set()  # distinct words seen in this document

        for pair in word_count_pairs:
            word_index, count = map(int, pair.split(":"))
            document_words.add(word_index)
            per_rating = rating_counts.setdefault(word_index, {})
            per_rating[rating] = per_rating.get(rating, 0) + count

        # Count each document exactly once per word. The document count is
        # kept separate from the per-rating counts so that a word's first
        # document is not counted twice.
        for word_index in document_words:
            doc_counts[word_index] = doc_counts.get(word_index, 0) + 1

    # One row per word, in order of first appearance in the .feat file.
    df_data = [
        [index_word_dict.get(word_index, "Unknown"), doc_counts[word_index]]
        + [per_rating.get(rating, 0) for rating in RATINGS]
        for word_index, per_rating in rating_counts.items()
    ]

    # 'word' and 'word_doc_count' come first, followed by one column per rating.
    df = pd.DataFrame(df_data, columns=['word', 'word_doc_count'] + RATINGS)
    df = get_correlations(df)
    df = filter_words(df, total_docs)

    return df

def filter_words(df, total_docs, *, min_doc_presence=0.01, max_doc_presence=0.6,
                 min_frequency_std=0.6, min_correlation=0.85):
    """Keep only words that pass document-frequency, spread and correlation cuts.

    Two columns are added to ``df`` in place before filtering:
    ``'doc_frequency'`` (``'word_doc_count'`` divided by ``total_docs``) and
    ``'frequency_std'`` (row-wise standard deviation of the ``RATINGS``
    columns as stored in ``df``).

    Args:
        df: DataFrame containing ``'word_doc_count'``, ``'correlation'`` and
            one column per entry of ``RATINGS``.
        total_docs: Total number of documents, used as the denominator of the
            document frequency.
        min_doc_presence: Exclusive lower bound on ``'doc_frequency'``.
        max_doc_presence: Exclusive upper bound on ``'doc_frequency'``.
        min_frequency_std: Exclusive lower bound on ``'frequency_std'``.
        min_correlation: Exclusive lower bound on the absolute value of
            ``'correlation'``.

    Returns:
        pandas.DataFrame: an independent copy of the rows of ``df`` that meet
        every criterion. Rows whose correlation is NaN are dropped because
        the comparison evaluates to False for them.
    """
    # Convert 'word_doc_count' to document frequency by dividing by total_docs
    df['doc_frequency'] = df['word_doc_count'] / total_docs

    # Spread of each word's per-rating values across the RATINGS columns
    df['frequency_std'] = df[RATINGS].std(axis=1)

    keep = (
        (df['doc_frequency'] > min_doc_presence)
        & (df['doc_frequency'] < max_doc_presence)
        & (df['frequency_std'] > min_frequency_std)
        # Strongly positive OR strongly negative correlation both qualify.
        & (df['correlation'].abs() > min_correlation)
    )

    # Explicit copy so that later column assignments on the result do not
    # trigger pandas' chained-assignment (SettingWithCopy) warning.
    return df[keep].copy()


def reduce_imdb_data_dimensionality(df, n_features=500):
    """Rank words by a per-word regression coefficient against the ratings.

    For each row of ``df`` a single shared ``LinearRegression`` instance is
    fitted with the values of ``RATINGS`` as the only input feature and the
    row's ``RATINGS`` columns as the target. The first entry of the fitted
    model's ``w`` attribute is recorded as the word's impact.
    NOTE(review): ``w[0]`` is presumably the slope -- confirm against the
    project's ``LinearRegression`` implementation.

    Args:
        df: DataFrame with ``'word'``, ``'correlation'`` and one column per
            entry of ``RATINGS``.
        n_features: Maximum number of rows kept in each returned table.

    Returns:
        tuple: three DataFrames with columns ``'Word'``, ``'Impact'``,
        ``'Absolute Impact'`` and ``'Correlation'``:
        (largest impact first, smallest impact first,
        largest absolute impact first).
    """
    feature_columns = ['Word', 'Impact', 'Absolute Impact', 'Correlation']
    regressor = LinearRegression()
    records = []

    for _, entry in df.iterrows():
        # Built fresh on every iteration so each fit gets its own input array.
        ratings_column = np.array(RATINGS).reshape(-1, 1).astype(float)
        per_rating_values = entry[RATINGS].values.astype(float)
        regressor.fit(ratings_column, per_rating_values)
        weight = regressor.w[0]
        records.append({
            'Word': entry['word'],
            'Impact': weight,
            'Absolute Impact': abs(weight),
            'Correlation': entry['correlation'],
        })

    coef_table = pd.DataFrame(records)

    def _top(sort_key, ascending):
        # Re-wrapping with explicit columns keeps the expected columns
        # present even when no records were produced.
        ordered = pd.DataFrame(coef_table, columns=feature_columns).sort_values(
            by=sort_key, ascending=ascending
        )
        return ordered.head(n_features)

    top_positive_features = _top('Impact', False)
    top_negative_features = _top('Impact', True)
    top_abs_features = _top('Absolute Impact', False)
    return top_positive_features, top_negative_features, top_abs_features



In [191]:

# Build the filtered per-word table from the training bag-of-words file and
# the vocabulary file (paths are relative to the notebook's working directory).
results = process_imdb_data("../aclImdb/train/labeledBow.feat","../aclImdb/imdb.vocab")
# Debug helpers, left disabled.
# NOTE(review): no 'frequency' column is created by the functions defined
# above, so the first line would need a different sort key if re-enabled.
#print(results.sort_values(by='frequency', ascending=False).head(10))
#print(results.shape)
# Rank the surviving words by their regression coefficient (default n_features).
top_positive_features, top_negative_features, top_absolute_features = reduce_imdb_data_dimensionality(results)

Total documents: 25000


In [192]:
# Show the first 10 rows of each ranking produced by
# reduce_imdb_data_dimensionality.
print("Top Positive Features")
print(top_positive_features.head(10))
print("Top Negative Features")
print(top_negative_features.head(10))
print("Top Absolute Features")
print(top_absolute_features.head(10))

# NOTE(review): plot_word_regression is not defined in the visible part of
# this file -- confirm it exists before running this cell.
plot_word_regression(results, "bad")

Top Positive Features
            Word     Impact  Absolute Impact  Correlation
3      excellent  53.945122        53.945122     0.908437
2          young  41.024390        41.024390     0.876922
10   performance  40.097561        40.097561     0.867315
30       perfect  36.524390        36.524390     0.864079
24  performances  26.902439        26.902439     0.898885
0      fantastic  22.597561        22.597561     0.851958
16        superb  21.054878        21.054878     0.859257
8        enjoyed  19.780488        19.780488     0.868926
11        strong  16.091463        16.091463     0.901589
86        simple  14.987805        14.987805     0.851459
Top Negative Features
        Word     Impact  Absolute Impact  Correlation
74    boring -48.317073        48.317073    -0.884543
57      poor -44.030488        44.030488    -0.866326
38  supposed -36.585366        36.585366    -0.882679
19   instead -31.865854        31.865854    -0.858675
85  annoying -24.621951        24.621951    -0.9

# Evaluation and Plotting Functions #

### Evaluation Functions ###

In [193]:
def evaluate_binary_classification(df):
    """Placeholder for evaluating a binary classifier (not implemented yet).

    Intended to use the Receiver Operating Characteristic (ROC) curve and
    the area under the ROC curve (AUROC).

    Args:
        df: Currently unused.

    Returns:
        None.
    """
    return None

def evaluate_multiclass_classification(df):
    """Placeholder for evaluating a multiclass classifier (not implemented yet).

    Intended to compute the classification accuracy.

    Args:
        df: Currently unused.

    Returns:
        None.
    """
    return None

### Plotting Functions ###

In [194]:
def plot_binary_classification(results):
    """Placeholder for the binary-classification plot (not implemented yet).

    Intended to plot the ROC curve and compare it to the decision tree (DT)
    from sklearn.

    Args:
        results: Currently unused.

    Returns:
        None.
    """
    return None

def plot_multiclass_classification(results):
    """Placeholder for the multiclass-classification plot (not implemented yet).

    Intended to compare the accuracy to the decision tree (DT) from sklearn.

    Args:
        results: Currently unused.

    Returns:
        None.
    """
    return None

def plot_top_20_features_from_imdb_simple(results):
    """Placeholder for the simple-regression top-20 feature plot (not implemented yet).

    Intended to draw a horizontal bar plot of the top 20 features from the
    simple linear regression on the IMDB data:

    - x-axis: the 10 most positive and 10 most negative coefficients
    - y-axis: the feature names (i.e., words)

    Args:
        results: Currently unused.

    Returns:
        None.
    """
    return None
    
def plot_model_convergence(results, learning_rate):
    """Placeholder for the convergence plot (not implemented yet).

    Intended to show how the logistic and multiclass regression converge
    given a reasonably chosen learning rate.

    Args:
        results: Currently unused.
        learning_rate: Currently unused.

    Returns:
        None.
    """
    return None

def plot_imdb_data_roc_curve(results):
    """Placeholder for the IMDB ROC-curve plot (not implemented yet).

    Intended to produce a single plot containing two ROC curves, one for
    logistic regression and one for sklearn-DT (Decision Trees), on the
    IMDB test data.

    Args:
        results: Currently unused.

    Returns:
        None.
    """
    return None

def plot_imdb_data_auroc(results):
    """Placeholder for the IMDB AUROC bar plot (not implemented yet).

    Intended to show the AUROC of logistic regression and DT on the test
    data (y-axis) as a function of the 20%, 40%, 60%, 80%, and 100%
    training data (x-axis).

    Args:
        results: Currently unused.

    Returns:
        None.
    """
    return None

def plot_news_data_classification_accuracy(results):
    """Placeholder for the news-data accuracy bar plot (not implemented yet).

    Intended to show the classification accuracies of multiclass regression
    and DT on the test data (y-axis) as a function of the 20%, 40%, 60%,
    80%, and 100% training data (x-axis).

    Args:
        results: Currently unused.

    Returns:
        None.
    """
    return None

def plot_top_20_features_from_imdb_logistic(results):
    """Placeholder for the logistic-regression top-20 feature plot (not implemented yet).

    Intended to draw a horizontal bar plot of the top 20 features (10 most
    positive and 10 most negative) from the logistic regression on the IMDB
    data, with the coefficient as the x-axis and the feature names
    (i.e., words) as the y-axis.

    Args:
        results: Currently unused.

    Returns:
        None.
    """
    return None

def plot_heatmap_of_multi_classification(results):
    """Placeholder for the multi-class feature heatmap (not implemented yet).

    Intended to draw a heatmap with the top 5 most positive features of each
    class as rows and the 4 chosen classes from the 20-news-group dataset as
    columns, i.e. a heatmap of dimension 20-by-4.

    Args:
        results: Currently unused.

    Returns:
        None.
    """
    return None

# Experiments #

### Experiment 1 ###

In [195]:
def experiment_one():
    """Placeholder for experiment 1 (not implemented yet).

    Intended to report, on the IMDB data using simple linear regression on
    the movie rating scores:

    - the top 10 features with the most positive coefficients
    - the top 10 features with the most negative coefficients

    Returns:
        None.
    """
    return None

### Experiment 2 ###

In [196]:
def experiment_two():
    """Placeholder for experiment 2 (not implemented yet).

    Intended to implement and conduct:

    - binary classification on the IMDb Reviews
    - multi-class classification on the 20 news group dataset

    Returns:
        None.
    """
    return None

### Experiment 3 ###

In [197]:
def experiment_three():
    """Placeholder for experiment 3 (not implemented yet).

    Intended to draw, on the same plot as 2, the ROC curves of logistic
    regression and Decision Trees on the IMDB data binary classification
    task, and to report their AUROC values.

    Returns:
        None.
    """
    return None

### Experiment 4 ###

In [198]:
def experiment_four():
    """Placeholder for experiment 4 (not implemented yet).

    Intended to report the multiclass classification accuracy of multiclass
    linear regression and of Decision Trees on the 5 chosen classes from the
    20-news-group data.

    Returns:
        None.
    """
    return None

### Experiment 5 ###

In [199]:
def experiment_five():
    """Placeholder for experiment 5 (not implemented yet).

    Intended to plot and compare the accuracy of the two models as a
    function of the dataset size by controlling the training size.

    For example: randomly select 20%, 40%, 60% and 80% of the available
    training data, train the model on that subset, and evaluate the trained
    model on the held-out test set.

    Returns:
        None.
    """
    return None

### Experiment 6 ###

In [200]:
def experiment_six():
    """Placeholder for experiment 6 (not implemented yet).

    Intended to compare and evaluate the performance of different learning
    rates.

    Returns:
        None.
    """
    return None

### Experiment 7 ###

In [201]:
def experiment_seven():
    """Placeholder for experiment 7 (not implemented yet).

    Intended to evaluate the performance of the multiclass regression on
    more than 5 classes by comparing the top k (e.g. k = 3) predicted
    classes: a prediction is correct when the true label is within the top
    k predicted labels, scoring 1 in that case and 0 otherwise.

    Returns:
        None.
    """
    return None