## Pre-Process Data ##

In [None]:
import pandas as pd
import matplotlib as plt
import numpy as np
import seaborn as sns
import sklearn as sk
import sklearn.datasets
import os
from LinearRegression import LinearRegression
from LogisticRegression import LogisticRegression
from MultiClassRegression import MultiClassRegression

### Extract, Transform, Load (ETL) ###

In [None]:
def extract_word_data(file_path):
    """
    Extract word data from .vocab file

    The first whitespace-delimited token of every line is taken as the word, so
    the position of a word in the returned array equals its 0-based line number
    in the file.

    Parameters:
    - file_path: path of the vocabulary file to read

    Returns:
    - numpy array: (n_word_indices); a blank line contributes an empty string so
      that the indices of the following words stay aligned with their line numbers
    """
    # Read as UTF-8 explicitly: the platform default encoding is not guaranteed
    # to decode vocabulary entries that contain non-ASCII characters.
    with open(file_path, 'r', encoding='utf-8') as f:
        # `line.split()` is empty for a blank line; fall back to "" instead of
        # raising IndexError.
        words = [(line.split() or [""])[0] for line in f]

    return np.array(words)

def processs_imdb_data(feat_file_path, vocab_file_path):
    """
    Count, for every word, how often it occurs in reviews of each rating.

    The .vocab and .feat files contain enough information to create the features and labels.
    However, to select the top n_features based on their regression coefficients we need to count the
    number of times each word occurs with each rating.

    Each line of the .feat file starts with the rating of that review, followed by
    whitespace-separated (word_index:count) pairs. Blank lines are ignored.

    Parameters:
    - feat_file_path: path of the .feat file to read
    - vocab_file_path: path of the .vocab file, passed to extract_word_data

    Return:
    - Pandas Dataframe: columns = [feature_index, word, 1,2,3,4,5,6,7,8,9,10]
      One row per word index seen in the .feat file, in order of first appearance.
      Columns 1-10 hold the summed counts of that word over all reviews with the
      respective rating (0 when the word never occurs with that rating). `word` is
      "Unknown" when the index has no entry in the vocabulary.
    """
    words = extract_word_data(vocab_file_path)
    # Position in the vocabulary file is the word index used by the .feat file.
    index_word_dict = dict(enumerate(words))

    # Nested dictionary: word_index -> {rating -> accumulated count}
    word_counts = {}
    with open(feat_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank (e.g. trailing) line carries no rating; skip it.
                continue
            rating = int(fields[0])
            for word_count_pair in fields[1:]:
                word_index, count = map(int, word_count_pair.split(":"))
                per_rating = word_counts.setdefault(word_index, {})
                per_rating[rating] = per_rating.get(rating, 0) + count

    # Every rating from 1 to 10 gets a column, matching the documented layout.
    ratings = list(range(1, 11))
    df_data = [
        [word_index, index_word_dict.get(word_index, "Unknown")]
        + [counts.get(rating, 0) for rating in ratings]
        for word_index, counts in word_counts.items()
    ]

    df = pd.DataFrame(df_data, columns=['feature_index', 'word'] + ratings)

    return df
    

def reduce_imdb_data_dimensionality(features, ratings, n_features=500):
    """
    Intended to shrink the IMDB feature set to the top n_features, ranked by the
    absolute value of their regression coefficients with the rating scores (1-10).

    NOTE: unfinished. Only a LinearRegression instance is created; nothing is
    fitted, no features are selected and nothing is returned.
    """
    regressor = LinearRegression()

def reduce_news_data_dimensionality(n_categories=5, min_word_freq=10):
    """
    Bring down size of sk learn dataset by:
    - Picking 5 categories
    - Filtering out rare words

    Parameters:
    - n_categories: how many of the hard-coded categories below to load, taken
      from the front of the list (at most 5 are available)
    - min_word_freq: intended threshold for dropping rare words.
      NOTE: accepted but not applied yet; no word filtering is performed.

    Returns:
    - Pandas Dataframe: single column "Text" holding the training posts of the
      selected categories with headers, footers and quotes removed
    """
    categories = ['comp.graphics', 'rec.sport.hockey', 'sci.med', 'soc.religion.christian', 'talk.politics.guns']
    selected = categories[:n_categories]
    # Pass the selection to the loader; without `categories=` all 20 groups are fetched.
    data = sklearn.datasets.fetch_20newsgroups(
        subset='train',
        categories=selected,
        remove=('headers', 'footers', 'quotes'),
    )
    df = pd.DataFrame(data.data, columns=["Text"])

    return df

In [None]:

# Build the per-word rating-count table from the IMDB training files.
results = processs_imdb_data(
    feat_file_path="../aclImdb/train/labeledBow.feat",
    vocab_file_path="../aclImdb/imdb.vocab",
)
# Preview the first ten rows, then show the table shape
print(results.head(10))
print(results.shape)
# Drop rows containing missing values in place and show the shape again
results.dropna(inplace=True)
print(results.shape)

# Evaluation and Plotting Functions #

### Evaluation Functions ###

In [None]:
def evaluate_binary_classification(df):
    """
    Evaluate a binary classifier with the Receiver Operating Characteristic (ROC)
    curve and the area under that curve (AUROC).

    Parameters:
    - df: data to evaluate (layout not defined yet)

    Returns:
    - None; placeholder, no evaluation is performed yet
    """
    return None

def evaluate_multiclass_classification(df):
    """
    Evaluate a multiclass classifier by its classification accuracy.

    Parameters:
    - df: data to evaluate (layout not defined yet)

    Returns:
    - None; placeholder, no accuracy is computed yet
    """
    return None

### Plotting Functions ###

In [None]:
def plot_binary_classification(results):
    """
    Plot the ROC curve of the binary classifier and compare it to the
    Decision Tree (DT) from sklearn.

    Parameters:
    - results: results to plot (layout not defined yet)

    Returns:
    - None; placeholder, nothing is plotted yet
    """
    return None

def plot_multiclass_classification(results):
    """
    Plot the multiclass accuracy and compare it to the Decision Tree (DT)
    from sklearn.

    Parameters:
    - results: results to plot (layout not defined yet)

    Returns:
    - None; placeholder, nothing is plotted yet
    """
    return None

def plot_top_20_features_from_imdb_simple(results):
    """
    Horizontal bar plot of the top 20 features from the simple linear regression
    on the IMDB data.

    Characteristics:
    - x-axis: the 10 most positive and the 10 most negative coefficients
    - y-axis: the feature names (i.e., words)

    Parameters:
    - results: results to plot (layout not defined yet)

    Returns:
    - None; placeholder, nothing is plotted yet
    """
    return None
    
def plot_model_convergence(results, learning_rate):
    """
    Convergence plot showing how the logistic and multiclass regression converge
    given a reasonably chosen learning rate.

    Parameters:
    - results: results to plot (layout not defined yet)
    - learning_rate: learning rate the plot refers to

    Returns:
    - None; placeholder, nothing is plotted yet
    """
    return None

def plot_imdb_data_roc_curve(results):
    """
    Single plot holding two ROC curves on the IMDB test data: one for logistic
    regression and one for sklearn-DT (Decision Trees).

    Parameters:
    - results: results to plot (layout not defined yet)

    Returns:
    - None; placeholder, nothing is plotted yet
    """
    return None

def plot_imdb_data_auroc(results):
    """
    Bar plot of the AUROC of logistic regression and DT on the test data (y-axis)
    as a function of the 20%, 40%, 60%, 80%, and 100% training data (x-axis).

    Parameters:
    - results: results to plot (layout not defined yet)

    Returns:
    - None; placeholder, nothing is plotted yet
    """
    return None

def plot_news_data_classification_accuracy(results):
    """
    Bar plot of the classification accuracies of multiclass regression and DT on
    the test data (y-axis) as a function of the 20%, 40%, 60%, 80%, and 100%
    training data (x-axis).

    Parameters:
    - results: results to plot (layout not defined yet)

    Returns:
    - None; placeholder, nothing is plotted yet
    """
    return None

def plot_top_20_features_from_imdb_logistic(results):
    """
    Horizontal bar plot of the top 20 features (10 most positive and 10 most
    negative) from the logistic regression on the IMDB data.

    Characteristics:
    - x-axis: the coefficient
    - y-axis: the feature names (i.e., words)

    Parameters:
    - results: results to plot (layout not defined yet)

    Returns:
    - None; placeholder, nothing is plotted yet
    """
    return None

def plot_heatmap_of_multi_classification(results):
    """
    Heatmap for the multi-class classification on the 4 chosen classes from the
    20-news group dataset: the top 5 most positive features of each class as
    rows and the classes as columns, i.e. a 20-by-4 heatmap.

    Parameters:
    - results: results to plot (layout not defined yet)

    Returns:
    - None; placeholder, nothing is plotted yet
    """
    return None

# Experiments #

### Experiment 1 ###

In [None]:
def experiment_one():
    """
    Simple linear regression on the movie rating scores of the IMDB data.

    Report the:
    - top 10 features with the most positive coefficients
    - top 10 features with the most negative coefficients

    Returns:
    - None; placeholder, the experiment is not implemented yet
    """
    return None

### Experiment 2 ###

In [None]:
def experiment_two():
    """
    Implement and conduct:
    - Binary classification on the IMDb Reviews
    - Multi-class classification on the 20 news group dataset

    Returns:
    - None; placeholder, the experiment is not implemented yet
    """
    return None

### Experiment 3 ###

In [None]:
def experiment_three():
    """
    IMDB binary classification task: on the same plot as 2, draw the ROC curves
    and report the AUROC values of logistic regression and Decision Trees.

    Returns:
    - None; placeholder, the experiment is not implemented yet
    """
    return None

### Experiment 4 ###

In [None]:
def experiment_four():
    """
    Report the multiclass classification accuracy on the 5 chosen classes from
    the 20-news-group data for:
    - multiclass linear regression
    - Decision Trees

    Returns:
    - None; placeholder, the experiment is not implemented yet
    """
    return None

### Experiment 5 ###

In [None]:
def experiment_five():
    """
    Plot and compare the accuracy of the two models as a function of the size of
    the dataset, by controlling the training size.

    For example: randomly select 20%, 40%, 60% and 80% of the available training
    data, train the model on that subset and evaluate the trained model on the
    held-out test set.

    Returns:
    - None; placeholder, the experiment is not implemented yet
    """
    return None

### Experiment 6 ###

In [None]:
def experiment_six():
    """
    Compare and evaluate the performance of different learning rates.

    Returns:
    - None; placeholder, the experiment is not implemented yet
    """
    return None

### Experiment 7 ###

In [None]:
def experiment_seven():
    """
    Evaluate the performance of the multiclass regression on more than 5 classes.

    Compare the top k (e.g. k = 3) predicted classes:
    - a prediction is correct when the true label is within the top k predicted labels
    - the score is 1 if the correct label is among the top k predictions and 0 otherwise

    Returns:
    - None; placeholder, the experiment is not implemented yet
    """
    return None