# Import packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Show up to 50 columns when a DataFrame is displayed, so wide frames are not truncated.
pd.options.display.max_columns = 50

# Read data

In [None]:
# Load the raw dataset from a hard-coded absolute path (adjust for the local machine).
data = pd.read_csv('/home/asberk/data/1-Donaldson/AllData.csv')

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# from pprint import pprint
# pprint(data.columns.tolist())

## Cleaning up the junk

In [None]:
def checkColumn(df, colNum):
    """
    Tell whether the column at position `colNum` holds a single repeated value.

    The first entry of the column is compared (with `==`) against every
    later entry; the result is truthy when all of those comparisons hold.
    Helper for throwAwayUnchanged.
    """
    column = df.iloc[:, colNum]
    firstValue = column.iloc[0]
    laterValues = column.iloc[1:]
    return np.all(laterValues == firstValue)


def throwAwayUnchanged(df):
    """
    Drop every column whose value is identical in all rows.

    Written for the Midvale project data, but may also be handy on
    per-group subsets. Columns are tested one at a time with
    checkColumn; the input frame is not modified, a reduced frame is
    returned.
    """
    constantLabels = [label
                      for position, label in enumerate(df.columns)
                      if checkColumn(df, position)]
    return df.drop(columns=constantLabels)


def throwAwayBizarre(df):
    """
    Keep only the rows whose TotalBytes is zero or positive.

    A negative byte count does not make sense, so such rows are removed.
    """
    nonNegative = df['TotalBytes'].ge(0)
    return df[nonNegative]


def removeUnwanted(data):
    """
    Strip the rows and columns that are not needed for the Midvale task.

    Steps, in order: keep the `Mode == 0` group, drop `Flicker`, drop
    constant columns, drop rows with negative `TotalBytes`, drop `Index`.
    """
    # High Performance mode is out of scope here, so only Mode 0 is kept.
    subset = data.groupby('Mode').get_group(0)
    # Flicker is not useful for prediction.
    subset = subset.drop(columns='Flicker')
    # Columns that never change carry no information
    # (on `data` this removes Sharpening, WaitSeconds and Status).
    subset = throwAwayUnchanged(subset)
    subset = throwAwayBizarre(subset)
    return subset.drop(columns='Index')


def fixMiscValues(df):
    """
    Patch up placeholder values in the Midvale project data.

    Missing `TertiaryResolution` entries become the string 'NaN', every
    '-' entry in the frame becomes 0, and the secondary/tertiary bit-rate
    columns are cast to float64. A new frame is returned.
    """
    cleaned = df.fillna({'TertiaryResolution': 'NaN'}).replace('-', value=0)
    return cleaned.astype({'SecondaryBitsPerSecond': np.float64,
                           'TertiaryBitsPerSecond': np.float64})


def preProcess(df):
    """
    Full cleaning pipeline for the Midvale project data:
    removeUnwanted first, then fixMiscValues on its result.
    """
    return fixMiscValues(removeUnwanted(df))

In [None]:
# Run the cleaning pipeline defined above and rebind `data` to the cleaned frame.
data = preProcess(data)

# Make a subset of the data for a simpler time

First we have to figure out the subset...

Roger recommended sticking with `Test == Base` and a single camera. Let's choose the camera with the most observations.

In [None]:
# Restrict to the `Test == 'Base'` rows and group them by camera.
# Grouping by the plain column name (not a one-element list) keeps the
# group keys scalar, so get_group() can be called with a scalar below.
Base_gb_CameraName = data.loc[data['Test'] == 'Base'].groupby('CameraName')
# Pick the camera with the most rows. idxmax() returns the group *label*;
# argmax() returns an integer position in current pandas, which is not a
# valid key for get_group().
CameraName_highestCountOf_Base = Base_gb_CameraName.size().idxmax()
data_A3Base = Base_gb_CameraName.get_group(CameraName_highestCountOf_Base)
# Both columns are constant within this subset, so they carry no information.
data_A3Base = data_A3Base.drop(['CameraName', 'Test'], axis=1)
# Later cells refer to this same subset as `data_BaseA3`; bind that name too
# so they run from a fresh kernel.
data_BaseA3 = data_A3Base

In [None]:
# Preview the single-camera, Test == 'Base' subset.
data_A3Base.head()

# Data exploration

## Histogram of continuous

In [None]:
# Histogram of log(TotalBytes) for the subset (trailing `;` hides the cell output).
# NOTE(review): a TotalBytes of 0 would give log(0) = -inf — confirm none are present.
plt.hist(np.log(data_A3Base['TotalBytes'].values));

In [None]:
# Histogram of log(PrimaryBitsPerSecond) for the subset (trailing `;` hides the cell output).
# NOTE(review): assumes every PrimaryBitsPerSecond is strictly positive — confirm.
plt.hist(np.log(data_A3Base['PrimaryBitsPerSecond'].values));

In [None]:
# Histogram of log(SecondaryBitsPerSecond) for the subset.
# fixMiscValues replaces '-' entries with 0, so this column can contain
# zeros; log(0) is -inf, which plt.hist cannot bin. Only strictly positive
# rates are plotted.
secondaryBPS = data_A3Base['SecondaryBitsPerSecond'].values
plt.hist(np.log(secondaryBPS[secondaryBPS > 0]));

In [None]:
def logTransformColumns(df, columns):
    """
    Return a copy of `df` with a natural-log column added for each name
    in `columns`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to transform. It is not modified.
    columns : iterable of str
        Names of the columns to transform. Each new column is named
        'log' + <original name>.

    Returns
    -------
    pandas.DataFrame
        `df` plus the new 'log...' columns.
    """
    # Compute each log eagerly. Lambdas created in a comprehension all
    # close over the same loop variable, so every new column would end up
    # as the log of the *last* entry of `columns`.
    logDict = {'log' + col: np.log(df[col]) for col in columns}
    return df.assign(**logDict)
        

In [None]:
# Add log-transformed versions of the byte-count / bit-rate columns.
data = data.assign(logTotalBytes = lambda x: np.log(x.TotalBytes))
# For the bit rates, non-positive entries should map to 0 rather than -inf.
# Writing `np.log(s) if s > 0 else 0` on a whole column asks for the truth
# value of a Series and raises ValueError, so the guard is applied
# element-wise: non-positive (and missing) entries are swapped for 1 before
# the log, and log(1) == 0.
for newName, srcName in [('logPrimaryBPS', 'PrimaryBitsPerSecond'),
                         ('logSecondaryBPS', 'SecondaryBitsPerSecond'),
                         ('logTertiaryBPS', 'TertiaryBitsPerSecond')]:
    rate = data[srcName]
    data = data.assign(**{newName: np.log(rate.where(rate > 0, 1.0))})

In [None]:
# Add 'log'-prefixed columns for the listed names using logTransformColumns.
# An existing column with the same name (e.g. logTotalBytes) is overwritten.
data = logTransformColumns(data, ['TotalBytes', 'PrimaryBitsPerSecond', 
                                  'SecondaryBitsPerSecond'])

In [None]:
# Preview the frame with the added log columns.
data.head()

## Histogram of categoricals

In [None]:
def hist_colVals(X, **kwargs):
    """
    Draw a bar chart of how often each distinct value occurs in `X`.

    X : a categorical column of a data frame
    kwargs : forwarded unchanged to plt.bar

    Bars are drawn on the current matplotlib axes in the order given by
    value_counts(), with the values themselves as vertical tick labels.
    No check is made that `X` is actually categorical.
    """
    counts = X.value_counts()
    positions = np.arange(len(counts))
    labels = counts.index.tolist()
    plt.bar(positions, counts.values, **kwargs)
    plt.xticks(positions, labels, rotation=90)
    
    

In [None]:
# make a histogram of these
# (these are the *useful* categories for CameraName:A3;Test:Base)
categs = ['PrimaryResolution', 'SecondaryResolution',
          'Keyframe', 'ImageRate', 'Quality',
          'Detail', 'Motion']

C = len(categs)
ncols = 4
# Enough rows to hold all C plots at `ncols` per row. The divisor must be
# `ncols` (a fixed 5 under-allocates once C exceeds 8), and the builtin
# int replaces np.int, which no longer exists in current NumPy.
nrows = int(np.ceil(C / ncols))
figwidth = 20
figheight = 20 #np.int(np.min([np.ceil(20/ncols*nrows), 20]))

# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(nrows, ncols, figsize=(figwidth, figheight),
                         squeeze=False)
axes = axes.ravel()
for ax, categ in zip(axes, categs):
    # hist_colVals draws on the current axes, so select the panel first.
    plt.sca(ax)
    # `data_A3Base` is the single-camera Base subset built earlier.
    hist_colVals(data_A3Base[categ])
    plt.xlabel(categ)
# Hide the panels that are left over in the grid.
for ax in axes[C:]:
    ax.axis('off')

## Sorting out `TotalBytes` and some categorical features

In [None]:
# Order the subset by TotalBytes (ascending, the sort_values default) to inspect both extremes.
sorted_data_BaseA3 = data_BaseA3.sort_values(by='TotalBytes')

In [None]:
# Rows with the smallest TotalBytes.
sorted_data_BaseA3.head()

In [None]:
# Rows with the largest TotalBytes.
sorted_data_BaseA3.tail()

## Finding correlations with PrimaryBitsPerSecond

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Cumulative histogram of log(PrimaryBitsPerSecond) with 1000 bins.
plt.hist(np.log(data_BaseA3['PrimaryBitsPerSecond']), bins=1000, cumulative=True);

In [None]:
# Add the log of PrimaryBitsPerSecond as a new column; it is used as the regression target below.
data_BaseA3 = data_BaseA3.assign(logPrimaryBitsPerSecond = lambda x: np.log(x.PrimaryBitsPerSecond))

In [None]:
# Standardise Quality and logPrimaryBitsPerSecond together.
# The result is a 2-column array: column 0 is Quality, column 1 is the log bit rate.
scaler = StandardScaler()
qualityResponse = scaler.fit_transform(data_BaseA3.loc[:, ['Quality', 'logPrimaryBitsPerSecond']])

In [None]:
# 2-D histogram (20x20 bins) of the standardised pair computed above.
# Both axes therefore show scaled values, not the raw Quality / log bit rate.
plt.hist2d(qualityResponse[:,0],
           qualityResponse[:,1],
           bins=20);
plt.xlabel('Quality')
plt.ylabel('log(PrimaryBitsPerSecond)');

In [None]:
from sklearn.linear_model import LinearRegression, ElasticNetCV
from sklearn.model_selection import train_test_split

In [None]:
# Features: Quality, ImageRate, Keyframe (in that column order); target: logPrimaryBitsPerSecond.
# No random_state is passed, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(data_BaseA3.loc[:,['Quality', 'ImageRate', 'Keyframe']].values,
                                                    data_BaseA3['logPrimaryBitsPerSecond'].values)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The `normalize=` argument of ElasticNetCV was deprecated in scikit-learn
# 1.0 and removed in 1.2. Scaling the features in a pipeline step is the
# documented replacement and also works on older releases. `en` keeps the
# same fit / predict / score interface.
en = make_pipeline(StandardScaler(), ElasticNetCV())
en.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out split.
en.score(X_test, y_test)

In [None]:
# 2-D histogram of raw Quality against log(PrimaryBitsPerSecond) ...
plt.hist2d(data_BaseA3['Quality'],
           np.log(data_BaseA3['PrimaryBitsPerSecond']),
           bins=20);
plt.xlabel('Quality')
plt.ylabel('log(PrimaryBitsPerSecond)');
# ... with the fitted model drawn on top as a function of Quality.
# `en` was fitted on three features (Quality, ImageRate, Keyframe), so
# predict() needs all three columns; passing Quality alone raises a shape
# error. The other two features are held at their medians while Quality
# sweeps 0..20.
featureNames = ['Quality', 'ImageRate', 'Keyframe']
qual = np.arange(0, 21)
featureGrid = np.tile(data_BaseA3[featureNames].median().values.astype(float),
                      (qual.size, 1))
featureGrid[:, 0] = qual
logPBPS_pred = en.predict(featureGrid)
plt.plot(qual, logPBPS_pred, 'r-');

Whaddaya know: (log) bit rate is negatively correlated with Quality...

# Encoding features for regression

## Encoding the categoricals

If there are any categorical columns, we should probably encode them as integers so that we can regress on them.

In [None]:
def setUpCategs(data, sparse=False):
    """
    One-hot encode the categorical columns of `data`.

    The categorical set is every object-dtype column plus `Nonlinear`,
    minus `Message`. Each column is first turned into integer codes with
    a LabelEncoder and the coded frame is then one-hot encoded.

    sparse : when true the sparse matrix from OneHotEncoder is returned
             as is; otherwise it is converted to a dense array.
    """
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder

    labeller = LabelEncoder()
    oneHot = OneHotEncoder()

    objectColumns = data.select_dtypes(include=['object'])
    selected = pd.concat((objectColumns, data['Nonlinear']), axis=1)
    selected = selected.drop('Message', axis=1)
    # The same encoder is re-fitted on each column in turn.
    coded = selected.apply(labeller.fit_transform)
    encoded = oneHot.fit_transform(coded)
    return encoded if sparse else encoded.toarray()


In [None]:
# Dense one-hot matrix of the categorical columns (sparse defaults to False).
categoricals = setUpCategs(data)

## Now for the continuous

In [None]:
# Columns treated as continuous predictors.
continuous = data.loc[:, ['Keyframe', 'ImageRate', 'Quality', 'KbpsLimit', 'CollectSeconds']]

## Now the response(s)
TotalBytes should never be negative so far as I'm aware, so let's fix this:

In [None]:
# Keep strictly positive byte counts; every other entry becomes 0.
data['TotalBytes'] = data['TotalBytes'].where(data['TotalBytes'] > 0, 0)

In [None]:
# Candidate response variables; `responses` is an array with one column per name, in this order.
response_names = ['TotalBytes', 'PrimaryBitsPerSecond', 'SecondaryBitsPerSecond', 'TertiaryBitsPerSecond']
responses = data.filter(items=response_names).values

In [None]:
# Histogram of log(TotalBytes), the first response column.
# TotalBytes was clamped to be >= 0 above, so zeros are possible; log(0) is
# -inf and cannot be binned, hence only strictly positive counts are plotted.
totalBytesResponse = responses[:, 0]
plt.hist(np.log(totalBytesResponse[totalBytesResponse > 0]), stacked=True);
# legend() expects a sequence of labels; a bare string is iterated
# character by character, which would label the data just 'T'.
plt.legend([response_names[0]]);

In [None]:
# Two-column array: column 0 is TotalBytes, column 1 is PrimaryBitsPerSecond.
testingCorrelation = data.loc[:,['TotalBytes', 'PrimaryBitsPerSecond']].values

In [None]:
# Log-log scatter of |TotalBytes| against PrimaryBitsPerSecond; low alpha reveals point density.
plt.plot(np.log(np.abs(testingCorrelation[:,0])), np.log(testingCorrelation[:,1]), '.', alpha=.1)

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# `lb` and `oh` only exist as local variables inside setUpCategs, so the
# encoders have to be created at notebook scope before they can be used.
lb = LabelEncoder()
oh = OneHotEncoder()
# Label-encode CameraName to integer codes, then one-hot encode the single
# code column (OneHotEncoder expects 2-D input, hence the reshape).
ohCameraName = oh.fit_transform(lb.fit_transform(data['CameraName'].values.ravel()).reshape(-1,1))

In [None]:
# (number of rows, number of one-hot columns) of the encoded CameraName.
ohCameraName.shape

In [None]:
# Columns to turn into integer codes in the next cell.
# NOTE(review): removeUnwanted keeps only the Mode == 0 group and then drops
# constant columns, so 'Mode' may no longer be present in `data` — confirm.
features_to_encode = ['CameraName', 'PrimaryResolution', 'SecondaryResolution', 
                      'Nonlinear', 'Mode', 'Test', 'Detail', 'Motion']

In [None]:
# LabelEncoder is otherwise only imported inside setUpCategs, so import it
# at notebook scope for this cell.
from sklearn.preprocessing import LabelEncoder

# The cleaning step drops constant columns, so a requested feature may no
# longer exist in `data`; report such names instead of failing with KeyError.
missingFeatures = [name for name in features_to_encode
                   if name not in data.columns]
if missingFeatures:
    print('Skipping columns not present in data:', missingFeatures)

# Add an integer-coded '<name>Enc' column for every available feature.
for ftrName in features_to_encode:
    if ftrName not in data.columns:
        continue
    le = LabelEncoder()
    data[ftrName+'Enc'] = le.fit_transform(data[ftrName])

In [None]:
# Preview the frame including the new '...Enc' columns.
data.head()

# Notes

* We really want to predict `log(PrimaryBitsPerSecond)`. And we probably want to scale it first. 
* What are the other variables we want to scale? 
* Are we allowed to use Quality in our prediction? 