In [76]:
from typing import Any

import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')
# plt.style.use('dark_background')

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import BaseEstimator
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import RidgeClassifier

In [77]:
# Regression data path
BITRATE_PREDICTION_TRAIN = 'bitrate_prediction/bitrate_train.csv'
BITRATE_PREDICTION_TEST = 'bitrate_prediction/bitrate_test.csv'

# Classification data path
STREAM_CLASSIFICATION_TRAIN = 'stream_quality_data/train_data.csv'
STREAM_CLASSIFICATION_TEST = 'stream_quality_data/test_data.csv'

# read csv
bitrate_reg_train = pd.read_csv(BITRATE_PREDICTION_TRAIN)
bitrate_reg_test = pd.read_csv(BITRATE_PREDICTION_TEST)

stream_class_train = pd.read_csv(STREAM_CLASSIFICATION_TRAIN)
stream_class_test = pd.read_csv(STREAM_CLASSIFICATION_TEST)

In [78]:
print("bitrate train data shape: ", bitrate_reg_train.shape)
print("bitrate test data shape: ", bitrate_reg_test.shape)

print("stream train data shape: ", stream_class_train.shape)
print("stream test data shape: ", stream_class_test.shape)

bitrate train data shape:  (379021, 10)
bitrate test data shape:  (228145, 10)
stream train data shape:  (406572, 12)
stream test data shape:  (243596, 12)


In [79]:
bitrate_reg_train.isnull().sum()

fps_mean               0
fps_std                0
rtt_mean               0
rtt_std                0
dropped_frames_mean    0
dropped_frames_std     0
dropped_frames_max     0
bitrate_mean           0
bitrate_std            0
target                 0
dtype: int64

In [80]:
stream_class_train.isnull().sum()

fps_mean               0
fps_std                0
fps_lags               0
rtt_mean               0
rtt_std                0
dropped_frames_mean    0
dropped_frames_std     0
dropped_frames_max     0
auto_bitrate_state     0
auto_fec_state         0
auto_fec_mean          0
stream_quality         0
dtype: int64

In [81]:
bitrate_reg_train.describe().T.style.bar(
    subset=['mean'], color='#606ff2'
).background_gradient(
    subset=['std'], cmap='PuBu'
).background_gradient(
    subset=['50%'], cmap='PuBu'
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fps_mean,379021.0,35.231127,10.97501,10.0,28.8,30.0,43.6,125.8
fps_std,379021.0,1.725705,2.505942,0.0,0.316228,0.942809,2.233582,307.167273
rtt_mean,379021.0,49.623858,94.781098,0.0,14.3,32.2,55.9,12898.4
rtt_std,379021.0,12.763672,112.68446,0.0,0.699206,1.433721,4.948625,40721.933293
dropped_frames_mean,379021.0,0.180451,1.73289,0.0,0.0,0.0,0.0,540.0
dropped_frames_std,379021.0,0.469548,3.157866,0.0,0.0,0.0,0.0,202.38577
dropped_frames_max,379021.0,1.450719,9.670928,0.0,0.0,0.0,0.0,640.0
bitrate_mean,379021.0,7516.585502,6073.992189,0.0,2773.3,6287.2,10187.2,64913.5
bitrate_std,379021.0,1603.487501,1721.021623,0.0,383.68355,1112.71001,2241.848801,26908.532303
target,379021.0,7525.396231,6070.817736,0.0,2785.0,6296.0,10192.0,64913.0


In [82]:
stream_class_train.describe().T.style.bar(
    subset=['mean'], color='#606ff2'
).background_gradient(
    subset=['std'], cmap='PuBu'
).background_gradient(
    subset=['50%'], cmap='PuBu'
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fps_mean,406572.0,34.497561,11.625494,0.0,28.3,30.0,43.0,127.1
fps_std,406572.0,2.285486,3.708531,0.0,0.316228,0.994429,2.590581,312.540842
fps_lags,406572.0,0.183151,1.099384,0.0,0.0,0.0,0.0,10.0
rtt_mean,406572.0,54.3144,133.872062,0.0,14.1,32.3,57.1,12898.4
rtt_std,406572.0,19.525019,156.364337,0.0,0.699206,1.490712,5.334375,40721.933293
dropped_frames_mean,406572.0,1730432.423986,49300910.708629,0.0,0.0,0.0,0.0,2097288600.0
dropped_frames_std,406572.0,137827.870355,9229775.537018,0.0,0.0,0.0,0.0,996375136.438125
dropped_frames_max,406572.0,1893338.780248,52410034.996107,0.0,0.0,0.0,0.0,2097288600.0
auto_fec_mean,406572.0,51.413536,34.836045,0.0,50.0,50.0,50.0,250.0
stream_quality,406572.0,0.06846,0.252534,0.0,0.0,0.0,0.0,1.0


In [83]:
def evaluate_model(
        model: str,
        x_train=False,
        y_train=False,
        x_test=False,
        y_test=False,
        metrics=None,
        fit_model=False
) -> tuple[Any, Any, Any]:
    if metrics is None:
        metrics = {"r2_score": r2_score, "mse": mean_squared_error}
    if type(x_test) == bool or type(y_test) == bool:
        x_test = x_train
        y_test = y_train

    if fit_model:
        model.fit(x_train, y_train)

    y_pred = model.predict(x_test)

    scores = {}
    for t in metrics.keys():
        scores.update({t: metrics[t](y_test, y_pred)})

    return scores, model, y_pred


In [84]:
class Classifier(BaseEstimator):
    def __init__(self, estimator, threshold=0.5):
        self.estimator = estimator
        self.threshold = threshold

    def fit(self, x, y):
        self.estimator.fit(x, y)

    def predict(self, x):
        return self.estimator.predict_proba(x)[:, 0] < self.threshold

    def get_params(self, deep=True):
        return {'threshold': self.threshold, 'subestimator': self.estimator}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self


class Polynomial(BaseEstimator):
    def __init__(self, degree=2, estimator=LinearRegression()):
        self.poly_transform = None
        self.degree = degree
        self.estimator = estimator

    def fit(self, X, y):
        self.poly_transform = PolynomialFeatures(self.degree).fit(X)
        self.estimator.fit(self.poly_transform.transform(X), y)

    def predict(self, X):
        return self.estimator.predict(self.poly_transform.transform(X))

    def get_params(self, deep=True):
        return {'degree': self.degree,
                'estimator': self.estimator}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self


class Pipeline:
    def __init__(self):
        self.scaler = None

    def regression_pipe(self, X_, y, fit_mode=False, final=True, show_shapes=True):
        X = X_.copy()

        if show_shapes:
            print(f'Initial data shape: {X.shape}')

        # dropping bitrate features
        X.drop(["bitrate_mean", "bitrate_std"], inplace=True, axis=1)

        # adding binary features
        X['dropped_frames_occured'] = X['dropped_frames_mean'].apply(lambda x: np.int8(x > 0))
        X['fps_std'] = X['fps_std'].apply(lambda x: x > 6)
        X['fps_unstable'] = X['fps_std'].apply(lambda x: x > 5)
        X['rtt_unstable'] = X['rtt_std'].apply(lambda x: x > 5)
        X['fps_anomaly'] = X['fps_mean'].apply(lambda x: not (25 < x < 35))
        X['rtt_anomaly'] = X['rtt_mean'].apply(lambda x: x > 50)

        # adding logarithmic features
        X["fps_std_log"] = X["fps_std"].apply(lambda x: np.log(x) if x > 2 else x)
        X.drop(["fps_std"], axis=1, inplace=True)
        X['rtt_std_log'] = X['rtt_std'].apply(lambda x: np.log(x) if x > 2 else x)
        X.drop(["dropped_frames_mean"], axis=1, inplace=True)

        if fit_mode:
            X["target"] = y

            # dropping duplicates
            X.drop_duplicates(inplace=True)

            # dropping outliers
            X = X.loc[X["rtt_std"] < 1000]
            X = X.loc[(X["fps_mean"] < 80)]
            X = X.loc[X["rtt_mean"] < 600]

        X.drop(["dropped_frames_std", 'dropped_frames_max'], inplace=True, axis=1)

        if final:
            X.drop(["fps_unstable"], inplace=True, axis=1)
            X.drop(["fps_std_log"], inplace=True, axis=1)

        if "target" in X.columns:
            y = X["target"]
            X = X.drop("target", axis=1)

        if fit_mode:
            self.scaler = MinMaxScaler().fit(X)
        X = pd.DataFrame(self.scaler.transform(X), columns=X.columns, index=X.index)

        if show_shapes:
            print(f'Out data shape: {X.shape}')

        if type(y) != bool:
            return X, y
        return X


In [85]:
pipeline = Pipeline()
_X_train = bitrate_reg_train.drop(["target"], axis=1)
x_train_pipe, y_train = pipeline.regression_pipe(_X_train,
                                                 bitrate_reg_train["target"],
                                                 fit_mode=True,
                                                 final=False)

Initial data shape: (379021, 9)
Out data shape: (373472, 10)


In [86]:
model = LinearRegression(fit_intercept=True).fit(x_train_pipe, y_train)
y_train_pred_r = model.predict(x_train_pipe)

print(f'r2_score: {r2_score(y_train, y_train_pred_r)}')
print(f'root mean squared error: {mean_squared_error(y_train, y_train_pred_r, squared=False)}')

r2_score: 0.10501965582705142
root mean squared error: 5741.341937583855


In [87]:
x_train_pipe, y_train_r = pipeline.regression_pipe(_X_train, bitrate_reg_train["target"], fit_mode=True)

model = LinearRegression(fit_intercept=True).fit(x_train_pipe, y_train_r)
y_train_pred_r = model.predict(x_train_pipe)

print(f'r2_score: {r2_score(y_train_r, y_train_pred_r)}')
print(f'root mean squared error: {mean_squared_error(y_train_r, y_train_pred_r, squared=False)}')

Initial data shape: (379021, 9)
Out data shape: (373472, 8)
r2_score: 0.10501878388273789
root mean squared error: 5741.344734365521


In [88]:
metrics = {
    "r2_score" : r2_score,
    "Root Squared Mean Error" : lambda x, y: mean_squared_error(x, y, squared=False)
}

models = {
    "Simple Linear" : LinearRegression(),
    "Ridge_0.1" : Ridge(0.1),
    "Ridge_0.4" : Ridge(0.4),
    "Ridge_0.7" : Ridge(0.7),
    "Ridge_1.0" : Ridge(1.0),
    "Lasso_0.1" : Lasso(0.1),
    "Lasso_0.4" : Lasso(0.4),
    "Lasso_0.7" : Lasso(0.7),
    "Lasso_1.0" : Lasso(1.0),
}

x_train_r_piped, y_train_r = pipeline.regression_pipe(bitrate_reg_train.drop("target", axis=1), bitrate_reg_train["target"], fit_mode=True)
for n in models:
    scores = evaluate_model(models[n], x_train_r_piped, y_train_r, metrics=metrics, fit_model=True)[0]
    print(scores)

Initial data shape: (379021, 9)
Out data shape: (373472, 8)
{'r2_score': 0.10501878388273789, 'Root Squared Mean Error': 5741.344734365521}
{'r2_score': 0.1050187838407517, 'Root Squared Mean Error': 5741.344734500192}
{'r2_score': 0.10501878321155855, 'Root Squared Mean Error': 5741.344736518343}
{'r2_score': 0.10501878182909763, 'Root Squared Mean Error': 5741.344740952618}
{'r2_score': 0.10501877969539297, 'Root Squared Mean Error': 5741.344747796524}
{'r2_score': 0.10501820609642343, 'Root Squared Mean Error': 5741.3465876278115}
{'r2_score': 0.10500952822098619, 'Root Squared Mean Error': 5741.374422031873}
{'r2_score': 0.10499032734673963, 'Root Squared Mean Error': 5741.436008618826}
{'r2_score': 0.10496066774554791, 'Root Squared Mean Error': 5741.53114015579}


In [89]:
metrics = {
    "r2_score": r2_score,
    "Root Squared Mean Error": lambda x, y: mean_squared_error(x, y, squared=False),
    "Mean Absolute Error": mean_absolute_error
}

models = {
    "Poly_1": Polynomial(1),
    "Poly_2": Polynomial(2),
    "Poly_3": Polynomial(3),
    "Poly_4": Polynomial(4),
    "Poly_5": Polynomial(5)
}

x_train_r_piped, y_train_r = pipeline.regression_pipe(bitrate_reg_train.drop("target", axis=1),
                                                      bitrate_reg_train["target"], fit_mode=True)
for n in models:
    scores = evaluate_model(models[n], x_train_r_piped, y_train_r, metrics=metrics, fit_model=True)[0]
    print(scores)

Initial data shape: (379021, 9)
Out data shape: (373472, 8)
{'r2_score': 0.105018783882738, 'Root Squared Mean Error': 5741.34473436552, 'Mean Absolute Error': 4443.309825613737}
{'r2_score': 0.12825060571056524, 'Root Squared Mean Error': 5666.338190367393, 'Mean Absolute Error': 4363.889152009785}
{'r2_score': 0.1423409998792613, 'Root Squared Mean Error': 5620.35812003486, 'Mean Absolute Error': 4306.682076511484}
{'r2_score': 0.15603390440894138, 'Root Squared Mean Error': 5575.311851549453, 'Mean Absolute Error': 4255.82257539302}
{'r2_score': 0.16558648722429947, 'Root Squared Mean Error': 5543.669471728776, 'Mean Absolute Error': 4233.406325885124}


In [90]:
class Pipeline:
    def __init__(self):
        self.scaler_c = None
        self.pca_c = None

    def class_pipeline(self, X_, y=False, fit_mode=False, final=True, show_shapes=False, with_pca=False):
        X = X_.copy()

        if show_shapes:
            print(f"Input shape: {X.shape}")

        # dropping features highly correlated with other features
        X.drop(["dropped_frames_max"], inplace=True, axis=1)

        # encoding categorical features
        X["auto_bitrate_state"] = X["auto_bitrate_state"].apply(lambda x: x != "off")
        X["auto_fec_state"] = X["auto_fec_state"].apply(lambda x: x != "off")

        # adding binary features
        X['dropped_frames_occured'] = X['dropped_frames_mean'].apply(lambda x: np.int8(x > 0))
        X['fps_unstable'] = X['fps_std'].apply(lambda x: x > 5)
        X['rtt_unstable'] = X['rtt_std'].apply(lambda x: x > 5)
        X['fps_anomaly'] = X['fps_mean'].apply(lambda x: not (20 < x < 30))
        X['rtt_anomaly'] = X['rtt_mean'].apply(lambda x: x > 50)
        X['fps_lag_huge'] = X['fps_lags'].apply(lambda x: x > 2)
        X['auto_fec_mean_high'] = X['auto_fec_mean'].apply(lambda x: x > 50)

        if fit_mode:
            X["stream_quality"] = y
            X.drop_duplicates(inplace=True)

            X = X.loc[X["fps_std"] < 30]
            X = X.loc[X["fps_mean"] < 80]
            X = X.loc[X["rtt_mean"] < 600]
            X = X.loc[X["rtt_std"] < 1000]
            X = X.loc[X["dropped_frames_std"] < 100]

        if final:
            X.drop(["dropped_frames_mean"], axis=1, inplace=True)
            X.drop(["rtt_unstable"], axis=1, inplace=True)

        features_pca = ["fps_unstable", "fps_anomaly", "rtt_anomaly", "fps_lag_huge", "auto_fec_mean_high",
                        "dropped_frames_std", "auto_bitrate_state", "auto_fec_state", "auto_fec_mean"]

        if with_pca and fit_mode:
            self.pca_c = PCA(1).fit(X[features_pca])

        if with_pca:
            X["Other_feats"] = self.pca_c.transform(X[features_pca])
            X.drop(features_pca, axis=1, inplace=True)

        if "stream_quality" in X.columns:
            y = X["stream_quality"]
            X.drop("stream_quality", axis=1, inplace=True)

        if fit_mode:
            self.scaler_c = MinMaxScaler().fit(X)

        X = pd.DataFrame(self.scaler_c.transform(X), columns=X.columns, index=X.index)

        if show_shapes:
            print(f"Output shape: {X.shape}")

        if type(y) != bool:
            return X, y
        return X



In [91]:
pipeline = Pipeline()
x_train_c_piped, y_train_c = pipeline.class_pipeline(stream_class_train.drop("stream_quality", axis=1),
                                                     stream_class_train["stream_quality"], fit_mode=True,
                                                     show_shapes=True)

Input shape: (406572, 11)
Output shape: (371028, 15)


In [92]:
model = LogisticRegression().fit(x_train_c_piped, y_train_c)
pd.DataFrame(model.coef_, columns=x_train_c_piped.columns).T

Unnamed: 0,0
fps_mean,-2.499149
fps_std,1.628067
fps_lags,5.498069
rtt_mean,0.439947
rtt_std,1.141372
dropped_frames_std,0.518376
auto_bitrate_state,0.235656
auto_fec_state,0.594924
auto_fec_mean,-0.359755
dropped_frames_occured,1.0274


In [93]:
x_train_c_piped, y_train_c = pipeline.class_pipeline(stream_class_train.drop("stream_quality", axis=1),
                                                     stream_class_train["stream_quality"], fit_mode=True, with_pca=True)
model = LogisticRegression().fit(x_train_c_piped, y_train_c)
pd.DataFrame(model.coef_, columns=x_train_c_piped.columns).T

Unnamed: 0,0
fps_mean,-1.689172
fps_std,1.877139
fps_lags,5.79516
rtt_mean,1.141644
rtt_std,0.723113
dropped_frames_occured,1.089065
Other_feats,0.485232


In [94]:
x = x_train_c_piped
y = y_train_c

threshold = 0.5

model = LogisticRegression(max_iter=1000, class_weight="balanced").fit(x_train_c_piped, y_train_c)
y_pred = model.predict_proba(x)[:,0] < threshold

print(f'accuracy score: {accuracy_score(y_train_c, y_pred)}')
print(f'precision score: {precision_score(y_train_c, y_pred, average="weighted")}')
print(f'recall score: {recall_score(y_train_c, y_pred, average="weighted")}')
print(f'f1_score score: {f1_score(y_train_c, y_pred, average="weighted")}')
print(f'precision score unweighted: {precision_score(y_train_c, y_pred)}')

accuracy score: 0.8840141444850523
precision score: 0.9194280000785652
recall score: 0.8840141444850523
f1_score score: 0.8991843629006989
precision score unweighted: 0.2578346892600972


In [95]:
threshold = 0.45

models_final = {
    "Logistic_l2": Classifier(LogisticRegression(penalty="l2", max_iter=100, class_weight='balanced'), threshold),
    "Ridge": RidgeClassifier(max_iter=100, class_weight='balanced'),
    "Polynomial_d4": Polynomial(4, estimator=Classifier(LogisticRegression(max_iter=100, class_weight='balanced'),
                                                        threshold)),
    "Polynomial_d5": Polynomial(5, estimator=Classifier(LogisticRegression(max_iter=100, class_weight='balanced'),
                                                        threshold)),
    "Polynomial_d7": Polynomial(7, estimator=Classifier(LogisticRegression(max_iter=100, class_weight='balanced'),
                                                        threshold - 0.1))
}

metrics = {
    "accuracy score": lambda x, y: accuracy_score(x, y),
    "precision weighted score": lambda x, y: precision_score(x, y, average="weighted"),
    "recall_score": lambda x, y: recall_score(x, y),
    "f1_score": lambda x, y: f1_score(x, y)
}

x_test_c_piped, y_test_c = pipeline.class_pipeline(stream_class_train.drop("stream_quality", axis=1),
                                                   stream_class_train["stream_quality"],
                                                   with_pca=True)

for n in models_final:
    scores = evaluate_model(models_final[n], x_train_c_piped, y_train_c, x_test_c_piped, y_test_c, fit_model=True,
                            metrics=metrics)[0]
    print(scores)

{'accuracy score': 0.9005907932666293, 'precision weighted score': 0.9231406030372495, 'recall_score': 0.5430768125314364, 'f1_score': 0.4279183003297994}
{'accuracy score': 0.8923610086282381, 'precision weighted score': 0.9216657993618442, 'recall_score': 0.5502263418840267, 'f1_score': 0.41173228663987205}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'accuracy score': 0.8867310095136901, 'precision weighted score': 0.9225636131011863, 'recall_score': 0.5766328950204785, 'f1_score': 0.4107380489302897}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'accuracy score': 0.884704308216011, 'precision weighted score': 0.9227763160275648, 'recall_score': 0.5843931881871093, 'f1_score': 0.4096816441668346}


KeyboardInterrupt: 