# Task 1 Descriptive Statistics 

## Import and Initializing

In [1]:
import itertools

import numpy as np  # pip install numpy
import scipy.stats  # pip install scipy

import pandas as pd  # pip install pandas
import matplotlib.pyplot as plt  # pip install matplotlib
import seaborn as sns  # pip install seaborn

sns.set_context("notebook")
sns.set_style("whitegrid")

FIG_SIZE = (14, 4)

In [2]:
def ci_95(data):
    """https://stackoverflow.com/a/15034143"""
    confidence = 0.95
    n = len(data)
    std_err_of_measurement = scipy.stats.sem(data)
    ci = std_err_of_measurement * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    return ci

## Loading the data

In [3]:
sheets = pd.read_excel("../datasets/DB04_speech_quality_crowdsourcing_dataset.xlsx", sheet_name=None)
print("Sheets:")
list(sheets.keys())

Sheets:


['Description',
 'CS -Full',
 'CS-per file',
 'CS-per condition',
 'CS participants',
 'Lab-per condition']

In [4]:
prev_max_cols = pd.get_option('display.max_columns')
pd.set_option('display.max_columns', None)  
display(sheets["CS-per file"].head(3))
pd.set_option('display.max_columns', prev_max_cols)  

Unnamed: 0,files,Num_Ratings,condition,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P12,P13,P14,P15,P16,P17,P18,MOS,STD
0,D401_c01_ef01_s001.wav,9,1.0,5,4,5,5,5,5,5.0,5.0,4.0,,,,,,,,,,4.77778,0.440959
1,D401_c01_ef01_s002.wav,9,1.0,4,5,5,5,5,5,4.0,5.0,5.0,,,,,,,,,,4.77778,0.440959
2,D401_c01_ef01_s003.wav,10,1.0,5,5,4,5,5,5,5.0,5.0,5.0,4.0,,,,,,,,,4.8,0.421637


## Data preparation

In [5]:
sheet = sheets["CS-per file"]

def parse_condition(file_name):
    _, cond, _, _ = file_name.split("_")
    return cond

sheet["condition_filled"] = sheet.files.transform(parse_condition)

In [6]:
persons = [col for col in sheet.columns if col.startswith("P")]
columns = ["condition_filled"] + persons
all_conditions = sheet[columns]
all_conditions

Unnamed: 0,condition_filled,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P12,P13,P14,P15,P16,P17,P18
0,c01,5,4,5,5,5,5,5.0,5.0,4.0,,,,,,,,,
1,c01,4,5,5,5,5,5,4.0,5.0,5.0,,,,,,,,,
2,c01,5,5,4,5,5,5,5.0,5.0,5.0,4.0,,,,,,,,
3,c01,5,5,5,5,4,5,5.0,,,,,,,,,,,
4,c01,4,4,5,5,5,5,5.0,5.0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1147,c48,4,2,4,5,3,5,4.0,4.0,3.0,5.0,3.0,4.0,3.0,5.0,4.0,1.0,2.0,4.0
1148,c48,2,4,3,3,3,4,3.0,5.0,2.0,,,,,,,,,
1149,c48,4,4,3,3,3,4,4.0,4.0,4.0,,,,,,,,,
1150,c48,5,4,4,4,4,4,4.0,4.0,3.0,5.0,,,,,,,,


## Calculating MOS, std and 95% CI

In [7]:
by_condition = all_conditions.groupby("condition_filled")

def block_stats(data, columns, agg_fns, agg_names):
    selected = data[columns].to_numpy()
    selected = selected[~np.isnan(selected)]
    
    results = [fn(selected) for fn in agg_fns]
    return pd.Series(results, index=agg_names)

stats_names = ["mean", "std", "95% CI", "n"]
stats_fns = [np.mean, np.std, ci_95, len]

statistics = by_condition.apply(
    block_stats,
    columns=persons,
    agg_fns=stats_fns,
    agg_names=stats_names,
)

In [8]:
pre_calculated = sheets["CS-per condition"].set_index("condition")
pd.concat([statistics, pre_calculated], axis="columns", keys=["self calculated", "pre calculated"], sort=True)

Unnamed: 0_level_0,self calculated,self calculated,self calculated,self calculated,pre calculated,pre calculated,pre calculated,pre calculated
Unnamed: 0_level_1,mean,std,95% CI,n,MOS,STD,CI95,n
c01,4.836449,0.428406,0.057861,214.0,4.83645,0.42941,0.0484937,214
c02,1.541284,0.583128,0.078021,218.0,?,?,?,?
c03,4.351852,0.802944,0.107936,216.0,,,,
c04,2.78341,0.912838,0.122421,217.0,,,,
c05,3.537037,0.731892,0.098385,216.0,,,,
c06,4.623256,0.547653,0.073792,215.0,,,,
c07,3.665094,1.035278,0.140495,212.0,,,,
c08,3.933962,0.723838,0.09823,212.0,,,,
c09,3.117371,0.855981,0.115886,213.0,,,,
c10,4.518182,0.642889,0.085619,220.0,,,,
