In [4]:
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# Read the clip-level label table from disk into `df`.
labels_csv_path = 'data/clip_labels.csv'
df = pd.read_csv(labels_csv_path)

# Show the first rows as the cell output for a quick look at the columns.
df.head()

Unnamed: 0,Show,EpId,ClipId,Start,Stop,Unsure,PoorAudioQuality,Prolongation,Block,SoundRep,WordRep,DifficultToUnderstand,Interjection,NoStutteredWords,NaturalPause,Music,NoSpeech
0,FluencyBank,10,0,88960,136960,0,0,0,1,0,0,0,0,2,0,0,0
1,FluencyBank,10,1,1271520,1319520,0,0,0,0,0,0,0,0,3,0,0,0
2,FluencyBank,10,2,1813760,1861760,0,0,1,0,0,0,0,0,2,0,0,0
3,FluencyBank,10,3,1842720,1890720,0,0,1,0,0,0,0,0,2,1,0,0
4,FluencyBank,10,4,1893280,1941280,0,0,0,0,0,3,0,0,0,0,0,0


In [6]:
# Label columns present in the raw table; used to decide which rows to keep.
initial_columns_to_check = ['Unsure', 'PoorAudioQuality', 'Prolongation', 'Block', 'SoundRep', 'WordRep',
                    'DifficultToUnderstand', 'Interjection', 'NoStutteredWords', 'NaturalPause',
                    'Music', 'NoSpeech']

# Drop every row in which any label column holds a 1 or a 2, and every row
# whose Music value is 3.
# `.copy()` makes the result an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning and never touch `df`.
has_one_or_two = df[initial_columns_to_check].isin([1, 2]).any(axis=1)
is_music_three = df['Music'] == 3
filtered_data = df.loc[~has_one_or_two & ~is_music_three].copy()

# Combine SoundRep and WordRep into a single Rep column with a bitwise OR.
# NOTE(review): this yields "3 if either column is 3, else 0" only when the
# remaining values are exactly 0 or 3 -- confirm the labels never exceed 3.
filtered_data['Rep'] = filtered_data['SoundRep'] | filtered_data['WordRep']

# The two source columns are now redundant; drop them without mutating in place.
filtered_data = filtered_data.drop(columns=['SoundRep', 'WordRep'])

# Label columns after the merge (Rep replaces SoundRep/WordRep).
final_columns_to_check = ['Unsure', 'PoorAudioQuality', 'Prolongation', 'Block', 'Rep',
                    'DifficultToUnderstand', 'Interjection', 'NoStutteredWords', 'NaturalPause',
                    'Music', 'NoSpeech']

# Per-row total across the final label columns.
filtered_data['sum_columns'] = filtered_data[final_columns_to_check].sum(axis=1)

In [7]:
# Preview the first rows of the filtered frame, which now carries the merged
# `Rep` column and the per-row `sum_columns` total.
filtered_data.head()

Unnamed: 0,Show,EpId,ClipId,Start,Stop,Unsure,PoorAudioQuality,Prolongation,Block,DifficultToUnderstand,Interjection,NoStutteredWords,NaturalPause,Music,NoSpeech,Rep,sum_columns
1,FluencyBank,10,1,1271520,1319520,0,0,0,0,0,0,3,0,0,0,0,3
4,FluencyBank,10,4,1893280,1941280,0,0,0,0,0,0,0,0,0,0,3,3
6,FluencyBank,10,6,1982720,2030720,0,0,0,0,0,0,3,0,0,0,0,3
8,FluencyBank,10,8,2047520,2095520,0,0,0,0,0,0,3,0,0,0,0,3
13,FluencyBank,10,13,2801440,2849440,0,0,0,0,0,0,3,0,0,0,0,3


In [13]:
# create a new column called y
# go through each row. if the sum_columns value for that row is 3, the y value for that row is the name of the column that has a value of 3
# if the sum_columns value for that row is 6, two columns have a value of 3; resolve that pair with the following rules:
# if the pair is ('Interjection', 'NoStutteredWords'), then the y value is Interjection
# if the pair is ('Rep', 'Interjection'), then the y value is Rep
# if the pair is ('NoStutteredWords', 'NaturalPause'), then the y value is NaturalPause
# if the pair is ('Prolongation', 'Interjection'), then the y value is Prolongation
# if the pair is ('NoStutteredWords', 'NoSpeech'), then the y value is NoSpeech
# if the pair is ('Prolongation', 'Rep'), then the y value is Unsure
# if the pair is ('DifficultToUnderstand', 'NoStutteredWords'), then the y value is Unsure
# if the pair is ('Block', 'Interjection'), then the y value is Unsure
# if the pair is ('Prolongation', 'NaturalPause'), then the y value is Unsure
# if the pair is ('Block', 'Rep'), then the y value is Unsure
# if the pair is ('PoorAudioQuality', 'NoStutteredWords'), then the y value is Unsure
# if the pair is ('Rep', 'DifficultToUnderstand'), then the y value is Unsure
# if the pair is ('Block', 'DifficultToUnderstand'), then the y value is Unsure
# if the pair is ('PoorAudioQuality', 'Prolongation'), then the y value is Unsure
# if the pair is ('Prolongation', 'DifficultToUnderstand'), then the y value is Unsure
# if the pair is ('Rep', 'NaturalPause'), then the y value is Unsure
# if the pair is ('Prolongation', 'Block'), then the y value is Unsure
# if the sum_columns value for that row is 9, then the y value for that row is 'Interjection'


In [10]:
# Keep only the rows whose label total (sum_columns) equals 6.
is_sum_six = filtered_data['sum_columns'].eq(6)
df_filtered = filtered_data.loc[is_sum_six]

print(df_filtered.shape)

# Column-wise totals of each final label column within that subset.
summed_values = df_filtered.loc[:, final_columns_to_check].sum()

print(summed_values)

(790, 17)
Unsure                      0
PoorAudioQuality            9
Prolongation              171
Block                      27
Rep                       333
DifficultToUnderstand      27
Interjection             1983
NoStutteredWords         1875
NaturalPause              234
Music                       0
NoSpeech                   81
dtype: int64


In [12]:
from itertools import combinations
from collections import Counter

# Columns to inspect: the same label set used to build `sum_columns`.
# Reusing `final_columns_to_check` keeps the two lists from drifting apart.
columns_to_consider = list(final_columns_to_check)

# Boolean matrix, one row per clip in `df_filtered`: True where the label
# equals 3. Comparing the whole block at once avoids `iterrows()`, which
# builds an object-dtype Series for every row.
equals_three = df_filtered[columns_to_consider].eq(3).to_numpy()

# Count every unordered pair of columns that are both 3 in the same row.
# Rows are visited in frame order and columns in `columns_to_consider` order,
# so pair orientation and tie ordering in the printed Counter are unchanged.
# `combinations(..., 2)` yields nothing for rows with fewer than two hits,
# so no explicit length guard is needed.
pair_counts = Counter(
    pair
    for row_hits in equals_three
    for pair in combinations(
        [col for col, hit in zip(columns_to_consider, row_hits) if hit], 2
    )
)

# Output the counts of each pair
print(pair_counts)


Counter({('Interjection', 'NoStutteredWords'): 517, ('Rep', 'Interjection'): 98, ('NoStutteredWords', 'NaturalPause'): 73, ('Prolongation', 'Interjection'): 42, ('NoStutteredWords', 'NoSpeech'): 27, ('Prolongation', 'Rep'): 8, ('DifficultToUnderstand', 'NoStutteredWords'): 6, ('Block', 'Interjection'): 4, ('Prolongation', 'NaturalPause'): 4, ('Block', 'Rep'): 3, ('PoorAudioQuality', 'NoStutteredWords'): 2, ('Rep', 'DifficultToUnderstand'): 1, ('Block', 'DifficultToUnderstand'): 1, ('PoorAudioQuality', 'Prolongation'): 1, ('Prolongation', 'DifficultToUnderstand'): 1, ('Rep', 'NaturalPause'): 1, ('Prolongation', 'Block'): 1})
