In [19]:
import io
import pandas as pd

file_path = '/home/cbolanos/datasets/librispeech-raw/SPEAKERS.TXT'

with open(file_path, 'r') as file:
    content = file.read()

content_io = io.StringIO(content)

# Skip the comments and header
lines = [line.strip() for line in content_io if not line.startswith(';') and line.strip()]

# Process each line manually
data = []
for line in lines:
    parts = line.split('|')
    if len(parts) >= 5:  # Ensure we have at least 5 parts
        id_, sex, subset, minutes, name = parts[0], parts[1], parts[2], parts[3], '|'.join(parts[4:])
        data.append([id_.strip(), sex.strip(), subset.strip(), minutes.strip(), name.strip()])

# Create DataFrame
df = pd.DataFrame(data, columns=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])

# Clean up the data
df['ID'] = df['ID'].astype(int)
df['MINUTES'] = df['MINUTES'].astype(float)

# Display the DataFrame
print(df)

        ID SEX           SUBSET  MINUTES              NAME
0       14   F  train-clean-360    25.03   Kristin LeMoine
1       16   F  train-clean-360    25.11    Alys AtteWater
2       17   M  train-clean-360    25.04    Gord Mackenzie
3       19   F  train-clean-100    25.19  Kara Shallenberg
4       20   F  train-other-500    30.07            Gesine
...    ...  ..              ...      ...               ...
2479  8975   F  train-clean-100    25.11       Daisy Flaim
2480  9000   M  train-other-500    27.26   Ramon Escamilla
2481  9022   F  train-clean-360    25.17          Claire M
2482  9023   F  train-clean-360    25.19      P. J. Morgan
2483  9026   F  train-clean-360    21.75      Tammy Porter

[2484 rows x 5 columns]


In [20]:
df_devclean = df[df['SUBSET'] == 'dev-clean']

In [36]:
df_devclean['ID'] = df_devclean['ID'].astype(str)
df_devclean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_devclean['ID'] = df_devclean['ID'].astype(str)


Unnamed: 0,ID,SEX,SUBSET,MINUTES,NAME
46,84,F,dev-clean,8.02,Christie Nowak
93,174,M,dev-clean,8.04,Peter Eastman
135,251,M,dev-clean,8.04,Mark Nelson
200,422,M,dev-clean,8.38,President Lethe
279,652,M,dev-clean,8.31,Scott Walter
318,777,M,dev-clean,8.06,fling93
470,1272,M,dev-clean,8.02,John Rose
529,1462,F,dev-clean,8.04,E. Tavano
595,1673,F,dev-clean,8.07,Tonia
681,1919,F,dev-clean,8.17,nprigoda


In [31]:
import json

with open('alignments/audio_alignments_10_16.json', 'r') as file:
    data = json.load(file)  

def process_transcripts(data):
    result = {}
    for key, value in data.items():
        # Extract the ID from the key (assuming it's always the first number in the path)
        id = key.split('/')[4]
        
        # Count words in the transcript
        word_count = len(value['transcript_audio'].split())
        if id in result:
            result[id] += word_count
        else:
            # If it's a new key, create a new list
            result[id] = word_count
    
    return result

# Process the data
result = process_transcripts(data)

df_words = pd.DataFrame(list(result.items()), columns=['ID', 'Cantidad de Palabras Dichas'])
df_words['ID'] = df_words['ID'].astype(str)

# Print the DataFrame
print(df_words)

      ID  Cantidad de Palabras Dichas
0    777                         1492
1   1988                         1456
2   2086                         1413
3   3000                         1233
4   2902                         1308
5   1462                         1366
6   3170                         1384
7   5536                         1501
8   7850                         1259
9   1272                         1150
10  3576                         1301
11  6313                         1598
12  2412                         1568
13  8297                         1150
14  6295                         1262
15  2078                         1140
16  1993                         1412
17  7976                         1449
18  2035                         1378
19  1673                         1392
20  6241                         1517
21   422                         1286
22   174                         1160
23  3536                         1531
24  2277                         1592
25  5895    

In [30]:
import json

# Read the JSON file
with open('words_in_order1.json', 'r') as file:
    data = json.load(file)

# Create a new dictionary
new_dict = {}

# Process each key-value pair in the original dictionary
for key, value in data.items():
    # Extract the first element of the key (before the first hyphen)
    new_key = key.split('-')[0]
    
    # If the key already exists in the new dictionary, extend the list
    if new_key in new_dict:
        new_dict[new_key].extend(value)
    else:
        # If it's a new key, create a new list
        new_dict[new_key] = value

count_dict = {}

# Count the length of each value in the new dictionary
for key, value in new_dict.items():
    count_dict[key] = len(value)

# Print the count dictionary
df_words_obtenidas = pd.DataFrame(list(count_dict.items()), columns=['ID', 'Cantidad de Palabras Obtenidas'])
df_words_obtenidas['ID'] = df_words_obtenidas['ID'].astype(str)

# Print the DataFrame
print(df_words_obtenidas)


      ID  Cantidad de Palabras Obtenidas
0   1272                            1035
1   1462                            1186
2   1673                            1083
3    174                             960
4   1919                            1095
5   1988                            1270
6   1993                            1213
7   2035                            1204
8   2078                             929
9   2086                            1245
10  2277                            1435
11  2412                            1346
12  2428                            1218
13   251                            1065
14  2803                            1001
15  2902                            1050
16  3000                            1031
17  3081                            1070
18  3170                            1149
19  3536                            1318
20  3576                            1084
21  3752                            1146
22  3853                            1281
23   422        

In [38]:
merged_df = pd.merge(df_devclean, df_words, on='ID', how='inner')
merged_df_final = pd.merge(merged_df, df_words_obtenidas, on='ID', how='inner')
merged_df_final = merged_df_final[['ID', 'SEX', 'Cantidad de Palabras Dichas', 'Cantidad de Palabras Obtenidas']]

In [43]:
print(f"Cantidad de palabras obtenidas en total: {merged_df_final['Cantidad de Palabras Obtenidas'].sum()}") 
print(f"Cantidad de palabras obtenidas por mujeres: {merged_df_final[merged_df_final['SEX'] == 'F']['Cantidad de Palabras Obtenidas'].sum()}") 
print(f"Cantidad de palabras obtenidas por hombres: {merged_df_final[merged_df_final['SEX'] == 'M']['Cantidad de Palabras Obtenidas'].sum()}") 
print(f"Cantidad de palabras dichas en total: {merged_df_final['Cantidad de Palabras Dichas'].sum()}") 
print(f"Cantidad de palabras dichas por mujeres: {merged_df_final[merged_df_final['SEX'] == 'F']['Cantidad de Palabras Dichas'].sum()}") 
print(f"Cantidad de palabras dichas por hombres: {merged_df_final[merged_df_final['SEX'] == 'M']['Cantidad de Palabras Dichas'].sum()}") 


Cantidad de palabras obtenidas en total: 46906
Cantidad de palabras obtenidas por mujeres: 24310
Cantidad de palabras obtenidas por hombres: 22596
Cantidad de palabras dichas en total: 54402
Cantidad de palabras dichas por mujeres: 28075
Cantidad de palabras dichas por hombres: 26327
