# Score Transcriptions with IPA and no IPA
### Install python packages

In [13]:
!pip install Unidecode

Collecting Unidecode
  Downloading Unidecode-1.2.0-py2.py3-none-any.whl (241 kB)
[K     |████████████████████████████████| 241 kB 18.6 MB/s eta 0:00:01
[?25hInstalling collected packages: Unidecode
Successfully installed Unidecode-1.2.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


# Import Python Packages

In [21]:
import json
import pandas as pd
from io import StringIO
import boto3
import sys, os
from datetime import date, timedelta, datetime, timezone
import time 
import numpy as np
import random
import tarfile
import unidecode as uni

# Define functions

### Function to get file name from S3 buckets

In [22]:
def get_all_s3_objects(s3, **base_kwargs):
    """Yield every object entry of an S3 listing, following pagination.

    Each request asks for ``MaxKeys=1000``; as long as a response is flagged
    ``IsTruncated`` the next request is issued with the continuation token
    taken from that response.

    Args:
        s3: Client object exposing ``list_objects_v2``.
        **base_kwargs: Keyword arguments forwarded unchanged to every
            ``list_objects_v2`` call (e.g. ``Bucket``, ``Prefix``).

    Yields:
        The items of each response's ``Contents`` list, in page order.
        A response without ``Contents`` contributes nothing.
    """
    next_token = None
    while True:
        request = dict(MaxKeys=1000, **base_kwargs)
        if next_token:
            request['ContinuationToken'] = next_token
        page = s3.list_objects_v2(**request)
        for entry in page.get('Contents', []):
            yield entry
        if page.get('IsTruncated'):
            # More pages remain: remember where the next request resumes.
            next_token = page.get('NextContinuationToken')
        else:
            return

def get_folder_list(bucket='awstranscribe-tests', key='transcribeOutputs/Files'):
    """Return the ``s3://`` URI of every object listed under a bucket prefix.

    Args:
        bucket: Name of the S3 bucket to list.
        key: Prefix (folder path inside the bucket) to list; defaults to
            ``transcribeOutputs/Files``.

    Returns:
        A list of strings of the form ``s3://<bucket>/<object key>``, one per
        listed object, in listing order.
    """
    client = boto3.client('s3')
    return [
        's3://{}/{}'.format(bucket, entry['Key'])
        for entry in get_all_s3_objects(client, Bucket=bucket, Prefix=key)
    ]

### Function to create a .csv file from a Pandas DataFrame
Creates a file in a defined AWS S3 folder

In [23]:
def to_file(df, file_name=None):
    """Upload a DataFrame as a header-less CSV to the proc_files folder on S3.

    The object is written to bucket ``awstranscribe-tests`` under
    ``transcribeOutputs/proc_files/<file_name>.csv``.

    Args:
        df: pandas DataFrame to serialise (index and header are not written).
        file_name: Base name of the CSV object, without extension. When
            omitted, today's date is used. The date is resolved at call time;
            a ``date.today()`` default in the signature would be evaluated
            only once, when the function is defined, and go stale in a
            long-lived kernel.

    Returns:
        A human-readable string naming the object that was written.
    """
    if file_name is None:
        file_name = date.today()

    csv_buffer = StringIO()
    df.to_csv(csv_buffer, decimal='.', sep=',', encoding='utf-8', index=False, header=False)

    s3_resource = boto3.resource('s3')
    s3_resource.Object(
        'awstranscribe-tests', f'transcribeOutputs/proc_files/{file_name}.csv'
    ).put(Body=csv_buffer.getvalue())

    return f'Saved as file: awstranscribe-tests/transcribeOutputs/proc_files/{file_name}.csv'

### Function to neutralize words
Remove accent characters and lowercase the words

In [30]:
def neutralize(a_string):
    """Normalise text for comparison.

    The string is passed through ``unidecode``, every ``'?'`` character is
    removed from the result, and the remainder is lower-cased.

    Args:
        a_string: Text to normalise.

    Returns:
        The normalised, lower-case string.
    """
    return uni.unidecode(a_string).replace('?', '').lower()

### Function that calculates the percentage of similarity

In [25]:
def similarity_score(string_real, string_model, extra_info=False):
    """Fraction of the reference transcript's words found in the model transcript.

    Both strings are normalised with ``neutralize``, split on whitespace and
    stripped of leading/trailing punctuation. A reference word counts as
    found only when it equals a whole word of the model transcript. The
    previous ``word in <model string>`` check was a substring test, so "alo"
    was wrongly found inside "malo" and empty tokens always matched.

    Args:
        string_real: Reference transcript.
        string_model: Transcript to be scored against the reference.
        extra_info: When true, also return the reference words not found.

    Returns:
        The score as a float in ``[0, 1]``, or ``(score, error_words)`` when
        ``extra_info`` is true. A reference with no words scores ``1.0``
        (nothing was missed), as before.
    """
    import string  # local import: only needed for the punctuation table

    def _tokens(text):
        # split() with no argument collapses runs of whitespace, so no empty
        # tokens are produced; punctuation glued to a word ("dias.") is
        # stripped so it does not cause a spurious mismatch.
        stripped = (tok.strip(string.punctuation) for tok in neutralize(text).split())
        return [tok for tok in stripped if tok]

    words_real = _tokens(string_real)
    # Normalise the model transcript once (it used to be redone per word)
    # and use a set for whole-word membership tests.
    model_vocabulary = set(_tokens(string_model))

    error_words = [word for word in words_real if word not in model_vocabulary]

    if words_real:
        score = (len(words_real) - len(error_words)) / len(words_real)
    else:
        score = 1.0

    if extra_info:
        return score, error_words

    return score

### Function that calculates the percentage of similar words using DataFrames

In [26]:
def similarity_score_dataframes(_df_real, _df_model):
    """Score every model transcript against the reference transcript of the same file.

    Args:
        _df_real: DataFrame with columns ``file`` and ``transcript`` holding
            the reference transcripts.
        _df_model: DataFrame with the same columns holding the transcripts
            to be scored.

    Returns:
        A DataFrame with columns ``file``, ``score`` and ``failed_words``,
        one row per row of ``_df_model``.

    Raises:
        AttributeError: If a file of ``_df_model`` does not match exactly one
            row of ``_df_real``. The exception used to be constructed but
            never raised, so a missing file surfaced as an unrelated
            ``IndexError`` and duplicates were silently scored on the first
            match.
    """
    score_list = []

    for a_file, a_string in zip(_df_model['file'], _df_model['transcript']):
        # Reference transcript(s) registered for this file.
        real_str = _df_real.loc[_df_real['file'] == a_file, 'transcript']

        # Files must be 1:1 between both frames; anything else is an error.
        if len(real_str) != 1:
            raise AttributeError(f'Dude files are not 1:1 in {a_file}')

        score, fails = similarity_score(real_str.iloc[0], a_string, extra_info=True)
        score_list.append((a_file, score, fails))

    return pd.DataFrame(score_list, columns=['file', 'score', 'failed_words'])


# Testing executions

In [37]:
# List the S3 URIs under the IPA and no_IPA output folders.
# Despite the df_ prefix, these two names hold lists of 's3://...' strings, not DataFrames.
df_real = get_folder_list(key='transcribeOutputs/proc_files/IPA')
df_model = get_folder_list(key='transcribeOutputs/proc_files/no_IPA')

# Read the listed CSVs as (file, transcript) tables, skipping the first listing entry.
# NOTE(review): the first entry is presumably the folder placeholder key — confirm.
# NOTE(review): every iteration rebinds df_real_1, so only the LAST listed file is kept;
# if the listing has fewer than two entries, df_real_1 is never defined.
for df_real_ in df_real[1:]:
    df_real_1 = pd.read_csv(df_real_, names=['file', 'transcript'])

# Same pattern (and same caveats) for the no_IPA transcripts.
for df_model_ in df_model[1:]:
    df_model_1 = pd.read_csv(df_model_, names=['file', 'transcript'])

    
# Score each no_IPA transcript against the IPA transcript of the same file.
scr_list = similarity_score_dataframes(df_real_1, df_model_1)


# Spot check: print both transcripts of one specific file for comparison.
print(df_real_1.loc[df_real_1['file'] == '101233987_957841393_100174498730201118_20201118_124750_1001744989_1001744987_13139a7ae50fc843b1dadd0c_.json']['transcript'])
print(df_model_1.loc[df_model_1['file'] == '101233987_957841393_100174498730201118_20201118_124750_1001744989_1001744987_13139a7ae50fc843b1dadd0c_.json']['transcript'])



# NOTE(review): the upload of the scores is disabled below (the line also carries a stray 'Q').
#Qto_file(scr_list, file_name='Testing01')

43    alo? Buenos días. Habló con José? García Sepúl...
Name: transcript, dtype: object
43    malo. Buenos días. Habló con José? García Sepú...
Name: transcript, dtype: object
