In [1]:
# Import the required modules
import csv
import json
import os
import time

import requests
from azure.ai.textanalytics import TextAnalyticsClient
from azure.ai.textanalytics import TextAnalyticsApiVersion
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError

# Azure Text Analytics and Translator credentials.
# Secrets are read from the environment so they are never stored in the
# notebook or its version history. A missing key raises KeyError immediately
# instead of failing later with an authentication error.
# NOTE(review): subscription keys that were previously committed in this cell
# should be treated as compromised and rotated in the Azure portal.
text_analytics_key = os.environ["AZURE_TEXT_ANALYTICS_KEY"]
text_analytics_endpoint = os.environ.get(
    "AZURE_TEXT_ANALYTICS_ENDPOINT",
    "https://azureapi-language.cognitiveservices.azure.com/",
)
translator_key = os.environ["AZURE_TRANSLATOR_KEY"]
translator_endpoint = os.environ.get(
    "AZURE_TRANSLATOR_ENDPOINT",
    "https://api.cognitive.microsofttranslator.com/",
)
# Region of the Translator resource; sent with every Translator request.
translator_region = os.environ.get("AZURE_TRANSLATOR_REGION", "eastus")

# Create the Text Analytics client (api_version is left at the SDK default).
text_analytics_client = TextAnalyticsClient(
    endpoint=text_analytics_endpoint,
    credential=AzureKeyCredential(text_analytics_key),
)

# The Translator service is called over plain REST; a Session carries the
# authentication headers on every request.
translator_client = requests.Session()
translator_client.headers.update({
    'Ocp-Apim-Subscription-Key': translator_key,
    'Ocp-Apim-Subscription-Region': translator_region,
    'Content-type': 'application/json'
})

# Define the input and output files
input_file = "transcripts.json"
output_file = "output_final_updeate.csv"

# Define the rate limit for the free tier (5 requests per second)
rate_limit = 5

# Define a helper function to translate a text to English
def translate_to_english(text, timeout=30):
    """Translate ``text`` to English using the Translator REST endpoint.

    The request is sent through the module-level ``translator_client`` session
    to ``translator_endpoint``; the source language is not specified in the
    request.

    Args:
        text: The text to translate. Empty or whitespace-only input is
            returned unchanged without calling the service.
        timeout: Seconds to wait for the service before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The first translation of ``text`` reported by the service.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
        KeyError, IndexError, TypeError: If the response body does not have
            the expected ``[{"translations": [{"text": ...}]}]`` shape.
    """
    # Nothing to translate: skip the round trip.
    if not text or not str(text).strip():
        return text

    # The configured endpoint may or may not end with '/', so normalise it
    # rather than producing '//translate'.
    url = translator_endpoint.rstrip('/') + '/translate'
    response = translator_client.post(
        url,
        params={'api-version': '3.0', 'to': 'en'},
        json=[{'text': text}],
        timeout=timeout,
    )
    # Surface HTTP errors explicitly instead of failing later while indexing
    # into an error payload.
    response.raise_for_status()

    result = response.json()
    return result[0]['translations'][0]['text']

# Open the input file and load the JSON data with error handling
try:
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
except (IOError, json.JSONDecodeError) as e:
    print(f'Error reading input file: {e}')
    data = {}

# The loop below iterates data.items(), so any JSON root other than an object
# cannot be processed; report it instead of crashing with AttributeError.
if not isinstance(data, dict):
    print(f'Error reading input file: expected a JSON object, got {type(data).__name__}')
    data = {}

# Pause applied after every Text Analytics request. Each item issues two such
# requests (language detection and key-phrase extraction), so sleeping only
# once per item would allow twice the configured rate.
request_interval = 1 / rate_limit

# Failures the translation step can raise: transport/HTTP errors from requests,
# or an unexpected response shape / undecodable body.
translation_errors = (requests.RequestException, KeyError, IndexError, TypeError, ValueError)

# Open the output file and create a CSV writer with error handling
try:
    with open(output_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # Write the header row
        writer.writerow(['id', 'text', 'language', 'key_phrases'])

        # Loop through the data items (doc_id rather than id, so the builtin
        # id() is not clobbered in the kernel namespace)
        for doc_id, text in data.items():
            # Detect the language of the text. The service can report a
            # per-document error inside an otherwise successful response, so
            # check is_error before reading the result fields.
            language = 'Unknown'
            try:
                detection = text_analytics_client.detect_language(documents=[text])[0]
                if detection.is_error:
                    print(f'Error detecting language for id {doc_id}: {detection.error.message}')
                else:
                    language = detection.primary_language.name
            except HttpResponseError as e:
                print(f'Error detecting language for id {doc_id}: {e}')
            time.sleep(request_interval)

            # If the language is not English (or could not be detected),
            # translate the text to English. A failed translation keeps the
            # original text so the remaining items are still processed.
            if language != 'English':
                try:
                    text = translate_to_english(text)
                except translation_errors as e:
                    print(f'Error translating text for id {doc_id}: {e}')

            # Extract the key phrases from the text
            key_phrases = ''
            try:
                extraction = text_analytics_client.extract_key_phrases(documents=[text])[0]
                if extraction.is_error:
                    print(f'Error extracting key phrases for id {doc_id}: {extraction.error.message}')
                else:
                    key_phrases = ', '.join(extraction.key_phrases)
            except HttpResponseError as e:
                print(f'Error extracting key phrases for id {doc_id}: {e}')
            time.sleep(request_interval)

            # Write the output row
            try:
                writer.writerow([doc_id, text, language, key_phrases])
            except UnicodeEncodeError as e:
                print(f'Error writing row to output file for id {doc_id}: {e}')
except (IOError, csv.Error) as e:
    print(f'Error writing to the output file: {e}')


In [3]:
import csv
import re

# Source CSV to clean, and destination CSV for the cleaned rows
input_file, output_file = "output_final_updeate.csv", "cleaned_output_file.csv"

# Function to clean text by removing random signs
def clean_text(text):
    """Strip every character that is not an ASCII letter, a digit, or whitespace.

    Args:
        text: The string to clean. ``None`` (which ``csv.DictReader`` yields
            for fields missing from a short row) is treated as empty text.

    Returns:
        ``text`` with all characters outside ``[a-zA-Z0-9]`` and whitespace
        removed; ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""
    # Negated character class: anything NOT a letter, digit or whitespace is dropped.
    unwanted_chars = r"[^a-zA-Z0-9\s]"
    return re.sub(unwanted_chars, "", text)

# Read the input CSV file and create the cleaned data
cleaned_data = []

# newline='' lets the csv module handle line endings itself, so newlines
# embedded in quoted text fields are read back exactly as written.
with open(input_file, 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        # DictReader fills fields missing from a short row with None;
        # treat that as empty text rather than failing inside re.sub.
        raw_text = row['text'] or ''
        cleaned_data.append({'id': row['id'], 'text': clean_text(raw_text)})

# Write the cleaned data to the output CSV file
with open(output_file, 'w', encoding='utf-8', newline='') as f:
    fieldnames = ['id', 'text']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(cleaned_data)

print(f"Text cleaning and saving to {output_file} is complete.")


Text cleaning and saving to cleaned_output_file.csv is complete.
