In [3]:
import os 
import pandas as pd
from datasets import load_dataset

ds = load_dataset("jtatman/python-code-dataset-500k")

Downloading readme:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/212M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/135M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/559515 [00:00<?, ? examples/s]

In [2]:
# Define the directory containing the .txt files
directory = '../UniPy/LanguageData'

# Initialize an empty dictionary to store data
data = {}

# Loop through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith('.txt') and 'Key' in filename:
        # Extract the language from the filename
        language = filename.split('+')[0].strip('$')
        
        try:
            # Read the content of the file
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                content = file.read()
            
            # Split the content by new lines and append to the corresponding language key
            lines = content.split('\n')
            if language not in data:
                data[language] = []
            data[language].extend(lines)  # Use extend to add each line as a new entry

        except Exception as e:
            print(f"Error reading {filename}: {e}")

# Convert the dictionary to a DataFrame
# The DataFrame will be constructed such that each language's lines are in a separate column
df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data.items()]))

# Save the DataFrame to a CSV file
df.to_csv('segregated_data.csv', index=False)

print("CSV file 'segregated_data.csv' has been created successfully.")


CSV file 'segregated_data.csv' has been created successfully.


In [6]:
ds

DatasetDict({
    train: Dataset({
        features: ['output', 'instruction', 'system'],
        num_rows: 559515
    })
})

In [13]:
# Access the train dataset
train_dataset = ds['train']

['Here is an example of a nested loop in Python to print every combination of numbers between 0-9, excluding any combination that contains the number 5 or repeating digits:\n\n```python\nfor i in range(10):  # First digit\n    for j in range(10):  # Second digit\n        for k in range(10):  # Third digit\n            # Checking for the conditions\n            if i != 5 and j != 5 and k != 5 and i != j and i != k and j != k:\n                print(i, j, k)\n```\n\nThis code will generate and print every combination of three digits between 0-9 that do not contain the number 5 and do not have any repeating digits.',
 "The given problem can be solved by iterating through each cell of the matrix and converting the state of the cell into a string. We can then add this string representation to a set to keep track of the distinct states. Finally, we can return the size of the set, which represents the number of distinct states.\n\nHere's the correct code to solve the problem:\n\n```python\nde

In [16]:
import re
import pandas as pd

# Function to extract Python code blocks from text
def extract_python_code_blocks(text):
    # Regular expression pattern to match text between ```python and ```
    pattern = r'```python(.*?)```'
    
    # Find all matches in the text
    matches = re.findall(pattern, text, re.DOTALL)
    
    # Return the extracted code blocks
    return [match.strip() for match in matches]

# Convert the list of strings into a DataFrame
df = pd.DataFrame(train_dataset, columns=['output'])

# Extract Python code blocks from each element in the DataFrame
df['extracted_code'] = df['output'].apply(extract_python_code_blocks)

# Flatten the list of extracted code blocks and create a new DataFrame
flattened_code_blocks = [code for sublist in df['extracted_code'] for code in sublist]
code_df = pd.DataFrame(flattened_code_blocks, columns=['English_code'])

# Save the extracted code blocks into a CSV file
code_df.to_csv('extracted_code_blocks.csv', index=False)

# Print the DataFrame to verify

                                            English_code
0      for i in range(10):  # First digit\n    for j ...
1      def count_distinct_states(matrix):\n    count ...
2      def remove_spaces_and_punctuation(s):\n    res...
3      def remove_spaces_and_punctuation(s):\n    res...
4      import math\n\ndef is_prime(n):\n    # Check i...
...                                                  ...
90389  # A high-dimensional quadratic bowl.\n    ndim...
90390  tf.gradients(tfp.math.clip_by_value_preserve_g...
90391  def cholesky_concat_slow(chol, cols):  # cols ...
90392  inv_X = tf.lu_matrix_inverse(*tf.linalg.lu(X))...
90393  import numpy as np\n  import tensorflow as tf\...

[90394 rows x 1 columns]


In [22]:
pd.set_option('display.max_colwidth', None)
code_df.head()

Unnamed: 0,English_code
0,"for i in range(10): # First digit\n for j in range(10): # Second digit\n for k in range(10): # Third digit\n # Checking for the conditions\n if i != 5 and j != 5 and k != 5 and i != j and i != k and j != k:\n print(i, j, k)"
1,"def count_distinct_states(matrix):\n count = 0\n states = set()\n for row in matrix:\n for col in row:\n state = ''.join(col)\n if state not in states:\n count += 1\n states.add(state)\n return count\n\nmatrix = [['A', 'B', 'C'],\n ['A', 'B', 'D'],\n ['A', 'B', 'C']]\nprint(count_distinct_states(matrix))\n# Output: 4"
2,"def remove_spaces_and_punctuation(s):\n result = """"\n for char in s:\n if char.isalnum():\n result += char\n return result"
3,"def remove_spaces_and_punctuation(s):\n result = """"\n for char in s:\n ascii_value = ord(char)\n if (ascii_value >= 48 and ascii_value <= 57) or (ascii_value >= 65 and ascii_value <= 90) or (ascii_value >= 97 and ascii_value <= 122):\n result += char\n return result"
4,"import math\n\ndef is_prime(n):\n # Check if the number is less than 2 or not an integer\n if n < 2 or not isinstance(n, int):\n return ""Not Prime""\n\n # Check if the number is divisible by any integer from 2 to the square root of n\n for i in range(2, math.isqrt(n) + 1):\n if n % i == 0:\n return ""Not Prime""\n\n return ""Prime"""


In [9]:
from openai import OpenAI

client = OpenAI(
  api_key='',  # this is also the default, it can be omitted
)
# Load your list of keywords and their translations
keywords_translations = {
    "abs": {"French": "valeur_absolue", "Hindi": "निरपेक्ष_मान", "Bengali": "পরম_মান"},
    "all": {"French": "tous", "Hindi": "सब", "Bengali": "সব"},
    # Add more keywords here...
}
# # Read the CSV file into a DataFrame
# df = pd.read_csv('segregated_data.csv')

# # Convert the DataFrame to a dictionary where each column is a key and the values are lists of strings
# keywords_translations = df.to_dict(orient='list')

# # Remove NaN values from the lists
# for key in keywords_translations:
#     keywords_translations[key] = [x for x in keywords_translations[key] if pd.notna(x)]

# Function to generate examples using ChatGPT
def generate_code_examples(keyword, translations):
    prompt = f"""
    Generate a simple Python code example using the keyword `{keyword}` with comments explaining the code. Also, provide translations of both the Python keyword and the comments in French, Hindi, and Bengali. Here's the structure:
    1. The original Python code with comments in English.
    2. The translated code with comments in French, using the keyword `{translations['French']}`.
    3. The translated code with comments in Hindi, using the keyword `{translations['Hindi']}`.
    4. The translated code with comments in Bengali, using the keyword `{translations['Bengali']}`.
    """
    
    # Call the OpenAI API to get the generated content
    response = client.chat.completions.create(
        model="davinci-",  # Make sure this model is available to you
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1000
    )
    
    # Extract the response content
    return response.choices[0].message.content

# Loop through your keywords and generate examples
for keyword, translations in keywords_translations.items():
    try:
        examples = generate_code_examples(keyword, translations)
        print(f"Examples for '{keyword}':\n")
        print(examples)
        print("\n" + "="*80 + "\n")
    except Exception as e:
        print(f"An error occurred while processing '{keyword}': {str(e)}")


An error occurred while processing 'abs': Error code: 404 - {'error': {'message': 'The model `davinci-` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}
An error occurred while processing 'all': Error code: 404 - {'error': {'message': 'The model `davinci-` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}


In [27]:
import csv

def read_csv_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        headers = next(reader)
        data = list(reader)
    return headers, data

def write_csv_file(file_path, headers, data):
    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(headers)
        writer.writerows(data)

def generate_code_examples(keyword, translations):
    languages = list(translations.keys())
    prompt = f"""
    Generate a simple Python code example using the keyword `{keyword}` with comments explaining the code. 
    Then, provide translations of both the Python keyword and the comments in the following languages: {', '.join(languages)}.
    
    Here's the structure:
    1. The original Python code with comments in English.
    {chr(10).join([f"{i+2}. The translated code with comments in {lang}, using the keyword `{translations[lang]}`." for i, lang in enumerate(languages) if lang != 'English'])}
    
    Ensure that the translations are accurate and maintain the meaning of the original code and comments.
    Start each language section with '### [Language Name] ###' and end it with '### End [Language Name] ###'.
    """
    
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Changed to gpt-3.5-turbo for potentially faster and cheaper responses
        messages=[
            {"role": "system", "content": "You are a helpful assistant skilled in programming and multiple languages."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=2000
    )
    
    return response.choices[0].message.content

def parse_examples(content, languages):
    examples = {}
    print("Raw content from API:")
    print(content)
    print("\nAttempting to parse examples for languages:", languages)
    
    for lang in languages:
        start_marker = f"### {lang} ###"
        end_marker = f"### End {lang} ###"
        start_index = content.find(start_marker)
        end_index = content.find(end_marker)
        
        if start_index != -1 and end_index != -1:
            example = content[start_index + len(start_marker):end_index].strip()
            examples[lang] = example
            print(f"Found example for {lang}")
        else:
            print(f"Could not find markers for {lang}")
            # Fallback: try to find a section that mentions the language
            lang_section = content.split("\n\n")
            for section in lang_section:
                if lang.lower() in section.lower():
                    examples[lang] = section.strip()
                    print(f"Found fallback example for {lang}")
                    break
    
    return examples

# Read the input CSV file
input_file_path = 'segregated_data.csv'  # Replace with your actual input file path
headers, data = read_csv_file(input_file_path)

# Process the data
keywords_translations = {}
for row in data:
    english_keyword = row[headers.index('EnglishKey.txt')]
    translations = {headers[i].replace('Key', ''): value for i, value in enumerate(row) if headers[i] != 'EnglishKey'}
    keywords_translations[english_keyword] = translations

print("Keywords and translations:")
print(keywords_translations)

# Prepare data for output CSV
output_data = []
languages = [header.replace('Key', '') for header in headers]

# Generate examples for each keyword and prepare output data
for keyword, translations in keywords_translations.items():
    try:
        print(f"\nProcessing keyword: {keyword}")
        examples_content = generate_code_examples(keyword, translations)
        parsed_examples = parse_examples(examples_content, languages)
        
        row = [keyword]
        for lang in languages:
            row.append(parsed_examples.get(lang, ''))
        
        output_data.append(row)
        
        print(f"Generated examples for '{keyword}'")
        print(f"Parsed examples: {parsed_examples}")
    except Exception as e:
        print(f"An error occurred while processing '{keyword}': {str(e)}")

# Write the output CSV file
output_file_path = 'output_examples.csv'
output_headers = ['Keyword'] + languages
write_csv_file(output_file_path, output_headers, output_data)

print(f"\nExamples have been generated and saved to {output_file_path}")
print(f"Output data: {output_data}")

Keywords and translations:
{'abs': {'French.txt': 'valeur_absolue', 'Spanish.txt': 'valor_absoluto', 'Kurdish.txt': 'بەهای_ڕەها', 'English.txt': 'abs', 'Hindi.txt': 'निरपेक्ष_मान', 'Bengali.txt': 'পরম_মান', 'Mandarin.txt': '绝对值', 'Greek.txt': 'απόλ'}, 'all': {'French.txt': 'tous', 'Spanish.txt': 'todo', 'Kurdish.txt': 'گشت', 'English.txt': 'all', 'Hindi.txt': 'सब', 'Bengali.txt': 'সব', 'Mandarin.txt': '全', 'Greek.txt': 'όλα'}, 'any': {'French.txt': 'quelconque', 'Spanish.txt': 'cualquier', 'Kurdish.txt': 'هەر', 'English.txt': 'any', 'Hindi.txt': 'कोई', 'Bengali.txt': 'যেকোনো', 'Mandarin.txt': '任何', 'Greek.txt': 'όποια'}, 'ascii': {'French.txt': 'ascii', 'Spanish.txt': 'ascii', 'Kurdish.txt': 'ascii', 'English.txt': 'ascii', 'Hindi.txt': 'आस्की', 'Bengali.txt': 'আস্কি', 'Mandarin.txt': 'ascii码', 'Greek.txt': 'ascii'}, 'bin': {'French.txt': 'binaire', 'Spanish.txt': 'binario', 'Kurdish.txt': 'دوودوویی', 'English.txt': 'bin', 'Hindi.txt': 'द्वि', 'Bengali.txt': 'বাইনারি', 'Mandarin.txt': 

KeyboardInterrupt: 