In [None]:
# Necessary Installations
##---------------------##

# Installing openai (client library for the OpenAI API)
# Installing python-docx
# (langdetect is installed in the next cell)

!pip install openai
!pip install python-docx

In [None]:
!pip install langdetect

In [None]:
# Necessary Imports
##---------------##

# OpenAI library imported - Send API requests to Generative Pretrained Models that are not available in the open source repository, but are licensed, owned and serviced by OpenAI
# Pandas library imported with alias pd - Read and analyse data from a dataset at a memory location
# Re library imported - Regular Expression Parsing
# Imported detect function from langdetect library - Recognize Kannada language in text
import os
import re

import openai
import pandas as pd
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

In [None]:
# Preprocessing
##-----------##

# Cleaning each text item: removing @mentions and links
def text_cleaning(text):
    """Remove @mentions and http(s) links from one text item.

    Both patterns are replaced by a single space. Text that langdetect
    identifies as Kannada is returned right after those substitutions;
    any other text additionally has runs of spaces collapsed to one space.

    Parameters
    ----------
    text : str
        The raw text item.

    Returns
    -------
    str
        The cleaned text.
    """
    try:
        lang = detect(text)
    except LangDetectException:
        # langdetect raises when the text has no usable features (e.g. only
        # digits or punctuation); treat such items as non-Kannada instead of
        # aborting the whole DataFrame.apply().
        lang = None

    # Substitutions shared by every language.
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)

    # langdetect reports ISO 639-1 codes, so Kannada is 'kn' (never 'kan').
    if lang == 'kn':
        return text

    return re.sub(r" +", ' ', text)

In [None]:
# Load the test split from the Kaggle input directory
df = pd.read_csv(
    '/kaggle/input/kannadaoffensivetest/kannada_offensive_test.csv'
)
print(df)
# Clean every raw entry of the combined 'text, offense' column
df['text, offense'] = df['text, offense'].map(text_cleaning)

In [None]:
# Display the DataFrame after the 'text, offense' column has been cleaned
df

In [None]:
# Look up one cleaned row by index label (not rendered: only the last
# expression of a cell is displayed)
df['text, offense'].loc[29]
# Accumulators for the text part and the label part of every row
textl, offensel = [], []

In [None]:
# Splitting and formatting each text review item
# Start from empty lists so that re-running this cell cannot append the
# same rows a second time.
textl = []
offensel = []
for text in df['text, offense'][:777]:
    # Split on the LAST tab only: the label is the final field, so a tab
    # inside the comment text no longer breaks the two-way unpacking.
    # A row without any tab still raises ValueError, as before.
    stext, soffense = text.rsplit('\t', 1)
    textl.append(stext)
    offensel.append(soffense)

In [None]:
# Number of text items collected by the splitting loop
len(textl)

In [None]:
# Number of labels collected (not rendered: only a cell's last expression is displayed)
len(offensel)
# Print every collected label
print(offensel)

In [None]:
# Converting processed text item lists into pandas Series
# `textl` / `offensel` are deliberately left as plain lists: rebinding them
# to DataFrames would break the splitting cell (which appends to them) when
# it is run again.
text_col = pd.Series(textl, dtype='object')
offense_col = pd.Series(offensel, dtype='object')
print(offense_col)

# Appending the processed text items into separate columns of the existing DataFrame
# Assignment aligns on the index, so rows of `df` beyond the processed ones get NaN.
df['text'] = text_col
df['offense'] = offense_col
df.head()

In [None]:
# Display the combined 'text, offense' entry at index label 29
df['text, offense'][29]

In [None]:
# Further Preprocessing
def rem_spl(text):
    """Drop a leading '@'-prefixed token from a text item.

    Parameters
    ----------
    text : object
        The text item; it is converted with ``str()`` first.

    Returns
    -------
    str
        ``str(text)`` without its first space-delimited token when that
        token starts with '@'; otherwise ``str(text)`` unchanged. The
        space that followed the removed token is kept.
    """
    text = str(text)
    # startswith() is safe on the empty string, whereas text[0] raises IndexError.
    if text.startswith('@'):
        first_token = text.split(' ', 1)[0]
        # Slice off the leading token only; str.replace() would also delete
        # every later occurrence of the same token inside the text.
        text = text[len(first_token):]
    return text
# Apply rem_spl to every item of the 'text' column, overwriting the column
df['text'] = df['text'].apply(rem_spl)

In [None]:
# Further preprocessing
def clean_spaces(text):
    """Trim leading/trailing spaces and collapse inner runs of spaces.

    Only the space character is treated as a separator; tabs and newlines
    are left untouched.

    Parameters
    ----------
    text : str
        The text item to normalise.

    Returns
    -------
    str
        The text with single spaces between its space-delimited parts.
    """
    # split(' ') yields an empty string for every surplus space; dropping
    # those empties before re-joining is what actually collapses the runs
    # (reassigning the loop variable, as done previously, had no effect).
    return ' '.join(part for part in text.split(' ') if part)
# Apply clean_spaces to every item of the 'text' column, overwriting the column
df['text'] = df['text'].apply(clean_spaces)
# Display the text at index label 776 (the splitting loop processes rows [:777])
df['text'][776]

In [None]:
# Finding out the largest text item
# max() keeps the first item of maximal length, exactly like a strict ">" scan;
# default='' covers an empty column.
st = max(df['text'], key=len, default='')
max_l = len(st)
print(st)

In [None]:
# Preview - number of rows for each unique label in the 'offense' column
df['offense'].value_counts()

In [None]:
# OpenAI API Key - read from the OPENAI_API_KEY environment variable so the
# secret never has to be written into the notebook. Falls back to '' (the
# previous hard-coded value) when the variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

In [None]:
# Using the OpenAI API
##------------------##

def call_openai_api(text, model='text-curie-001', max_tokens=3):
    '''Send one prompt to an OpenAI completion model and return its answer.

    Uses the legacy `openai.Completion` interface, which openai-python 1.0
    removed, so this needs an `openai` release below 1.0.
    NOTE(review): 'text-curie-001' looks like a retired model - confirm it is
    still served, or pass another `model`.

    Parameters:
        text       - the prompt to forward to the model
        model      - name of the completion model (default: 'text-curie-001')
        max_tokens - maximum number of tokens in the completion (default: 3)

    temperature = 0 => Less variability in responses, more deterministic values

    Return value: the text of the first completion choice with surrounding
    whitespace removed, so that callers can compare it with short literals
    such as 'no' even when the raw completion starts with newlines.
    '''
    response = openai.Completion.create(
        model=model,
        max_tokens=max_tokens,
        prompt=text,
        n=1,
        stop=None,
        temperature=0)
    return response.choices[0].text.strip()

In [None]:
# Prompting using the function
##--------------------------##

# Determine whether the language is Kannada
# The prediction column has to exist before a row can be labelled; reading
# df['chatgpt_pred'] on a frame without it raises KeyError.
if 'chatgpt_pred' not in df.columns:
    df['chatgpt_pred'] = None

# 777 is the number of rows handled by the splitting loop earlier in the
# notebook; min() keeps the loop inside a shorter frame.
for i in range(min(777, len(df))):
    review = df.at[i, 'text']
    # NOTE(review): the prompt has no spaces around {review}; left unchanged
    # so that results stay comparable - confirm whether that is intended.
    prompt = f'is{review}in Kannada language.respond with yes or no only:'
    # Completions can carry surrounding whitespace and any capitalisation
    # (e.g. '\n\nNo'), so normalise before comparing.
    res = call_openai_api(prompt).strip().lower()
    #print (res)
    if res.startswith('no'):
        # .at writes into the frame itself; chained df[col][i] = ... may
        # write into a temporary copy instead.
        df.at[i, 'chatgpt_pred'] = 'not-Kannada'


In [None]:
# Determine whether the Kannada texts have offensive sentiments
# Make sure the prediction column exists so this cell can be read and written
# without a KeyError.
if 'chatgpt_pred' not in df.columns:
    df['chatgpt_pred'] = None

# 777 is the number of rows handled by the splitting loop earlier in the
# notebook; min() keeps the loop inside a shorter frame.
for i in range(min(777, len(df))):
    if df.at[i, 'chatgpt_pred'] != 'not-Kannada':
        review = df.at[i, 'text']
        # NOTE(review): the prompt has no space around {review}; left
        # unchanged so that results stay comparable - confirm intent.
        prompt = f'does{review}have offensive content/harsh sentiment/bad words when said in angry way.respond with yes or no only:'
        # Normalise whitespace and capitalisation before comparing.
        res = call_openai_api(prompt).strip().lower()
        #print (res)
        if res.startswith('no'):
            # .at writes into the frame itself rather than a possible copy.
            df.at[i, 'chatgpt_pred'] = 'Not_offensive'


In [None]:
# Determine the type of offensive sentiment in the Kannada text
# Make sure the prediction column exists so this cell can be read and written
# without a KeyError.
if 'chatgpt_pred' not in df.columns:
    df['chatgpt_pred'] = None

# Normalised model answer -> label stored in 'chatgpt_pred'
target_labels = {'g': 'OTIG', 'i': 'OTII', 'u': 'OU', 'unknown': 'OTIO'}

# 777 is the number of rows handled by the splitting loop earlier in the
# notebook; min() keeps the loop inside a shorter frame.
for i in range(min(777, len(df))):
    current = df.at[i, 'chatgpt_pred']
    # Python's boolean operator is `and`; `&&` is a syntax error.
    if current != 'not-Kannada' and current != 'Not_offensive':
        review = df.at[i, 'text']
        # NOTE(review): the prompt has no space before {review}; left
        # unchanged so that results stay comparable - confirm intent.
        prompt = f'is{review} targeted to a group or individual or untargeted or other.respond with g, i, u or unknown only:'
        # Normalise whitespace, capitalisation and a trailing full stop
        # before looking the answer up.
        res = call_openai_api(prompt).strip().lower().rstrip('.')
        #print (res)
        label = target_labels.get(res)
        if label is not None:
            # .at writes into the frame itself rather than a possible copy.
            df.at[i, 'chatgpt_pred'] = label


In [None]:
# Save the results
##--------------##

# Write the DataFrame to a CSV file in the current working directory
df.to_csv(path_or_buf='Kannadaoffensivechatgpt.csv')