# Loading Roo data

You should have received a .json file from Michael. Please copy it to this directory and rename it to `credential.json` but _do not_ commit it to source control. Next, set the name of the database table you want to load. Unless we've told you to change it, you can leave this unchanged:

In [1]:
DATA_TABLE = "roo-ds.ai_studio_2024.labeled_conversations"

Now run the following, and you'll get a variable named `df` containing a dataframe of all labeled conversations.

In [2]:
from google.auth.exceptions import DefaultCredentialsError
from google.cloud import bigquery
from google.oauth2 import service_account
import numpy
import pandas as pd

credentials = service_account.Credentials.from_service_account_file(
    "./credential.json",
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
client = bigquery.Client(credentials=credentials)

df = client.query(f"SELECT * FROM {DATA_TABLE}").to_dataframe(create_bqstorage_client=False)

In [3]:
df.head

<bound method NDFrame.head of                    Genesys_interaction_id        Gender  \
0    56663169-94d3-4a0c-a427-2f4752f20f32          None   
1    9d8858a7-0821-4546-b69b-51f03e381830          None   
2    d56a9d48-a45a-4482-9dde-a826ede0c69b          None   
3    bc3f2707-70f2-4f1c-8080-11885cd0b60a          None   
4    74c9a6bb-fc6c-49e9-8a34-8f90af8250ef          None   
..                                    ...           ...   
743  0e12bb57-bd23-4bef-8314-f754a2f2db09          None   
744  93ed5a1f-65f7-472c-b8bf-fd300c7e1633  Girl / Woman   
745  93ed5a1f-65f7-472c-b8bf-fd300c7e1633  Girl / Woman   
746  93ed5a1f-65f7-472c-b8bf-fd300c7e1633  Girl / Woman   
747  93ed5a1f-65f7-472c-b8bf-fd300c7e1633  Girl / Woman   

                         Race Topic   Age  \
0                        None  None  None   
1                        None  None  None   
2                        None  None  None   
3                        None  None  None   
4                        None  None 

In [4]:
df.head(10)

Unnamed: 0,Genesys_interaction_id,Gender,Race,Topic,Age,Full_conversation,Comfort,Quality,Helpfulness,Confidence,...,Interaction_contains_PII,First_label,Flag_label_for_review,Comment,Provided_prompt,Provided_prompt_autocalculated,Labeller,Reviewer_suggested_label,Reviewer,Reviewer_comment
0,56663169-94d3-4a0c-a427-2f4752f20f32,,,,,"[{'text': 'Hey there, I'm Roo! You can ask me ...",,,,,...,False,,False,,False,False,Michelle,,,
1,9d8858a7-0821-4546-b69b-51f03e381830,,,,,"[{'text': 'Hey there, I'm Roo! You can ask me ...",,,,,...,False,,False,,False,False,Michelle,,,
2,d56a9d48-a45a-4482-9dde-a826ede0c69b,,,,,"[{'text': 'Hey there, I'm Roo! You can ask me ...",,,,,...,False,,False,,False,False,Michelle,,,
3,bc3f2707-70f2-4f1c-8080-11885cd0b60a,,,,,"[{'text': 'Hola quiero sacarme una duda', 'use...",,,,,...,False,,False,Spanish,False,False,Michelle,,,
4,74c9a6bb-fc6c-49e9-8a34-8f90af8250ef,,,,,[{'text': 'All health educators are currently ...,,,,,...,False,,False,,False,False,Michelle,,,
5,b894636a-1bc4-4d80-9476-a60a3a8a79c0,,,,,"[{'text': 'prompt:livechatinstant', 'user': 'c...",,,,,...,False,,False,,False,False,Michelle,,,
6,33d4a2f8-099b-4e8f-bad8-ba79899b765e,,,,,"[{'text': 'Hi', 'user': 'customer'}, {'text': ...",,,,,...,False,,False,,False,False,Michelle,,,
7,9fba761d-8e47-4e58-ad19-25ede8077d74,,,,,"[{'text': 'Hey there, I'm Roo! You can ask me ...",,,,,...,False,,False,,False,False,Michelle,,,
8,6617d161-9d13-4020-badd-241c6fff5987,,,,,[{'text': 'Tengo 11 días de que tuve relacione...,,,,,...,False,,False,,False,False,Michelle,,,
9,b8aaa5de-363c-46f7-9147-6ff6d12c8b7e,,,,,[{'text': 'I am currently on the sprintec birt...,,,,,...,False,,False,,False,False,Michelle,,,


In [5]:
df.shape[0] #inspecting the number of rows

748

In [6]:
df.shape[1] #inspecting the number of columns

22

In [7]:
df.columns.tolist() #creating a list of columns

['Genesys_interaction_id',
 'Gender',
 'Race',
 'Topic',
 'Age',
 'Full_conversation',
 'Comfort',
 'Quality',
 'Helpfulness',
 'Confidence',
 'First_prompt',
 'First_response',
 'Interaction_contains_PII',
 'First_label',
 'Flag_label_for_review',
 'Comment',
 'Provided_prompt',
 'Provided_prompt_autocalculated',
 'Labeller',
 'Reviewer_suggested_label',
 'Reviewer',
 'Reviewer_comment']

In [8]:
df.dtypes #finding all the data types of our columns

Genesys_interaction_id             object
Gender                             object
Race                               object
Topic                              object
Age                                object
Full_conversation                  object
Comfort                            object
Quality                            object
Helpfulness                        object
Confidence                         object
First_prompt                       object
First_response                     object
Interaction_contains_PII          boolean
First_label                        object
Flag_label_for_review             boolean
Comment                            object
Provided_prompt                   boolean
Provided_prompt_autocalculated    boolean
Labeller                           object
Reviewer_suggested_label           object
Reviewer                           object
Reviewer_comment                   object
dtype: object

In [9]:
missing_values = df.isnull().sum()

In [10]:
print(missing_values)

Genesys_interaction_id              0
Gender                            393
Race                              393
Topic                             395
Age                               393
Full_conversation                   0
Comfort                           746
Quality                           745
Helpfulness                       730
Confidence                        738
First_prompt                        0
First_response                      5
Interaction_contains_PII            0
First_label                       291
Flag_label_for_review               0
Comment                           613
Provided_prompt                     0
Provided_prompt_autocalculated      0
Labeller                          390
Reviewer_suggested_label          656
Reviewer                          653
Reviewer_comment                  732
dtype: int64


In [11]:
useful_columns = missing_values[missing_values > 393].index #dropping all columns where it greater tahn 393 because that is about a little more tha half of the dataset

In [12]:
new_df = df.drop(columns=useful_columns)

In [13]:
useful_columns.tolist() #columns to get rid of

['Topic',
 'Comfort',
 'Quality',
 'Helpfulness',
 'Confidence',
 'Comment',
 'Reviewer_suggested_label',
 'Reviewer',
 'Reviewer_comment']

In [14]:
new_df.columns.tolist() #kept columns 

['Genesys_interaction_id',
 'Gender',
 'Race',
 'Age',
 'Full_conversation',
 'First_prompt',
 'First_response',
 'Interaction_contains_PII',
 'First_label',
 'Flag_label_for_review',
 'Provided_prompt',
 'Provided_prompt_autocalculated',
 'Labeller']

In [15]:
## Inspecting the new data 

#RooMensturation_df = pd.read_csv('Roo_Mensturation.csv') - not qorking not sure why will isnpect later

In [16]:
#RooMensturation_df.head 

In [17]:
RooRelationship_df = pd.read_csv('Roo_Relationship.csv')

In [18]:
RooRelationship_df.head 

<bound method NDFrame.head of                                              Prompt   \
0  Is it okay to tell someone I love them after 3...   
1  "how should i know the proper way to give cons...   
2                           "how do i give consent?"   
3    how can I talk to my partner about boundaries?    
4  How do I talk to my religious parents about my...   
5               Should I break up with my boyfriend?   

                                           Response   \
0  "It might feel awkward or even scary to tell a...   
1  "Consent means that whether you’re kissing, ho...   
2  "The easiest way to ask for consent is to talk...   
3  Relationship problems can be really tough to n...   
4  I'd love to help you, but I think a real perso...   
5  It can be hard to break up with someone. Be st...   

  Response Category (FP, FN, TP, TN)  \
0                                 FP   
1                                 FP   
2                                 FP   
3                       

In [19]:
RooReligion_df = pd.read_csv('Roo_Religion.csv')

In [20]:
RooReligion_df.head 

<bound method NDFrame.head of                                          Prompt   \
0                                   Is God real?   
1  What does christianity say about reproduction   
2                            How do I meal prep?   
3               Can you help me with my homework   

                                           Response   \
0  I am a robot. Look, I even make robot noises: ...   
1  Sometimes we bots have a hard time understandi...   
2  You need a prescription from a nurse or doctor...   
3  I can try my best to answer, but if your quest...   

  Response Category (FP, FN, TP, TN)                           Errors Notes  \
0                                 FN           Not sure what this is saying   
1                                 TN    Did not really give a real response   
2                                 FP  Meal prep and PrEP are VERY different   
3                                 TN                                    NaN   

                             Sug

In [21]:
RooReprodHealth_df = pd.read_csv('Roo_Reprod.csv')

In [22]:
RooReprodHealth_df.head 

<bound method NDFrame.head of                                              Prompt   \
0                   Where can I get an abortion?\n\n   
1                               Where can I get IVF?   
2  Can I go to a different state to get an abortion?   
3    Can you get an abortion after the baby is born?   
4             Can you get an abortion at 9 months?\n   

                                           Response   \
0  This state-by-state guide has the most up to d...   
1  This state-by-state guide has the most up to d...   
2  I'd love to help you, but I think a real perso...   
3  I think a real person can give you better advi...   
4  It depends on what state you live in, since di...   

  Response Category (FP, FN, TP, TN)  \
0                                 TP   
1                                 FP   
2                                 TN   
3                                 FN   
4                                 TN   

                                        Errors Notes  \

In [23]:
RooSexMasturbation_df = pd.read_csv('Roo_SexMasturbation.csv')

In [24]:
RooReprodHealth_df.head

<bound method NDFrame.head of                                              Prompt   \
0                   Where can I get an abortion?\n\n   
1                               Where can I get IVF?   
2  Can I go to a different state to get an abortion?   
3    Can you get an abortion after the baby is born?   
4             Can you get an abortion at 9 months?\n   

                                           Response   \
0  This state-by-state guide has the most up to d...   
1  This state-by-state guide has the most up to d...   
2  I'd love to help you, but I think a real perso...   
3  I think a real person can give you better advi...   
4  It depends on what state you live in, since di...   

  Response Category (FP, FN, TP, TN)  \
0                                 TP   
1                                 FP   
2                                 TN   
3                                 FN   
4                                 TN   

                                        Errors Notes  \

In [25]:
RooSexOrientation_df = pd.read_csv('Roo_SexualOrientation.csv')

In [26]:
RooSexOrientation_df.head 

<bound method NDFrame.head of                                              Prompt   \
0                            Is it okay to be gay?\n   
1  What is the difference between homosexual and ...   
2                    How can I tell if I’m bisexual?   
3  How can I talk to my family about my sexual or...   
4            Does watching gay porn mean you're gay?   
5                     How cin I tell if I’m biexual?   

                                           Response   \
0  No matter what your sexual orientation is, it'...   
1  Being non-binary means someone’s gender identi...   
2  The only way to know is by how you feel. Being...   
3  It's smart to talk to a parent about sex, even...   
4  What you fantasize about when you masturbate d...   
5  Telling someone you’re intersex is a personal ...   

  Response Category (FP, FN, TP, TN)  \
0                                 TP   
1                                 FP   
2                                 TP   
3                       

In [27]:
#compared with Krit, and revised my code
import re
import nltk
from nltk.corpus import stopwords
import spacy
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")

stop_words = set(stopwords.words('english'))
custom_stop_words = [word for word in stop_words if word not in ['he','his','him','she','her','hers','they','their','theirs']]

def clean_text(text):
    if isinstance(text, bytes):
        try:
            text = text.decode('utf-8')
        except UnicodeDecodeError:
            print("Error decoding bytes to UTF-8 for text:", text)
            return text
    
    if isinstance(text, str):
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = text.lower()
        tokens = [token.text for token in nlp(text) if token.text not in custom_stop_words]
        lemmatized_tokens = [token.lemma_ for token in nlp(' '.join(tokens))]
        return ' '.join(lemmatized_tokens)
    else:
        return text

df['Full_conversation'] = df['Full_conversation'].apply(clean_text)
df['First_prompt'] = df['First_prompt'].apply(clean_text)
df['First_response'] = df['First_response'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abena\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
# Seperate df based on ratings null 
df_no_ratings = df[df['First_label'].isnull()]
df_ratings = df[df['First_label'].notnull()]

# Handle missing values in df_no_ratings by filling them with empty strings
df_no_ratings['First_prompt'].fillna("", inplace=True)
df_no_ratings['First_response'].fillna("", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_no_ratings['First_prompt'].fillna("", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_ratings['First_prompt'].fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col

In [46]:
# #Create a TfidfVectorizer object 
# tfidf_vectorizer = TfidfVectorizer()

# #Fit the vectorizer to X_train
# tfidf_vectorizer.fit(X_train)

# #Print the first 50 items in the vocabulary
# print("Vocabulary size {0}: ".format(len(tfidf_vectorizer.vocabulary_)))
# print(str(list(tfidf_vectorizer.vocabulary_.items())[0:50])+'\n')

      
# #Transform *both* the training and test data using the fitted vectorizer and its 'transform' attribute
# X_train_tfidf = tfidf_vectorizer.transform(X_train)
# X_test_tfidf = tfidf_vectorizer.transform(X_test)


# # Print the matrix
# print(X_train_tfidf.todense())

Vocabulary size 2130: 
[('text', 1851), ('hey', 895), ('there', 1863), ('roo', 1596), ('you', 2123), ('can', 360), ('ask', 194), ('me', 1141), ('questions', 1514), ('like', 1066), ('nwhat', 1273), ('happens', 853), ('during', 605), ('puberty', 1498), ('ncan', 1223), ('lemon', 1053), ('juice', 1008), ('kill', 1018), ('sperm', 1722), ('nhow', 1241), ('long', 1094), ('after', 119), ('unprotected', 1971), ('sex', 1645), ('know', 1024), ('if', 935), ('really', 1535), ('pregnant', 1456), ('user', 1988), ('workflow', 2100), ('btw', 335), ('this', 1875), ('chat', 387), ('is', 991), ('completely', 438), ('private', 1472), ('and', 154), ('confidential', 450), ('nstart', 1265), ('typing', 1951), ('to', 1898), ('question', 1513), ('do', 577), ('all', 133), ('pregnancy', 1455), ('test', 1842), ('customer', 512), ('had', 843), ('have', 859), ('signs', 1675)]

[[0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.0375775 ... 0.        0.        0.       ]
 [0.    

In [47]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn import preprocessing

# Separate data into those with and without ratings
df_no_ratings = df[df['First_label'].isnull()]
df_ratings = df[df['First_label'].notnull()]

# Handle missing values in df_no_ratings by filling with empty strings
df_no_ratings = df_no_ratings.fillna({'First_prompt': "", 'First_response': ""})

# 1. Load and prepare data from df_ratings
prompt = df_ratings['First_prompt']
response = df_ratings['First_label']  # Use 'First_label' if it's categorical

# Label encode the response variable
le = preprocessing.LabelEncoder()
y = le.fit_transform(response)

# 2. Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(prompt, y, test_size=0.3, random_state=42)

# 3. Build pipeline with TF-IDF and SVM
pipeline = make_pipeline(
    TfidfVectorizer(),      # Step 1: Convert text to TF-IDF features
    SVC(kernel='linear')    # Step 2: SVM classifier with a linear kernel
)

# 4. Train the model
pipeline.fit(X_train, y_train)

# 5. Make predictions and evaluate
y_pred = pipeline.predict(X_test)

#Displaying the confusion matrix

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(conf_matrix)


# Evaluate the model
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Classification Report:\n", metrics.classification_report(y_test, y_pred, target_names=le.classes_))


[[ 2  0  0  2]
 [ 0 24  0 39]
 [ 0  2  0  2]
 [ 1 17  0 49]]
Accuracy: 0.5434782608695652
Classification Report:
               precision    recall  f1-score   support

          FN       0.67      0.50      0.57         4
          FP       0.56      0.38      0.45        63
          TN       0.00      0.00      0.00         4
          TP       0.53      0.73      0.62        67

    accuracy                           0.54       138
   macro avg       0.44      0.40      0.41       138
weighted avg       0.53      0.54      0.52       138



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
#installing necesary dependencies 
#!pip install tensorflow transformers tokenizers, gonna install directly into virtual enviornment

In [32]:
!pip install scikit-learn
 




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [48]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Dense, Input, Embedding, LSTM, Dropout, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Set some parameters
vocab_size = 10000   # Adjust depending on your dataset size
embedding_dim = 100  
max_length = 100    
batch_size = 32
epochs = 10

## Tokenization
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df_ratings['First_prompt'])  

# Convert texts (both prompt and response) to sequences
X_prompt = tokenizer.texts_to_sequences(df_ratings['First_prompt'])
X_response = tokenizer.texts_to_sequences(df_ratings['First_response'])

# Padding sequences
X_prompt = pad_sequences(X_prompt, maxlen=max_length, padding='post')
X_response = pad_sequences(X_response, maxlen=max_length, padding='post')

# Label encoding
le = preprocessing.LabelEncoder()
y = le.fit_transform(df_ratings['First_label'])  

# Conversion to a suitable data type
y = np.array(y)

# Combine the prompt and response sequences
X_combined = np.concatenate([X_prompt, X_response], axis=1)  

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

#lstm model
def create_lstm_model():
    input_layer = Input(shape= (2 * max_length,))
    embedding_layer = Embedding(vocab_size, embedding_dim, input_length=max_length)(input_layer)
    lstm_layer = Bidirectional(LSTM(128, return_sequences=True))(embedding_layer)
    dropout_layer = Dropout(0.7)(lstm_layer)
    lstm_layer_2 = LSTM(64)(dropout_layer)
    output_layer = Dense(1, activation='sigmoid')(lstm_layer_2)  
    
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

model = create_lstm_model()

# model recallodel back
checkpoint = ModelCheckpoint('best_lstm_model.keras', monitor='val_loss', save_best_only=True, mode='min')
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Training the model
history = model.fit(
    X_train, y_train,  # Use training data for model fitting
    validation_data=(X_test, y_test),  # Use validation data
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[checkpoint, early_stopping]
)

# Predicting on the test set
y_pred_prob = model.predict(X_test)

# Convert predicted probabilities to binary labels
y_pred = (y_pred_prob > 0.5).astype(int)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix and classification report
print(conf_matrix)

print(classification_report(y_test, y_pred))

Epoch 1/10




[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 403ms/step - accuracy: 0.3788 - loss: 0.5496 - val_accuracy: 0.4348 - val_loss: 0.2116
Epoch 2/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 344ms/step - accuracy: 0.4041 - loss: 0.0718 - val_accuracy: 0.4348 - val_loss: -0.7123
Epoch 3/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 324ms/step - accuracy: 0.4159 - loss: -1.0467 - val_accuracy: 0.4348 - val_loss: -3.0980
Epoch 4/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 331ms/step - accuracy: 0.3496 - loss: -3.6347 - val_accuracy: 0.4348 - val_loss: -5.3161
Epoch 5/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 349ms/step - accuracy: 0.3917 - loss: -4.9843 - val_accuracy: 0.4348 - val_loss: -6.4574
Epoch 6/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 331ms/step - accuracy: 0.3916 - loss: -6.1761 - val_accuracy: 0.4348 - val_loss: -7.2957
Epoch 7/10
[1m12/12[0m [32

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn import preprocessing

# Preparing the dataset
df['Full_conversation'] = df['Full_conversation'].fillna('')
df['First_prompt'] = df['First_prompt'].fillna('')
df['First_response'] = df['First_response'].fillna('')

# Combine text features if needed
df['Combined_text'] = (
    df['Full_conversation'].astype(str) + " " +
    df['First_prompt'].astype(str) + " " +
    df['First_response'].astype(str)
)

# Label encoding for the target variable
le = preprocessing.LabelEncoder()
df['First_label'] = df_ratings['First_label']
y = le.fit_transform(df['First_label'])

# Define features
X_prompt = df['First_prompt']
X_response = df['First_response']

# Split into training and testing data
X_prompt_train, X_prompt_test, X_response_train, X_response_test, y_train, y_test = train_test_split(
    X_prompt, X_response, y, test_size=0.2, random_state=927)

# Vectorize prompt and response separately
vectorizer_prompt = TfidfVectorizer()
vectorizer_response = TfidfVectorizer()

X_prompt_tfidf_train = vectorizer_prompt.fit_transform(X_prompt_train)
X_prompt_tfidf_test = vectorizer_prompt.transform(X_prompt_test)

X_response_tfidf_train = vectorizer_response.fit_transform(X_response_train)
X_response_tfidf_test = vectorizer_response.transform(X_response_test)

# Combine the TF-IDF features using hstack
X_train_tfidf = hstack([X_prompt_tfidf_train, X_response_tfidf_train])
X_test_tfidf = hstack([X_prompt_tfidf_test, X_response_tfidf_test])

# Check the sizes of the matrices
print(f"Train TF-IDF matrix shape: {X_train_tfidf.shape}")
print(f"Test TF-IDF matrix shape: {X_test_tfidf.shape}")

# Training the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

# Making predictions
y_pred = rf_model.predict(X_test_tfidf)

# Evaluating the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Full_conversation'] = df['Full_conversation'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['First_prompt'] = df['First_prompt'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['First_response'] = df['First_response'].fillna('')
A value is trying to be set on a copy 

Train TF-IDF matrix shape: (365, 1483)
Test TF-IDF matrix shape: (92, 1483)
Accuracy: 0.6630434782608695

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.40      0.57         5
           1       0.56      0.78      0.65        32
           2       0.40      0.67      0.50         3
           3       0.80      0.62      0.70        52

    accuracy                           0.66        92
   macro avg       0.69      0.62      0.60        92
weighted avg       0.71      0.66      0.67        92



In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn import preprocessing

# Preparing the dataset
df['Full_conversation'] = df['Full_conversation'].fillna('')
df['First_prompt'] = df['First_prompt'].fillna('')
df['First_response'] = df['First_response'].fillna('')

# Combine text features if needed
df['Combined_text'] = (
    df['Full_conversation'].astype(str) + " " +
    df['First_prompt'].astype(str) + " " +
    df['First_response'].astype(str)
)

# Label encoding for the target variable
le = preprocessing.LabelEncoder()
df['First_label'] = df_ratings['First_label']
y = le.fit_transform(df['First_label'])

# Define features
X_prompt = df['First_prompt']
X_response = df['First_response']

# Split into training and testing data
X_prompt_train, X_prompt_test, X_response_train, X_response_test, y_train, y_test = train_test_split(
    X_prompt, X_response, y, test_size=0.2, random_state=927)

# Vectorize prompt and response separately
vectorizer_prompt = TfidfVectorizer()
vectorizer_response = TfidfVectorizer()

X_prompt_tfidf_train = vectorizer_prompt.fit_transform(X_prompt_train)
X_prompt_tfidf_test = vectorizer_prompt.transform(X_prompt_test)

X_response_tfidf_train = vectorizer_response.fit_transform(X_response_train)
X_response_tfidf_test = vectorizer_response.transform(X_response_test)

# Combine the TF-IDF features using hstack
X_train_tfidf = hstack([X_prompt_tfidf_train, X_response_tfidf_train])
X_test_tfidf = hstack([X_prompt_tfidf_test, X_response_tfidf_test])

# Check the sizes of the matrices
print(f"Train TF-IDF matrix shape: {X_train_tfidf.shape}")
print(f"Test TF-IDF matrix shape: {X_test_tfidf.shape}")

# Training the Logistic Regression model
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)  # Increase max_iter for convergence
log_reg_model.fit(X_train_tfidf, y_train)

# Making predictions
y_pred = log_reg_model.predict(X_test_tfidf)

# Evaluating the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Train TF-IDF matrix shape: (365, 1483)
Test TF-IDF matrix shape: (92, 1483)
Accuracy: 0.7065217391304348

Classification Report:
               precision    recall  f1-score   support

           0       0.71      1.00      0.83         5
           1       0.60      0.78      0.68        32
           2       0.00      0.00      0.00         3
           3       0.81      0.67      0.74        52

    accuracy                           0.71        92
   macro avg       0.53      0.61      0.56        92
weighted avg       0.71      0.71      0.70        92



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Full_conversation'] = df['Full_conversation'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['First_prompt'] = df['First_prompt'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['First_response'] = df['First_response'].fillna('')
A value is trying to be set on a copy 