In [2]:
import pandas as pd


In [4]:
df = pd.read_excel("human_ai.xlsx")

In [7]:
import pandas as pd
import nltk
from textblob import TextBlob

nltk.download('averaged_perceptron_tagger')

def calculate_metrics(df):
    """
    Calculates the average sentence length, average grammatical complexity, and average sentiment
    score for each row of text in the input DataFrame.
    
    Parameters:
        df (Pandas DataFrame): A DataFrame with a single column of text
    
    Returns:
        Pandas DataFrame: A new DataFrame with the columns 'average_sentence_length', 'average_grammatical_complexity',
        and 'average_sentiment'
    """
    # Define a function to calculate the POS tag count for a given sentence
    def pos_tag_count(sentence):
        pos_tags = nltk.pos_tag(nltk.word_tokenize(sentence))
        tag_count = len(pos_tags)
        return tag_count
    
    # Define a function to calculate the sentiment score for a given sentence
    def sentiment_score(sentence):
        blob = TextBlob(sentence)
        score = blob.sentiment.polarity
        return score
    
    # Tokenize the text into sentences
    sentences = df['prompt'].apply(nltk.sent_tokenize)
    
    # Calculate the average sentence length for each row
    df['average_sentence_length'] = sentences.apply(lambda x: sum(len(sentence.split()) for sentence in x)/len(x))
    
    # Calculate the average POS tag count for each row
    df['average_grammatical_complexity'] = sentences.apply(lambda x: sum(pos_tag_count(sentence) for sentence in x)/len(x))
    
    # Calculate the average sentiment score for each row
    df['average_sentiment'] = sentences.apply(lambda x: sum(sentiment_score(sentence) for sentence in x)/len(x))
    
    # Drop the 'sentences' column
    #df = df.drop('sentences', axis=1)
    
    return df[['average_sentence_length', 'average_grammatical_complexity', 'average_sentiment']]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/aaronweiss/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [8]:
# df2 = pd.DataFrame()

df2 = pd.read_excel("3class.xlsx")
df2

Unnamed: 0,prompt,completion
0,"\n\nIn the late 1800s, a woman by the name of ...",10thgrade
1,\n\nThe Hunt for Enemy Technology - Early Jets...,10thgrade
2,The Division of Labour\n\nThe division of labo...,6thgrade
3,\n\n“Yes the body:” A Quarantined Review of Sp...,10thgrade
4,"\n\nOnce upon a time, in a world far different...",6thgrade
...,...,...
295,\n\nThe History of Silicon Valley — A Brief Su...,6thgrade
296,\n\nYes the Body: A Quarantined Review of Spaw...,6thgrade
297,"\n\nAs I stand atop the Eildon Hills, overlook...",college
298,"\n\nAs human beings, we have always been fasci...",10thgrade


In [9]:
# Calculate the metrics
metrics_df2 = calculate_metrics(df2)
df2

print(list(df2.columns))

['prompt', 'completion', 'average_sentence_length', 'average_grammatical_complexity', 'average_sentiment']


In [12]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [13]:
# Define the text input
text_input = Input(shape=(100,), name='text_input')
embedding = Embedding(10000, 16, input_length=100)(text_input)
pooling = GlobalAveragePooling1D()(embedding)

# Define the numerical input
num_input = Input(shape=(3,), name='num_input')

# Concatenate the text and numerical inputs
concat = Concatenate()([pooling, num_input])

# Add a dense layer and output layer
dense1 = Dense(16, activation='relu')(concat)
output = Dense(3, activation='softmax')(dense1)

# Create the model
model = Model(inputs=[text_input, num_input], outputs=output)

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Load the data
# ...

# Preprocess the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df2['prompt'])
sequences = tokenizer.texts_to_sequences(df2['prompt'])
X_text = pad_sequences(sequences, maxlen=100)

# Get the numerical data
X_num = df2[['average_sentence_length', 'average_grammatical_complexity', 'average_sentiment']].values

# Convert the labels to one-hot encoding
y = tf.keras.utils.to_categorical(df2['completion'])

# Split the data into training and validation sets
X_text_train, X_text_val, X_num_train, X_num_val, y_train, y_val = train_test_split(X_text, X_num, y, test_size=0.2)

# Train the model
history = model.fit([X_text_train, X_num_train], y_train, epochs=10, validation_data=([X_text_val, X_num_val], y_val))

# Evaluate the model
test_loss, test_acc = model.evaluate([X_text_test, X_num_test], y_test)


NameError: name 'data' is not defined