# Finding Polarity 

In [1]:
import nltk
import csv
import numpy as np
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
def read_csv_column(file_path, row_start, row_end, column_index=7):
    """Collect one column from a half-open window of rows in a CSV file.

    Rows are numbered from 0 in file order. Every row read advances the
    counter, including rows too short to contain ``column_index``; those
    rows are skipped but still counted, so the window always refers to
    physical row positions.

    Args:
        file_path: Path of the CSV file to read.
        row_start: Index of the first row to include.
        row_end: Index one past the last row to include.
        column_index: Zero-based column to extract. Defaults to 7, the
            column this notebook uses for utterances.

    Returns:
        A list of the string values found in ``column_index`` for the rows
        in ``[row_start, row_end)`` that have that column.
    """
    data = []
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires for file objects passed to csv.reader.
    with open(file_path, 'r', newline='') as csvfile:
        for rowcount, row in enumerate(csv.reader(csvfile)):
            if rowcount >= row_end:
                break
            if rowcount >= row_start and len(row) > column_index:
                data.append(row[column_index])
    return data

In [3]:
# Read the row window [1, 5) of the full CHILDES utterance export; row 0 is skipped.
data = read_csv_column(
    file_path="/Users/tvidyala/Desktop/CHILDES/eng-NA-all-utterances.csv",
    row_start=1,
    row_end=5,
)

In [4]:
from transformers import pipeline

# Build a sentiment classifier from the DistilBERT SST-2 checkpoint.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# NOTE: this replaces whatever `data` held before this cell ran.
data = ["I feel good"]

for item in data:
    print(f"Text: {item}")
    result = sentiment_pipeline(item)
    # The pipeline returns a list of predictions; only the first is reported.
    label = result[0]['label']
    score = result[0]['score']
    print(f"Sentiment: {label}, Confidence: {score:.4f}\n")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Text: I feel good
Sentiment: POSITIVE, Confidence: 0.9999



In [5]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Score every entry currently held in `data` (set by an earlier cell) with VADER.
analyzer = SentimentIntensityAnalyzer()
for item in data:
    print(item)
    polarity = analyzer.polarity_scores(item)
    print(f"{polarity}\n")

I feel good
{'neg': 0.0, 'neu': 0.256, 'pos': 0.744, 'compound': 0.4404}



In [6]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Run the transformer classifier (created in an earlier cell) and VADER on the same text.
analyzer = SentimentIntensityAnalyzer()
data = ["i feel happy"]
for item in data:
    print(item)
    transformer_output = sentiment_pipeline(item)
    print(f"{transformer_output}\n")
    vader_output = analyzer.polarity_scores(item)
    print(f"{vader_output}\n")

i feel happy
[{'label': 'POSITIVE', 'score': 0.999883770942688}]

{'neg': 0.0, 'neu': 0.213, 'pos': 0.787, 'compound': 0.5719}



# Finding Valence, Arousal, and Dominance

In [7]:
def VAD(text, vad_scores):
    """Average the lexicon scores of the words in ``text``.

    The text is split on single spaces. Each token found in
    ``vad_scores.index`` contributes its row of three scores. A token's
    contribution is sign-flipped when a negation cue precedes it: 'no' or
    'not' among the previous six tokens, or "n't" appearing in the previous
    three tokens.

    Args:
        text: Utterance to score.
        vad_scores: Table indexed by word (e.g. a pandas DataFrame) whose
            rows hold three scores.

    Returns:
        The element-wise mean over the matched tokens (a pandas Series when
        ``vad_scores`` is a DataFrame). If no token is in the lexicon, a
        length-3 NumPy array of NaN is returned, so callers can detect the
        case with ``np.isnan(...).any()``.
    """
    tokens = text.split(' ')  # split once instead of on every lookup
    matched = 0
    text_vad = np.zeros([3,])
    for position, word in enumerate(tokens):
        if word not in vad_scores.index:
            continue
        previous_six = tokens[max(0, position - 6):position]
        previous_three = tokens[max(0, position - 3):position]
        # str() of the list is searched so contractions such as "don't" match.
        negated = (
            'no' in previous_six
            or 'not' in previous_six
            or 'n\'t' in str(previous_three)
        )
        neg = -1 if negated else 1  # reverse polarity for this word
        text_vad = vad_scores.loc[word] * neg + text_vad
        matched += 1
    if matched == 0:
        # Same NaN result as dividing by zero, without the RuntimeWarning.
        return np.full([3,], np.nan)
    return text_vad / matched

def read_csv_column(file_path, row_start, row_end, column_index):
    """Return the values of one column for a half-open window of CSV rows.

    Rows are numbered from 0 in file order and every row read is counted.
    Rows inside the window that are too short to contain ``column_index``
    are skipped, so the result may be shorter than ``row_end - row_start``.
    Because of that, lists obtained for different columns of the same file
    are only guaranteed to line up when they have the same length.

    Args:
        file_path: Path of the CSV file to read.
        row_start: Index of the first row to include.
        row_end: Index one past the last row to include.
        column_index: Zero-based column to extract.

    Returns:
        A list of the string values found in ``column_index``.
    """
    data = []
    # newline='' leaves line-ending handling to the csv module, so newlines
    # embedded in quoted fields are preserved exactly.
    with open(file_path, 'r', newline='') as csvfile:
        for rowcount, row in enumerate(csv.reader(csvfile)):
            if rowcount >= row_end:
                break
            if rowcount >= row_start and len(row) > column_index:
                data.append(row[column_index])
    return data

# Load three parallel columns for the same row window of the Laura transcript.
laura_csv = "/Users/tvidyala/Desktop/CHILDES/Laura.csv"
data = read_csv_column(laura_csv, 1, 10, 8)           # utterance text
speaker_data = read_csv_column(laura_csv, 1, 10, 4)   # speaker code, e.g. MOT / CHI
filename_data = read_csv_column(laura_csv, 1, 10, 0)  # source file name

# read_csv_column drops rows that are too short for the requested column, so
# the lists only describe the same rows when they all have the same length.
if not (len(data) == len(speaker_data) == len(filename_data)):
    raise ValueError("Columns read from Laura.csv are misaligned (some rows are too short).")

vad_scores = pd.read_csv("vad-nrc.csv", index_col='Word')

arousal_scores = []
valence_scores = []
dominance_scores = []

# Walk the columns in lockstep; looking rows up with data.index(text) would
# return the first occurrence and mislabel repeated utterances.
for number, (speaker, text) in enumerate(zip(speaker_data, data), start=1):
    vad = VAD(text, vad_scores)
    print(number)
    print(f'{speaker}: "{text}"')
    if np.isnan(vad).any():
        print("N/A \n")
    else:
        print(vad)
        print("")

    # The lexicon columns are ordered valence, arousal, dominance (see the
    # printed Series), so unpack positionally in that order.
    valence, arousal, dominance = np.asarray(vad, dtype=float)
    valence_scores.append(valence)
    arousal_scores.append(arousal)
    dominance_scores.append(dominance)

df = pd.DataFrame({
    'File Name': filename_data,
    'Utterance': data,
    'Speaker': speaker_data,
    'Arousal': arousal_scores,
    'Valence': valence_scores,
    'Dominance': dominance_scores
})

df.to_csv("/Users/tvidyala/Desktop/CHILDES/LauraVADScores.csv", index=False) # updates LauraVADScores with VAD Scores


1
MOT: "yeah"
N/A 

2
MOT: "this is July thirtieth"
N/A 

3
CHI: "go"
valence      0.510
arousal      0.441
dominance    0.444
Name: go, dtype: float64

4
MOT: "that's the recorder"
valence      0.551
arousal      0.374
dominance    0.481
Name: recorder, dtype: float64

5
MOT: "that's correct"
valence      0.857
arousal      0.306
dominance    0.723
Name: correct, dtype: float64

6
MOT: "day day day day day day day day day day day"
valence      0.719
arousal      0.269
dominance    0.389
Name: day, dtype: float64

7
CHI: "be"
valence      0.670
arousal      0.240
dominance    0.554
Name: be, dtype: float64

8
MOT: "be what"
valence      0.670
arousal      0.240
dominance    0.554
Name: be, dtype: float64

9
CHI: "Shower"
N/A 



  return text_vad / i


# Finding Average Valence, Arousal, and Dominance for Child by Age

In [11]:
def VAD(text, vad_scores):
    """Average the lexicon scores of the words in ``text``.

    The text is split on single spaces. Each token found in
    ``vad_scores.index`` contributes its row of three scores. A token's
    contribution is sign-flipped when a negation cue precedes it: 'no' or
    'not' among the previous six tokens, or "n't" appearing in the previous
    three tokens.

    Args:
        text: Utterance to score.
        vad_scores: Table indexed by word (e.g. a pandas DataFrame) whose
            rows hold three scores.

    Returns:
        The element-wise mean over the matched tokens (a pandas Series when
        ``vad_scores`` is a DataFrame). If no token is in the lexicon, a
        length-3 NumPy array of NaN is returned, so callers can detect the
        case with ``np.isnan(...).any()``.
    """
    tokens = text.split(' ')  # split once instead of on every lookup
    matched = 0
    text_vad = np.zeros([3,])
    for position, word in enumerate(tokens):
        if word not in vad_scores.index:
            continue
        previous_six = tokens[max(0, position - 6):position]
        previous_three = tokens[max(0, position - 3):position]
        # str() of the list is searched so contractions such as "don't" match.
        negated = (
            'no' in previous_six
            or 'not' in previous_six
            or 'n\'t' in str(previous_three)
        )
        neg = -1 if negated else 1  # reverse polarity for this word
        text_vad = vad_scores.loc[word] * neg + text_vad
        matched += 1
    if matched == 0:
        # Same NaN result as dividing by zero, without the RuntimeWarning.
        return np.full([3,], np.nan)
    return text_vad / matched

def read_csv_column(file_path, row_start, row_end, column_index):
    """Return the values of one column for a half-open window of CSV rows.

    Rows are numbered from 0 in file order and every row read is counted.
    Rows inside the window that are too short to contain ``column_index``
    are skipped, so the result may be shorter than ``row_end - row_start``.
    Because of that, lists obtained for different columns of the same file
    are only guaranteed to line up when they have the same length.

    Args:
        file_path: Path of the CSV file to read.
        row_start: Index of the first row to include.
        row_end: Index one past the last row to include.
        column_index: Zero-based column to extract.

    Returns:
        A list of the string values found in ``column_index``.
    """
    data = []
    # newline='' leaves line-ending handling to the csv module, so newlines
    # embedded in quoted fields are preserved exactly.
    with open(file_path, 'r', newline='') as csvfile:
        for rowcount, row in enumerate(csv.reader(csvfile)):
            if rowcount >= row_end:
                break
            if rowcount >= row_start and len(row) > column_index:
                data.append(row[column_index])
    return data

# Load three parallel columns for the same row window of the Laura transcript.
laura_csv = "/Users/tvidyala/Desktop/CHILDES/Laura.csv"
data = read_csv_column(laura_csv, 1, 100, 8)           # utterance text
speaker_data = read_csv_column(laura_csv, 1, 100, 4)   # speaker code, e.g. MOT / CHI
age_data = read_csv_column(laura_csv, 1, 100, 1)       # age field; its first three characters are compared with `age`

# read_csv_column drops rows that are too short for the requested column, so
# the lists only describe the same rows when they all have the same length.
if not (len(data) == len(speaker_data) == len(age_data)):
    raise ValueError("Columns read from Laura.csv are misaligned (some rows are too short).")

vad_scores = pd.read_csv("vad-nrc.csv", index_col='Word')

total_vad = np.zeros([3,])
valid_utterances = 0

age = "105" # 1 year and 5 months

# Walk the columns in lockstep; looking rows up with data.index(text) would
# return the first occurrence and give repeated utterances the wrong age/speaker.
for text, speaker, utterance_age in zip(data, speaker_data, age_data):
    if int(age) != int(utterance_age[:3]):
        continue
    if speaker != "CHI":
        continue  # only the child's utterances are averaged, so skip scoring others
    vad = np.asarray(VAD(text, vad_scores), dtype=float)
    if not np.isnan(vad).any():
        total_vad += vad
        valid_utterances += 1

if valid_utterances == 0:
    # Nothing to average: keep the NaN result without a divide-by-zero warning.
    average_vad = np.full([3,], np.nan)
else:
    average_vad = total_vad / valid_utterances

# `age` is one digit of years followed by two digits of months; int() drops
# the leading zero of single-digit months.
years, months = age[0], int(age[1:3])
print(f"Average VAD scores for Child's Utterances at {years} year(s) and {months} month(s)")
print("Valence:", average_vad[0])
print("Arousal:", average_vad[1])
print("Dominance:", average_vad[2])

Average VAD scores for Child's Utterances at 1 year(s) and 5 month(s)
Valence: 0.4869333333333333
Arousal: 0.43606666666666677
Dominance: 0.3637333333333333


  return text_vad / i
