In [1]:
!pip install vaderSentiment



In [2]:
# ******** MATH OPERATIONS ********
import numpy as np
from collections import Counter

# ******** TORCH LIBRARIES ******
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split

# ******** DATA MANIPULATION ******
import pandas as pd
from skimage.color import rgb2gray

# ******** DATA VISUALIZATION *****
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
%matplotlib inline
sns.set(style='white', context='notebook')

# ******** MODELING ***************
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from scipy.spatial import Voronoi, voronoi_plot_2d
from scipy.stats import pearsonr
from scipy.optimize import curve_fit
from sklearn.linear_model import Perceptron
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# ******** METRICS ****************
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Reference: coding tutorial 1 notebook on canvas

In [3]:
##### Bring the data in using pandas
filename1 = "scores.csv"
filename2 = "transcripts.csv"
# filename3 = "prosodic_features.csv"
# filename4 = "turker_scores_full_interview_old.csv"

# load the data
df_scores = pd.read_csv(filename1)
# the transcripts file is read with explicit column names supplied via `names=`
df_transcripts = pd.read_csv(filename2, names=["Participant", "Transcript"])
# df_prosodic = pd.read_csv(filename3)
# df_tscores = pd.read_csv(filename4)

# preview the first 10 rows of each frame.
# Only the last expression of a cell is rendered automatically, so the scores
# preview must be shown explicitly with display() (a Jupyter/IPython built-in);
# otherwise its result is silently discarded.
display(df_scores.head(10))
df_transcripts.head(10)
# df_prosodic.head(10)
# df_tscores.head(10)

# Reference: data cleaning notebook on canvas

Unnamed: 0,Participant,Transcript
0,p1,Interviewer: So how are you doing?|Interviewee...
1,p10,Interviewer: So how you doing?|Interviewee: G...
2,p11,Interviewer: So tell me about yourself. |Inte...
3,p12,Interviewer: So how are you doing today?|Inter...
4,p13,Interviewer: How are you doing today?|Intervie...
5,p14,Interviewer: So how are you doing today|Inter...
6,p15,Interviewer: Okay so how are you doing today? ...
7,p16,Interviewer: Great so how are you doing today...
8,p17,Interviewer: How are you doing today?|Intervie...
9,p20,Interviewer: How are you doing today?|Intervie...


In [4]:
# Print the column names, non-null counts and dtypes of every loaded frame.
# (df_prosodic and df_tscores are not loaded above, so they are not inspected.)
for _frame in (df_scores, df_transcripts):
    _frame.info()

# Reference: data cleaning notebook on canvas

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Participant  138 non-null    object 
 1   Overall      138 non-null    float64
 2   Excited      138 non-null    float64
dtypes: float64(2), object(1)
memory usage: 3.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Participant  138 non-null    object
 1   Transcript   138 non-null    object
dtypes: object(2)
memory usage: 2.3+ KB


In [13]:
def split_transcript_by_speaker(transcript):
    """Split one "|"-delimited transcript string into per-speaker text.

    Every "|"-separated segment is classified with substring checks, in this
    order: a segment containing "Interviewer:" is added to the interviewer
    text, otherwise a segment containing "Interviewee:" is added to the
    interviewee text, otherwise it is added to the "other" text. Speaker tags
    are replaced by a single space; untagged segments are prefixed with a
    single space. Segments are concatenated in their original order.

    Args:
        transcript: the full transcript of one interview as a single string.

    Returns:
        A tuple ``(interviewer_text, interviewee_text, other_text)`` of three
        strings; a part with no matching segments is the empty string.
    """
    interviewer_parts = []
    interviewee_parts = []
    other_parts = []
    for segment in transcript.split("|"):
        if "Interviewer:" in segment:
            interviewer_parts.append(segment.replace("Interviewer:", " "))
        elif "Interviewee:" in segment:
            interviewee_parts.append(segment.replace("Interviewee:", " "))
        else:
            other_parts.append(" " + segment)
    return (
        "".join(interviewer_parts),
        "".join(interviewee_parts),
        "".join(other_parts),
    )


# Split every transcript once, then write each speaker's text as a whole column
# (instead of iterating rows and writing one cell at a time through .loc).
_speaker_texts = [
    split_transcript_by_speaker(transcript)
    for transcript in df_transcripts["Transcript"]
]
for _position, _column in enumerate(
    ("InterviewerTranscript", "IntervieweeTranscript", "OtherTranscript")
):
    df_transcripts[_column] = [texts[_position] for texts in _speaker_texts]
    

In [15]:
# VADER sentiment scores for the full transcript and for each speaker's text.
# For every (prefix, source column) pair below, four float columns are written:
# <prefix>_neg, <prefix>_neu, <prefix>_pos and <prefix>_compound.
SENTIMENT_SOURCES = {
    "overall": "Transcript",
    "interviewer": "InterviewerTranscript",
    "interviewee": "IntervieweeTranscript",
    "other": "OtherTranscript",
}
SENTIMENT_KEYS = ("neg", "neu", "pos", "compound")

analyzer = SentimentIntensityAnalyzer()

for prefix, source_column in SENTIMENT_SOURCES.items():
    # Score each text of this source once, then fan the score dicts out into
    # one column per key (column order matches SENTIMENT_KEYS).
    scores = [analyzer.polarity_scores(text) for text in df_transcripts[source_column]]
    for key in SENTIMENT_KEYS:
        df_transcripts[f"{prefix}_{key}"] = [score[key] for score in scores]

df_transcripts.head()



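# NOTE: transcripts are split by interviewer vs interviewee in the cell above.
# TODO: decide whether to keep the bracketed notes about the participants' actions (e.g. [laughter]);
#       untagged ones currently land in OtherTranscript and are scored by VADER.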


Unnamed: 0,Participant,Transcript,InterviewerTranscript,IntervieweeTranscript,OtherTranscript,overall_neg,overall_neu,overall_pos,overall_compound,interviewer_neg,...,interviewer_pos,interviewer_compound,interviewee_neg,interviewee_neu,interviewee_pos,interviewee_compound,other_neg,other_neu,other_pos,other_compound
0,p1,Interviewer: So how are you doing?|Interviewee...,So how are you doing? Ok well so please te...,Im pretty good. ok uhm so have you looked...,,0.013,0.859,0.128,0.9955,0.055,...,0.243,0.9277,0.015,0.868,0.117,0.9927,0.0,0.0,0.0,0.0
1,p10,Interviewer: So how you doing?|Interviewee: G...,So how you doing? I'm okay. I understand ...,Great how about you? I'm a little [???] by...,[laughter],0.026,0.852,0.122,0.9988,0.059,...,0.156,0.8418,0.026,0.842,0.132,0.9989,0.0,0.0,1.0,0.4939
2,p11,Interviewer: So tell me about yourself. |Inte...,So tell me about yourself. Great okay. C...,Uhh I’m a junior at MIT uhh I’m double maj...,,0.02,0.874,0.105,0.9966,0.029,...,0.236,0.9621,0.018,0.892,0.089,0.9936,0.0,0.0,0.0,0.0
3,p12,Interviewer: So how are you doing today?|Inter...,So how are you doing today? Good. So why d...,I'm good how are you? Ok so I'm a Junior...,(both laugh),0.027,0.842,0.131,0.9978,0.038,...,0.214,0.9654,0.023,0.852,0.125,0.9968,0.0,0.217,0.783,0.5574
4,p13,Interviewer: How are you doing today?|Intervie...,How are you doing today? Good. So why don'...,Good. Ok umm I'm currently a junior at M....,,0.038,0.838,0.124,0.9945,0.05,...,0.272,0.9714,0.051,0.841,0.107,0.9771,0.0,0.0,0.0,0.0
