In [1]:
import numpy as np
import matplotlib.pyplot as plt
import csv
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn import preprocessing
from scipy import stats
import nltk
from nltk.corpus import stopwords
import string

In [3]:
# Build a word-level dataset: one row per (article, sentence, word) occurrence,
# joined with per-word memorability predictions, and write it to CSV.

# ---- Load articles and split them into sentences ----
df = pd.read_excel('ClimateFeedback_articles_202101.xlsx')

# Tokenize the body into sentences, then put the headline in front so that
# sentence_num 0 is always the title and body sentences start at 1.
df['tokenized_sentences'] = df['text'].map(nltk.sent_tokenize)
df['tokenized_sentences'] = pd.Series(
    [
        [title] + sentences
        for title, sentences in zip(df['title, original source'], df['tokenized_sentences'])
    ],
    index=df.index,
)
df['sentence_num'] = df['tokenized_sentences'].map(
    lambda sentences: list(range(len(sentences)))
)

# One row per sentence; both list columns are exploded in lockstep.
df = df.explode(['tokenized_sentences', 'sentence_num'])

# ---- Normalize sentence text ----
# Punctuation to strip; '_' is kept because it is used as the word joiner.
remove = string.punctuation
remove = remove.replace('_', "")

# Hyphen, en dash and em dash become '_' so hyphenated terms stay one token.
joiners = '-–—'
# Everything deleted outright: digits, ASCII punctuation (minus the joiners,
# which must map to '_' rather than be dropped) and curly quotes.
deleted = string.digits + ''.join(ch for ch in remove if ch not in joiners) + '“”‘’'
# Built once instead of once per row.
sentence_table = str.maketrans(joiners, '_' * len(joiners), deleted)


def clean_sentence(sentence):
    """Lower-case a sentence, turn dashes into '_', and strip digits, punctuation and curly quotes."""
    return sentence.lower().translate(sentence_table)


df['tokenized_sentences'] = df['tokenized_sentences'].map(clean_sentence)

# ---- Split sentences into words ----
df['tokenized_words'] = df['tokenized_sentences'].map(nltk.word_tokenize)

# Remove stop words.
# NOTE(review): apostrophes were stripped above, so if the stop-word list
# contains contracted forms (e.g. "don't") they can no longer match - confirm
# whether that matters for this analysis.
stop_words = set(stopwords.words('english'))
df['tokenized_words'] = df['tokenized_words'].map(
    lambda words: [w for w in words if w not in stop_words]
)

# One row per word occurrence.
df = df.explode('tokenized_words')

# Sentences that ended up with no words explode to a single NaN token. Drop
# them explicitly: merge() matches null keys against each other, so a null
# entry in the scores file would otherwise pull these rows into the dataset.
df = df.dropna(subset=['tokenized_words'])

# ---- Derived variables ----
df['bin_truerating'] = df['truerating'] > 0
df['round_truerating'] = df['truerating'].round()
df['headline_word'] = df['sentence_num'] == 0

# ---- Merge in memorability scores ----
# Inner join: words without a prediction are dropped from the dataset.
df_predictions = pd.read_csv('CC_allwords_scores.csv')
df = df.merge(df_predictions, how='inner', left_on='tokenized_words', right_on='word')

# z-scores are computed over word occurrences (the merged rows), so a word that
# appears many times contributes many times to the mean and standard deviation.
df['recog_zscore'] = stats.zscore(df['recognition_predictions'])
df['recall_zscore'] = stats.zscore(df['recall_predictions'])

# ---- Select and rename output columns ----
df = df.rename(columns={'UniqNO':'articleID', 'recognition_predictions':'recog_score', 'recall_predictions':'recall_score'})
output_columns = [
    'word', 'articleID', 'truerating', 'bin_truerating', 'round_truerating',
    'sentence_num', 'headline_word',
    'recog_score', 'recog_zscore', 'recall_score', 'recall_zscore',
]
# reset_index returns a new frame, so the result has to be assigned; drop=True
# keeps the old index from being added as an extra column.
outdf = df[output_columns].reset_index(drop=True)

# The row index is written as the first (unnamed) column of the CSV.
outdf.to_csv('CCarticles_allwords_memorability_dataset.csv')

outdf

Unnamed: 0,word,articleID,truerating,bin_truerating,round_truerating,sentence_num,headline_word,recog_score,recog_zscore,recall_score,recall_zscore
0,another,1,-2.0,False,-2.0,0,True,0.885513,-0.689255,0.491141,0.519220
1,another,1,-2.0,False,-2.0,3,False,0.885513,-0.689255,0.491141,0.519220
2,another,2,-2.0,False,-2.0,12,False,0.885513,-0.689255,0.491141,0.519220
3,another,15,-2.0,False,-2.0,0,True,0.885513,-0.689255,0.491141,0.519220
4,another,18,-2.0,False,-2.0,14,False,0.885513,-0.689255,0.491141,0.519220
...,...,...,...,...,...,...,...,...,...,...,...
72609,fre_quent,129,-1.8,False,-2.0,13,False,0.912135,0.550954,0.503321,0.899193
72610,palm_oil,129,-1.8,False,-2.0,15,False,0.914344,0.653833,0.467906,-0.205611
72611,advice,129,-1.8,False,-2.0,20,False,0.883824,-0.767944,0.469666,-0.150698
72612,strive,129,-1.8,False,-2.0,20,False,0.905740,0.253022,0.453770,-0.646597
