<a href="https://colab.research.google.com/github/bella-ward/Stem-Away-project-files/blob/main/STEM_Data_Processing_and_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import nltk                                # Python library for NLP
import matplotlib.pyplot as plt            # library for visualization
import random                              # pseudo-random number generator
import pandas as pd
import numpy as np
from base64 import b64decode
from IPython.display import display, Javascript
from google.colab.output import eval_js

from nltk.corpus import twitter_samples    # sample Twitter dataset from NLTK


#Preprocess raw text for Sentiment analysis

In [3]:
# download the stopwords from NLTK
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

Tokenize string:

In [5]:
sample = "As a software person (not an engineer but a better than average understanding), I still don’t understand how this system \
works this well. GPT 4 to me seems to have a true understanding of things!!"

# instantiate tokenizer class
tokenizer = TweetTokenizer(preserve_case = False, strip_handles=True, reduce_len=True)

# tokenize tweets
sample_tokens = tokenizer.tokenize(sample)

print('String: ')
print(sample)
print()
print('Tokenized string: ')
print(sample_tokens)

String: 
As a software person (not an engineer but a better than average understanding), I still don’t understand how this system works this well. GPT 4 to me seems to have a true understanding of things!!

Tokenized string: 
['as', 'a', 'software', 'person', '(', 'not', 'an', 'engineer', 'but', 'a', 'better', 'than', 'average', 'understanding', ')', ',', 'i', 'still', 'don', '’', 't', 'understand', 'how', 'this', 'system', 'works', 'this', 'well', '.', 'gpt', '4', 'to', 'me', 'seems', 'to', 'have', 'a', 'true', 'understanding', 'of', 'things', '!', '!']


Remove stop words:

In [6]:
# import the english stop words list from nltk
stopwords_english = stopwords.words('english')

print("Stop words\n")
print(stopwords_english)

print('\nPunctuation\n')
print(string.punctuation)

Stop words

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so

remove stop words and punctuation:

In [7]:
print(sample_tokens)

sample_clean = []

for word in sample_tokens:
    if (word not in stopwords_english and
        word not in string.punctuation):
        sample_clean.append(word)

print('removed stop words and punctuation:')
print(sample_clean)

['as', 'a', 'software', 'person', '(', 'not', 'an', 'engineer', 'but', 'a', 'better', 'than', 'average', 'understanding', ')', ',', 'i', 'still', 'don', '’', 't', 'understand', 'how', 'this', 'system', 'works', 'this', 'well', '.', 'gpt', '4', 'to', 'me', 'seems', 'to', 'have', 'a', 'true', 'understanding', 'of', 'things', '!', '!']
removed stop words and punctuation:
['software', 'person', 'engineer', 'better', 'average', 'understanding', 'still', '’', 'understand', 'system', 'works', 'well', 'gpt', '4', 'seems', 'true', 'understanding', 'things']


Stemming ~ converting a word to its most general form, or stem. This helps in reducing the size of our vocabulary.


In [8]:
print()
print(sample_clean)

# instantiate stemming class
stemmer = PorterStemmer()

# create emply list to store the stems
sample_stem = []

for word in sample_clean:
    stem_word = stemmer.stem(word) # stemming word
    sample_stem.append(stem_word) # append to the list

print('stemmed words: ')
print(sample_stem)


['software', 'person', 'engineer', 'better', 'average', 'understanding', 'still', '’', 'understand', 'system', 'works', 'well', 'gpt', '4', 'seems', 'true', 'understanding', 'things']
stemmed words: 
['softwar', 'person', 'engin', 'better', 'averag', 'understand', 'still', '’', 'understand', 'system', 'work', 'well', 'gpt', '4', 'seem', 'true', 'understand', 'thing']


Process_sample()

In [9]:

# choose the same tweet
tweet = sample

def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')

    # remove pattern using re
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*','',tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize, lowercase, remove stopwords, and keep stemmer
    tokenizer = TweetTokenizer(preserve_case = False, strip_handles = True,
                               reduce_len = True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweet_clean = []

    for word in tweet_tokens:
        if (word not in stopwords_english and
           word not in string.punctuation):
            stemmed_word = stemmer.stem(word)
            tweet_clean.append(stemmed_word)

    return tweet_clean



print()
print(tweet)


# call the imported function
tweets_stem = process_tweet(tweet) # preprocess a given tweet

print('preprocessed tweet: ')
print(tweets_stem)


As a software person (not an engineer but a better than average understanding), I still don’t understand how this system works this well. GPT 4 to me seems to have a true understanding of things!!
preprocessed tweet: 
['softwar', 'person', 'engin', 'better', 'averag', 'understand', 'still', '’', 'understand', 'system', 'work', 'well', 'gpt', '4', 'seem', 'true', 'understand', 'thing']


# REDDIT DATA

In [15]:
from google.colab import files
uploaded = files.upload()

df = pd.read_csv('Copy of Reddit Data 8_23 - r-output.csv')
# This will print the first 5 rows of the DataFrame
print(df.head())

Saving Copy of Reddit Data 8_23 - r-output.csv to Copy of Reddit Data 8_23 - r-output (1).csv
   Index  Source                                                URL  \
0      1  Reddit  https://www.reddit.com/r/ChatGPT/comments/137v...   
1      2  Reddit  https://www.reddit.com/r/ChatGPT/comments/137v...   
2      3  Reddit  https://www.reddit.com/r/ChatGPT/comments/137v...   
3      4  Reddit  https://www.reddit.com/r/ChatGPT/comments/137v...   
4      5  Reddit  https://www.reddit.com/r/ChatGPT/comments/137v...   

                       Title Search term  \
0  General discussion thread     ChatGPT   
1  General discussion thread     ChatGPT   
2  General discussion thread     ChatGPT   
3  General discussion thread     ChatGPT   
4  General discussion thread     ChatGPT   

                                            Question   Question Timestamp  \
0  To discuss anything and everything related to ...  2023-05-04 11:32:08   
1  To discuss anything and everything related to ...  2023-0

In [16]:
article = df.loc[1]['Answer']


# instantiate tokenizer class
tokenizer = TweetTokenizer(preserve_case = False, strip_handles=True, reduce_len=True)

# tokenize tweets
summary_tokens = tokenizer.tokenize(article)

print()
print('Tokenized string: ')
print(summary_tokens)


Tokenized string: 
["i've", 'used', 'both', 'a', 'lot', ',', 'and', 'bard', 'really', "doesn't", 'hold', 'a', 'candle', '.', 'google', 'are', 'probably', 'in', 'panic', 'stations', 'right', 'now', 'because', 'i', 'work', 'on', 'macosx', 'for', 'my', 'job', 'and', 'i', 'have', 'done', 'something', 'i', 'would', 'have', 'previously', 'thought', 'to', 'be', 'complete', 'insanity', '(', 'installed', 'edge', 'on', 'a', 'mac', ')', '.', "i'm", 'hoping', 'a', 'serious', 'competitor', 'for', 'gpt', '-', '4', 'comes', 'along', 'soon', 'because', 'as', 'it', 'stands', "it's", 'just', 'better', 'than', 'google', 'for', 'essentially', 'everything', "you'd", 'search', 'for', '.', 'has', 'to', 'be', 'the', 'biggest', 'marketing', 'win', 'for', 'microsoft', 'in', 'a', 'decade', '.']


In [17]:
# import the english stop words list from nltk
stopwords_english = stopwords.words('english')

print("Stop words\n")
print(stopwords_english)

print('\nPunctuation\n')
print(string.punctuation)

##

print()
print('\033[92m')
print(summary_tokens)
print('\033[94m')

summary_clean = []

for word in summary_tokens:
    if (word not in stopwords_english and
        word not in string.punctuation):
        summary_clean.append(word)

print('removed stop words and punctuation:')
print(summary_clean)

Stop words

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so

In [18]:
print()
print('\033[92m')
print(summary_clean)
print('\033[94m')

# instantiate stemming class
stemmer = PorterStemmer()

# create emply list to store the stems
summary_stem = []

for word in summary_clean:
    stem_word = stemmer.stem(word) # stemming word
    summary_stem.append(stem_word) # append to the list

print('stemmed words: ')
print(summary_stem)


[92m
["i've", 'used', 'lot', 'bard', 'really', 'hold', 'candle', 'google', 'probably', 'panic', 'stations', 'right', 'work', 'macosx', 'job', 'done', 'something', 'would', 'previously', 'thought', 'complete', 'insanity', 'installed', 'edge', 'mac', "i'm", 'hoping', 'serious', 'competitor', 'gpt', '4', 'comes', 'along', 'soon', 'stands', 'better', 'google', 'essentially', 'everything', 'search', 'biggest', 'marketing', 'win', 'microsoft', 'decade']
[94m
stemmed words: 
["i'v", 'use', 'lot', 'bard', 'realli', 'hold', 'candl', 'googl', 'probabl', 'panic', 'station', 'right', 'work', 'macosx', 'job', 'done', 'someth', 'would', 'previous', 'thought', 'complet', 'insan', 'instal', 'edg', 'mac', "i'm", 'hope', 'seriou', 'competitor', 'gpt', '4', 'come', 'along', 'soon', 'stand', 'better', 'googl', 'essenti', 'everyth', 'search', 'biggest', 'market', 'win', 'microsoft', 'decad']


In [19]:
def process_article(summary):
    """Process tweet function.
    Input:
        article summary (or title or subtitle)
    Output:
        summary_clean: a list of words containing the processed summary

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')

    # remove hyperlinks
    summary = re.sub(r'https?:\/\/.*[\r\n]*','',summary)

    # remove hashtags
    # only removing the hash # sign from the word
    summary = re.sub(r'#','',summary)


    # tokenize, lowercase, remove stopwords, and keep stemmer
    tokenizer = TweetTokenizer(preserve_case = False, strip_handles = True,
                               reduce_len = True)
    summary_tokens = tokenizer.tokenize(summary)

    summary_clean = []

    for word in summary_tokens:
        if (word not in stopwords_english and
           word not in string.punctuation):
            stemmed_word = stemmer.stem(word)
            summary_clean.append(stemmed_word)

    return summary_clean

In [21]:
df_p = []
for i in range (len(df)):
  element = df.loc[i]
  if pd.notna(element['Title']):
    title = process_article(element['Title'])
    if pd.notna(element['Question']):
      question = process_article(element['Question'])
    else:
      question = ''
    if pd.notna(element['Answer']):
      answer = process_article(element['Answer'])
    else:
      answer = ''

    # print('title: ')
    # print(title)
    # print('question: ')
    # print(question)
    # print('answer: ')
    # print(answer)
    df_p.append({"id": i, "processed_text": question})
    df_p.append({"id": i, "processed_text": answer})
dfp = pd.DataFrame(df_p)

print(dfp.head())

   id                                     processed_text
0   0  [discuss, anyth, everyth, relat, chatgpt, open...
1   0  [seem, googl, bard, new, bing, chatgpt, bing, ...
2   1  [discuss, anyth, everyth, relat, chatgpt, open...
3   1  [i'v, use, lot, bard, realli, hold, candl, goo...
4   2  [discuss, anyth, everyth, relat, chatgpt, open...


In [22]:

df_p = []
for i in range(len(df)):
    element = df.loc[i]
    if pd.notna(element['Title']):
        title = process_article(element['Title'])
        if pd.notna(element['Question']):
            question = process_article(element['Question'])
        else:
            question = ''
        if pd.notna(element['Answer']):
            answer = process_article(element['Answer'])
        else:
            answer = ''

        df_p.append({"id": i, "processed_question": question, "processed_answer": answer})

dfp = pd.DataFrame(df_p)
print(dfp.head())

   id                                 processed_question  \
0   0  [discuss, anyth, everyth, relat, chatgpt, open...   
1   1  [discuss, anyth, everyth, relat, chatgpt, open...   
2   2  [discuss, anyth, everyth, relat, chatgpt, open...   
3   3  [discuss, anyth, everyth, relat, chatgpt, open...   
4   4  [discuss, anyth, everyth, relat, chatgpt, open...   

                                    processed_answer  
0  [seem, googl, bard, new, bing, chatgpt, bing, ...  
1  [i'v, use, lot, bard, realli, hold, candl, goo...  
2   [bard, block, countri, way, access, nonetheless]  
3  [liter, noth, ai, replac, job, begin, societi,...  
4  [’, go, slow, sudden, job, make, commerci, tv,...  


In [23]:
dfp.to_csv('reddit-processed-text.csv', index=False)
# ! cat r-output.csv

Sentiment analysis based on question:

In [24]:
from textblob import TextBlob

# Fill missing values with empty strings
df[['Title', 'Question', 'Answer']] = df[['Title', 'Question', 'Answer']].fillna('')

# Calculate sentiment polarity for 'Title', 'Question', and 'Answer'
df['Title Sentiment'] = df['Title'].apply(lambda x: TextBlob(x).sentiment.polarity)
df['Question Sentiment'] = df['Question'].apply(lambda x: TextBlob(x).sentiment.polarity)
df['Answer Sentiment'] = df['Answer'].apply(lambda x: TextBlob(x).sentiment.polarity)
question_sentiment_array_textblob = df['Question Sentiment'].to_numpy()
df[['Question', 'Question Sentiment']]


Unnamed: 0,Question,Question Sentiment
0,To discuss anything and everything related to ...,0.091667
1,To discuss anything and everything related to ...,0.091667
2,To discuss anything and everything related to ...,0.091667
3,To discuss anything and everything related to ...,0.091667
4,To discuss anything and everything related to ...,0.091667
...,...,...
2335,,0.000000
2336,,0.000000
2337,,0.000000
2338,,0.000000


In [25]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download the VADER lexicon
nltk.download('vader_lexicon')

# Initialize the VADER sentiment intensity analyzer
sia = SentimentIntensityAnalyzer()

# Fill missing values with empty strings
df[['Title', 'Question', 'Answer']] = df[['Title', 'Question', 'Answer']].fillna('')

# Calculate sentiment polarity for 'Title', 'Question', and 'Answer'
df['Question Sentiment'] = df['Question'].apply(lambda x: sia.polarity_scores(x)['compound'])
# Convert the pandas Series to a numpy array
question_sentiment_array_vader = df['Question Sentiment'].to_numpy()
print(df[['Question', 'Question Sentiment']])


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


                                               Question  Question Sentiment
0     To discuss anything and everything related to ...              0.7184
1     To discuss anything and everything related to ...              0.7184
2     To discuss anything and everything related to ...              0.7184
3     To discuss anything and everything related to ...              0.7184
4     To discuss anything and everything related to ...              0.7184
...                                                 ...                 ...
2335                                                                 0.0000
2336                                                                 0.0000
2337                                                                 0.0000
2338                                                                 0.0000
2339                                                                 0.0000

[2340 rows x 2 columns]


save CSV

In [26]:
df.to_csv('r-output.csv', index=False)
# ! cat r-output.csv


In [27]:
# Compute the difference between the two methods
difference = question_sentiment_array_textblob - question_sentiment_array_vader
# Compute the mean difference
mean_difference = np.mean(difference)

# Compute the absolute difference
absolute_difference = np.abs(difference)

# Compute the mean absolute difference
mean_absolute_difference = np.mean(absolute_difference)

print("Mean Difference:", mean_difference)
print("Mean Absolute Difference:", mean_absolute_difference)

Mean Difference: 0.0027770711743641944
Mean Absolute Difference: 0.10092488524528455


In [28]:
# Create a DataFrame with text and sentiment scores
df_sentiment = pd.DataFrame({
    'Text': df['Question'],
    'TextBlob': question_sentiment_array_textblob,
    'VADER': question_sentiment_array_vader
})

# Compute the difference and absolute difference
df_sentiment['Difference'] = df_sentiment['TextBlob'] - df_sentiment['VADER']
df_sentiment['Absolute Difference'] = np.abs(df_sentiment['Difference'])

# Sort by absolute difference in descending order
df_sentiment = df_sentiment.sort_values(by='Absolute Difference', ascending=False)

# Print the 5 entries with the highest absolute difference
print(df_sentiment.head(5))

                                                  Text  TextBlob   VADER  \
99   I was having a conversation about neural netwo... -0.047047 -0.9996   
111  I was having a conversation about neural netwo... -0.047047 -0.9996   
100  I was having a conversation about neural netwo... -0.047047 -0.9996   
101  I was having a conversation about neural netwo... -0.047047 -0.9996   
102  I was having a conversation about neural netwo... -0.047047 -0.9996   

     Difference  Absolute Difference  
99     0.952553             0.952553  
111    0.952553             0.952553  
100    0.952553             0.952553  
101    0.952553             0.952553  
102    0.952553             0.952553  


In [29]:
!pip install transformers
!pip install xformers

Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m56.6 MB/s[0m eta [36m0:00:0

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
plt.style.use('ggplot')

from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity


In [33]:
# Train a Word2Vec model
w2v_model = Word2Vec(dfp['processed_text'], min_count=1, vector_size=100, workers=4)

# Function to vectorize a sentence
def vectorize_sentence(sentence, model):
    vec = [model.wv[word] for word in sentence if word in model.wv]
    return np.mean(vec, axis=0) if vec else np.zeros(model.vector_size)

# Vectorize each article summary
dfp['vec'] = dfp['processed_text'].apply(lambda x: vectorize_sentence(x, w2v_model))

# Calculate the cosine similarity matrix
similarity_matrix = cosine_similarity(np.vstack(dfp['vec'].values))

# Construct a reverse map of indices and article titles
indices = pd.Series(dfp.index, index=dfp['id']).drop_duplicates()


print('w2v_model: ... ')
print(w2v_model)
print('dfp[vec]: ... ')
print(dfp['vec'])
print('similarity_matrix: ... ')
print(similarity_matrix)
print('indices: ... ')
print(indices)


KeyError: ignored

Reccommendation system from - Anya

In [34]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity



# Train a Word2Vec model
w2v_model = Word2Vec(dfp['processed_text'], min_count=1, vector_size=100, workers=4)

# Function to vectorize a sentence
def vectorize_sentence(sentence, model):
    vec = [model.wv[word] for word in sentence if word in model.wv]
    return np.mean(vec, axis=0) if vec else np.zeros(model.vector_size)

# Vectorize each article summary
dfp['vec'] = dfp['processed_text'].apply(lambda x: vectorize_sentence(x, w2v_model))

# Calculate the cosine similarity matrix
similarity_matrix = cosine_similarity(np.vstack(dfp['vec'].values))

# Construct a reverse map of indices and article titles
indices = pd.Series(dfp.index, index=dfp['id']).drop_duplicates()

# Define a recommendation function
def get_recommendations(id, cosine_sim=similarity_matrix):
    idx = indices[id]
    sim_scores = list(enumerate(cosine_sim[idx]))
    # sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Get scores of the 10 most similar articles
    article_indices = [i[0] for i in sim_scores]
    return dfp['id'].iloc[article_indices]
recommendations = get_recommendations(487)
print(dfp['processed_text'][487])
print(recommendations)
for recommendation in recommendations:
  print(dfp['processed_text'][recommendation])


KeyError: ignored