# CAC Project 1 (SNA + RS)

In [None]:
import pandas as pd

# Load the three pre-processed tables (members, recipes, reviews), in that order.
members, recipes, reviews = map(
    pd.read_csv,
    ('data/pp_members.csv', 'data/pp_recipes.csv', 'data/pp_reviews.csv'),
)

# Exploratory Data Analysis and Processing

In [None]:
# Display the members table as this cell's output.
members

In [None]:
# Display the recipes table as this cell's output.
recipes

In [None]:
# Display the reviews table as this cell's output.
reviews

In [None]:
import ast

def ing_process(x, ing_or_quant):
    """Extract one field from every entry of a serialized ingredients record.

    Parameters
    ----------
    x : str
        Python-literal text that evaluates to a mapping; the first value of
        that mapping is the sequence of entries to read.
    ing_or_quant : int
        Position to take from each entry. The notebook passes 0 to build
        ``ingredients_pp`` and 1 to build ``quantities_pp``.

    Returns
    -------
    list or None
        The selected field of every entry, or ``None`` (after printing the
        offending value) when ``x`` cannot be parsed or the parsed object has
        no first value.
    """
    # Catch only what a malformed literal can raise, so that Ctrl-C
    # (KeyboardInterrupt) and genuine programming errors are not swallowed.
    try:
        ing_list = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        print(x)
        return None

    # The parsed value may not be a mapping, or may be an empty one.
    try:
        entries = list(ing_list.values())[0]
    except (AttributeError, TypeError, IndexError):
        print(ing_list)
        return None

    return [entry[ing_or_quant] for entry in entries]

# Field 0 of every parsed entry becomes the 'ingredients_pp' column.
recipes['ingredients_pp'] = recipes['ingredients'].apply(lambda raw: ing_process(raw, 0))

In [None]:
# Only the last expression of a cell is rendered, so the first line below
# produces no visible output; the cell shows the entry stored under label 0.
recipes['ingredients_pp']
recipes['ingredients_pp'][0]

In [None]:
# Field 1 of every parsed entry becomes the 'quantities_pp' column; show it.
recipes['quantities_pp'] = recipes['ingredients'].apply(lambda raw: ing_process(raw, 1))
recipes['quantities_pp']

In [None]:
# Rows whose 'ingredients_pp' is None are the ones ing_process could not parse.
unparsed_rows = recipes['ingredients_pp'].apply(type) == type(None)

# Inspection expressions (not rendered: neither is the cell's last expression).
recipes['ingredients_pp'].apply(type).unique()
recipes[unparsed_rows]

# Remove the unparsed rows by index label.
recipes = recipes.drop(recipes[unparsed_rows].index)

In [None]:
import itertools 
from collections import defaultdict

# Create edges for recipes, based on ingredients in common as weight
def ing_freq_edge_weight(df,min_weight=0):
    """Build a weighted edge list linking rows of ``df`` that share ingredients.

    Each ingredient adds ``1 / f`` to the weight of every pair of rows that
    both list it, where ``f`` is the number of times the ingredient occurs in
    ``df['ingredients_pp']``. Frequent ingredients therefore contribute less.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``ingredients_pp`` column holding a list per row. Rows
        whose value is missing contribute no edges.
    min_weight : float, default 0
        Edges whose accumulated weight is below this value are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``from``, ``to``, ``weight``; one row per unordered pair of
        index labels, with ``from < to``.
    """
    long_df = df.explode('ingredients_pp')
    graph_structure = defaultdict(dict)

    for _, rows in long_df.groupby('ingredients_pp'):
        # After explode() the group size IS the ingredient's occurrence count,
        # so no separate row-by-row frequency pass over the frame is needed.
        weight = 1 / len(rows)

        # All unique pairs of rows containing this ingredient.
        for a, b in itertools.combinations(rows.index.unique(), 2):
            graph_structure[a][b] = graph_structure[a].get(b, 0) + weight
            graph_structure[b][a] = graph_structure[b].get(a, 0) + weight

    # The structure is symmetric; keep each pair once (a < b) and apply the threshold.
    index_pairs = [
        (a, b, weight)
        for a, neighbours in graph_structure.items()
        for b, weight in neighbours.items()
        if (a < b) and (weight >= min_weight)
    ]

    return pd.DataFrame(index_pairs, columns=['from', 'to','weight'])

In [None]:
pd.set_option('display.max_columns', 39)

# The 10000 recipes with the most ratings.
top_recipes = recipes.sort_values(by='number_of_ratings', ascending=False).head(10000)

# The 10000 most recently changed recipes.
top_recent_recipes = recipes.sort_values(by='last_changed_date', ascending=False).head(10000)

# Put the most-rated selection in chronological order and preview it.
top_recipes = top_recipes.sort_values(by='last_changed_date')
top_recipes.head()

## NLP Analysis (PLN)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# TF-IDF features, English stop words removed, vocabulary capped at 40000 terms.
small_count_vectorizer = TfidfVectorizer(stop_words='english', max_features=40000)

# Recipes that have a description.
pln_top_reviews = top_recipes.dropna(subset=['description'])
print(pln_top_reviews.shape)

# Drop rows without a parsed ingredient list BEFORE building any text from it:
# joining a missing list would raise, so filtering afterwards comes too late.
mixed_pln_top_reviews = pln_top_reviews.dropna(subset=['ingredients_pp']).copy()

# Description followed by the space-separated ingredients.
mixed_pln_top_reviews['description_ingredients'] = mixed_pln_top_reviews.apply(
    lambda row: row['description'] + ' ' + ' '.join(map(str, row['ingredients_pp'])), axis=1)

# Space-separated ingredients only; this is the text that gets modelled.
mixed_pln_top_reviews['ingredients_string'] = mixed_pln_top_reviews['ingredients_pp'].apply(
    lambda ingredients: ' '.join(map(str, ingredients)))

# Work on a copy so that re-indexing by date cannot leak back into the column
# object that pandas may cache for the frame.
small_text_sample = mixed_pln_top_reviews['ingredients_string'].copy()
small_text_sample.index = mixed_pln_top_reviews['last_changed_date']
len(small_text_sample)


In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
ps = PorterStemmer()
sw = set(stopwords.words('english'))

# Report any entry that is not a string before the text is cleaned.
non_string_items = small_text_sample.apply(lambda value: not isinstance(value, str))
non_string_indices = non_string_items[non_string_items].index
for index in non_string_indices:
    print(f"Index: {index}, Value: {small_text_sample[index]}")


def _clean_ingredient_text(text):
    """Keep letters only, lower-case, drop stop words and stem what remains."""
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()
    return ' '.join(ps.stem(token) for token in letters_only.split() if token not in sw)


small_text_sample = small_text_sample.apply(_clean_ingredient_text)
len(small_text_sample)


In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
# Number of topics requested from the topic model in later cells.
n_topics = 10

print(small_text_sample)

# Fit the vectorizer on the cleaned text and keep the document-term matrix.
small_document_term_matrix = small_count_vectorizer.fit_transform(small_text_sample)


In [None]:
from collections import Counter
def get_keys(topic_matrix):
    """Return, for each row of ``topic_matrix``, the column index of its largest entry."""
    dominant_columns = topic_matrix.argmax(axis=1)
    return dominant_columns.tolist()
def keys_to_counts(keys):
    """Count how often each key occurs.

    Prints the sorted ``(key, count)`` pairs and returns two parallel lists,
    ``(categories, counts)``, ordered by key.
    """
    count_pairs = sorted(Counter(keys).items())
    print(count_pairs)
    categories, counts = [], []
    for category, count in count_pairs:
        categories.append(category)
        counts.append(count)
    return (categories, counts)

In [None]:
import numpy as np
def get_top_n_words(n, keys, document_term_matrix, count_vectorizer, n_topics=None):
    '''
    Returns one string per topic id, holding the n highest-weighted words of
    the documents assigned to that topic, strongest first.

    Parameters
    ----------
    n : number of words per topic.
    keys : per-document topic id (non-negative integers), one per matrix row.
    document_term_matrix : documents x terms matrix (sparse or dense).
    count_vectorizer : fitted vectorizer exposing get_feature_names_out().
    n_topics : optional number of topics; defaults to ``max(keys) + 1``.

    The result is indexed by topic id: position t always describes topic t.
    A topic with no documents yields an empty string instead of shifting
    the entries of the topics after it.
    '''
    keys = np.asarray(keys)
    if n_topics is None:
        n_topics = int(keys.max()) + 1 if keys.size else 0

    # Build the vocabulary once rather than once per selected word.
    feature_names = count_vectorizer.get_feature_names_out()

    top_words = []
    for topic in range(n_topics):
        in_topic = keys == topic
        if not in_topic.any():
            top_words.append("")
            continue

        # Sum the term weights of all documents in this topic in one operation.
        term_totals = np.asarray(document_term_matrix[in_topic].sum(axis=0)).ravel()

        # Columns of the n largest totals, largest first.
        top_n_word_indices = np.argsort(term_totals)[-n:][::-1]
        top_words.append(" ".join(feature_names[index] for index in top_n_word_indices))

    return top_words

In [None]:
def get_mean_topic_vectors(keys, two_dim_vectors, num_topics=None):
    '''
    Returns a list with one centroid vector per topic id.

    Parameters
    ----------
    keys : per-document topic id, one per row of two_dim_vectors.
    two_dim_vectors : array-like of document coordinates, one row per document.
    num_topics : optional number of topics; when omitted, the notebook-level
        ``n_topics`` setting is used.

    Entry t is the mean of the rows whose key equals t. A topic with no
    documents yields a NaN-filled vector rather than raising, so the list
    always has num_topics entries.
    '''
    if num_topics is None:
        num_topics = n_topics

    keys = np.asarray(keys)
    vectors = np.asarray(two_dim_vectors)

    mean_topic_vectors = []
    for t in range(num_topics):
        articles_in_that_topic = vectors[keys == t]
        if len(articles_in_that_topic):
            mean_topic_vectors.append(articles_in_that_topic.mean(axis=0))
        else:
            # No documents in this topic: there is no centroid to compute.
            mean_topic_vectors.append(np.full(vectors.shape[1:], np.nan))
    return mean_topic_vectors

In [None]:
# Twenty hex colours, truncated to the first n_topics entries.
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78",
    "#2ca02c", "#98df8a", "#d62728", "#ff9896",
    "#9467bd", "#c5b0d5", "#8c564b", "#c49c94",
    "#e377c2", "#f7b6d2", "#7f7f7f", "#c7c7c7",
    "#bcbd22", "#dbdb8d", "#17becf", "#9edae5",
])[:n_topics]

In [None]:
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
# Configure bokeh to render its plots inside the notebook.
output_notebook()

In [None]:
# LDA with n_topics components, online learning and a fixed seed.
lda_model = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    random_state=0,
    verbose=0,
)

# Fit on the document-term matrix and keep the per-document topic matrix.
lda_topic_matrix = lda_model.fit_transform(small_document_term_matrix)

In [None]:
# Show the document-topic matrix, then derive and show the per-document topic
# assignment, the per-topic document counts and the topic ids.
print(lda_topic_matrix)
lda_keys = get_keys(lda_topic_matrix)
lda_categories, lda_counts = keys_to_counts(lda_keys)
for summary in (lda_keys, lda_counts, lda_categories):
    print(summary)


In [None]:
top_n_words_lda = get_top_n_words(10, lda_keys, small_document_term_matrix, small_count_vectorizer)

# Number the topics from 0 so the printed ids match the topic ids held in
# lda_keys / lda_categories and the 'Topic i' labels used on the charts.
for i, topic_words in enumerate(top_n_words_lda):
    print("Topic {}: ".format(i), topic_words)

In [None]:
import matplotlib.pyplot as plt

# Three strongest words per topic, used as tick labels.
top_3_words = get_top_n_words(3, lda_keys, small_document_term_matrix, small_count_vectorizer)
labels = ['Topic {}: \n'.format(i) + words for i, words in enumerate(top_3_words)]

# Bar chart of how many documents fall into each topic.
fig, ax = plt.subplots(figsize=(16,8))
ax.bar(lda_categories, lda_counts)
ax.set_xticks(lda_categories)
ax.set_xticklabels(labels)
ax.set_title('LDA topic counts')
# The documents being counted are recipes (ingredient strings), not headlines.
ax.set_ylabel('Number of recipes')

In [None]:
from sklearn.manifold import TSNE

# Project the document-topic matrix to two dimensions with t-SNE.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version.
tsne_settings = dict(
    n_components=2,
    perplexity=50,
    learning_rate=100,
    n_iter=2000,
    verbose=1,
    random_state=0,
    angle=0.75,
)
tsne_lda_model = TSNE(**tsne_settings)
tsne_lda_vectors = tsne_lda_model.fit_transform(lda_topic_matrix)

In [None]:
# Per-topic label text and the centroid of each topic in t-SNE space.
top_3_words_lda = get_top_n_words(3, lda_keys, small_document_term_matrix, small_count_vectorizer)
lda_mean_topic_vectors = get_mean_topic_vectors(lda_keys, tsne_lda_vectors)
print(len(lda_keys))
print(len(tsne_lda_vectors))
print(lda_mean_topic_vectors)

# Scatter every document, coloured by its assigned topic.
plot = figure(
    title="t-SNE Clustering of {} LDA Topics".format(n_topics),
    width=700,
    height=700,
)
plot.scatter(
    x=tsne_lda_vectors[:,0],
    y=tsne_lda_vectors[:,1],
    color=colormap[lda_keys],
)

# Write each topic's three top words at that topic's centroid.
for t in range(n_topics):
    centroid = lda_mean_topic_vectors[t]
    label = Label(x=centroid[0], y=centroid[1],
                  text=top_3_words_lda[t], text_color=colormap[t])
    plot.add_layout(label)

show(plot)

In [None]:
# lda_counts is parallel to lda_categories, so the position of the largest
# count must be mapped back to a topic id through lda_categories; the position
# itself is only the topic id when no topic is empty. Resolved once, not per row.
largest_topic = lda_categories[lda_counts.index(max(lda_counts))]

# Row positions of the documents assigned to the most populated topic.
largest_topic_recipes_ids = [i for i, key in enumerate(lda_keys) if key == largest_topic]

In [None]:
# Select, by row position, the 'new_recipe_id' of every recipe in the largest topic.
# NOTE(review): this rebinds `recipes` from the recipes DataFrame to a Series of
# ids. Cells above that use `recipes` as a DataFrame will fail if re-run after
# this one; a distinct name would need a matching change in the next cell.
recipes = mixed_pln_top_reviews.iloc[largest_topic_recipes_ids]['new_recipe_id']
print(recipes)

In [None]:
# Reviews whose recipe_id is among the ids currently held in `recipes`.
top_reviews_from_topic = reviews.loc[reviews['recipe_id'].isin(recipes)]
top_reviews_from_topic.head()

In [None]:
# All reviews written for a recipe that appears in top_recipes.
pln_top_reviews = reviews.loc[reviews['recipe_id'].isin(top_recipes['new_recipe_id'])]
# Optional filter, currently disabled: keep only reviews rated 1.0
#pln_top_reviews = pln_top_reviews[pln_top_reviews['rating'] == 1.0]
pln_top_reviews.head()


In [None]:
# Keep only the identifier columns, the review text and the rating.
pln_top_reviews = pln_top_reviews.loc[:, ['review_id','recipe_id','member_id','text','rating']]
pln_top_reviews.head()

In [None]:
# Rows with missing text (not rendered: this is not the cell's last expression).
pln_top_reviews.loc[pln_top_reviews['text'].isna()]
# Remove the rows that have no text.
pln_top_reviews = pln_top_reviews.dropna(subset=['text'])
# Distinct Python types left in the text column.
pln_top_reviews['text'].map(type).unique()

### Preprocessing Text

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
ps = PorterStemmer()
sw = set(stopwords.words('english'))


def _clean_review_text(text):
    """Keep letters only, lower-case, drop stop words and stem what remains."""
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()
    return ' '.join(ps.stem(token) for token in letters_only.split() if token not in sw)


# Overwrite the review text with its cleaned form.
pln_top_reviews['text'] = pln_top_reviews['text'].apply(_clean_review_text)

pln_top_reviews.head()

In [None]:
# Show (rows, columns) of the cleaned review table.
pln_top_reviews.shape

In [None]:
# get a wordcloud from the text column
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Stemmed words to leave out of the word cloud.
words_to_remove = ['use', 'recip', 'made', 'make', 'thank', 'love', 'good', 'ad', 'hand']

# Strip those words from the review text itself (the column is overwritten).
pln_top_reviews['text'] = pln_top_reviews['text'].apply(
    lambda text: ' '.join(token for token in text.split() if token not in words_to_remove)
)

# Build one word cloud from all reviews joined together and draw it without axes.
wordcloud = WordCloud().generate(" ".join(pln_top_reviews['text']))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigram bag-of-words counts over the cleaned review text.
vectorizer = CountVectorizer(ngram_range=(1,1))
X = vectorizer.fit_transform(pln_top_reviews['text'])

# NOTE(review): densifying allocates reviews x vocabulary cells -- confirm this
# fits in memory for the full review set.
X = X.toarray()

print(X.shape)

In [None]:
# Build the vocabulary once; rebuilding it for every column is quadratic in
# the vocabulary size.
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

# representation vector of row 1
print(X[1])

# terms whose count in row 1 is exactly 1
print([term for term, count in zip(feature_names, X[1]) if count == 1])

In [None]:
def _binarize_rating(rating):
    """Map a rating to 0 when it is below 4, otherwise to 1."""
    return 0 if rating < 4 else 1


# Binary target: 1 for ratings of 4 or more, 0 for anything lower.
y = pln_top_reviews['rating'].apply(_binarize_rating)

print(X.shape, y.shape)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a default SVC on the bag-of-words features.
clf = SVC()
scores = cross_val_score(clf, X, y, cv=10)

print(scores)
mean_accuracy, accuracy_std = scores.mean(), scores.std()
print("%0.2f accuracy with a standard deviation of %0.2f" % (mean_accuracy, accuracy_std))

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.20, random_state=0, stratify=y
)

# Shapes of each split, training first.
for split_features, split_target in ((X_train, y_train), (X_test, y_test)):
    print(split_features.shape, split_target.shape)

# Class balance of each split.
for heading, split_target in (
    ("\nLabel distribution in the training set:", y_train),
    ("\nLabel distribution in the test set:", y_test),
):
    print(heading)
    print(split_target.value_counts())

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Rebinds `clf` (previously the SVC used for cross-validation) to a multinomial
# naive Bayes classifier and fits it on the training split.
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split and print the raw prediction array.
y_pred = clf.predict(X_test)
print(y_pred)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Confusion matrix first, then accuracy, precision, recall and F1, one per line.
for metric in (confusion_matrix, accuracy_score, precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred))

In [None]:
import os
# Interactive cell: input() waits for the user, so an unattended
# "Run All" stops here until a review is typed.
rev = input("Enter review: ")

# Same cleaning as the training text: letters only, lower-case, no stop words, stemmed.
rev = re.sub('[^a-zA-Z]', ' ', rev).lower()
rev = ' '.join([ps.stem(w) for w in rev.split() if w not in sw])

# Encode with the vocabulary fitted on the review corpus.
V = vectorizer.transform([rev]).toarray()

print(rev)
print(V.shape)
print(V)

# Build the vocabulary once; rebuilding it for every column is quadratic in
# the vocabulary size. Lists the known terms that occur exactly once.
feature_names = vectorizer.get_feature_names_out()
print([term for term, count in zip(feature_names, V[0]) if count == 1])

# `clf` is whichever classifier was bound to that name last. Compare the single
# predicted label as a scalar rather than testing an array against a list.
if clf.predict(V)[0] == 1:
    print('positive review (+)')
else:
    print('negative review (-)')


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Create the VADER analyzer used to score reviews in the following cells.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# VADER is a lexicon/rule-based scorer that relies on unstemmed words,
# capitalisation and punctuation. pln_top_reviews['text'] has been reduced to
# letters, lower-cased, stemmed and stripped of words such as 'love' and
# 'good', so score the untouched text from `reviews` instead. The rows are
# selected by index label, which keeps them aligned with `y`.
# (reviews['text'] is not modified anywhere in the visible notebook.)
raw_review_text = reviews.loc[pln_top_reviews.index, 'text']

# Overwrites the classifier's y_pred: 1 when the compound score is positive, else 0.
y_pred = [
    1 if analyzer.polarity_scores(rev)['compound'] > 0 else 0
    for rev in raw_review_text
]

print(y_pred)

In [None]:
# Compare the rule-based predictions with the binary rating labels.
print(confusion_matrix(y, y_pred))
for caption, metric in (
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1: ', f1_score),
):
    print(caption, metric(y, y_pred))