In [11]:
import pandas as pd
import io
import requests
url="https://raw.githubusercontent.com/megagonlabs/HappyDB/master/happydb/data/cleaned_hm.csv"
s=requests.get(url).content
happyDB=pd.read_csv(io.StringIO(s.decode('utf-8')))
pd.set_option('display.max_columns', None)
happyDB.head()

#Here we are loading the dataset. The HappyDB is a publicly available dataset 
#containing text excerpts that describe moments of happiness shared by individuals. 
#It was created by collecting and anonymizing personal diary entries, social media posts, 
#and survey responses, providing insights into what makes people happy in various aspects of their lives, 
#such as relationships, activities, and experiences. Researchers and data analysts use this dataset to 
#study and understand the factors contributing to happiness.

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,affection
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,affection
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,exercise
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,affection


# Lexicon-based Analysis

In [33]:
relationship_lexicon = pd.read_csv("C:/Users/yashu/Downloads/people-dict.csv",header=None)
relationship_lexicon.columns=["Relationship"]

#We are analyzing the happy moments dataset using LEXICON based analysis. We are 
#using the lexicon, people_dict which contains names, identifiers like "Mom", "Dad",
#"best friend" and so on. Researchers and analysts can use the people dictionary to track mentions of 
#specific individuals in the text data. This can be valuable for understanding the social aspects of happiness, 
#including who is contributing to people's happiness and the nature of their relationships.
#Combining both dictionaries, researchers can analyze happy moments in the HappyDB dataset by 
#categorizing them into topics (using the topic dictionary) and identifying the individuals 
#involved (using the people dictionary). This dual approach allows for a more nuanced 
#understanding of the factors and relationships that contribute to happiness in people's lives.

Unnamed: 0,Relationship
0,aunt
1,auntie
2,aunties
3,aunts
4,aunty
...,...
234,act(or|ress)
235,lady
236,teacher
237,celebrity


In [36]:
pd.set_option('display.max_rows', None)
print(relationship_lexicon)

             Relationship
0                    aunt
1                  auntie
2                 aunties
3                   aunts
4                   aunty
5                  babies
6                    baby
7                     bae
8             best friend
9              bestfriend
10            bestfriends
11                 bestie
12                besties
13                     bf
14                    bff
15                   bffs
16                    boy
17              boyfriend
18                   boys
19                    bro
20                brother
21         brother-in-law
22               brothers
23                buddies
24                  buddy
25                  child
26               children
27           close friend
28              co-worker
29              colleague
30                 cousin
31                cousins
32               coworker
33                    dad
34                  daddy
35                   dads
36               daughter
37        da

In [29]:
#[1]Use the lexicon to find the top three social relationships mentioned in happy moments
# Initialize a dictionary to store the counts of social relationships
relationship_counts = {}

# Iterate through the "cleaned happy moment" column
for moment in happyDB['cleaned_hm']:
    # Convert the text to lowercase for case-insensitive matching
    moment = moment.lower()
    
    # Check for the presence of social relationship terms in the lexicon
    for relationship in relationship_lexicon['Relationship']:
        if relationship in moment:
            if relationship in relationship_counts:
                relationship_counts[relationship] += 1
            else:
                relationship_counts[relationship] = 1
                


{'someone': 784, 'son': 6274, 'his ex': 44, 'friend': 13419, 'friends': 4527, 'child': 1584, 'children': 632, 'grandchild': 72, 'grandchildren': 54, 'bro': 3147, 'brother': 1511, 'elder brother': 16, 'mom': 4282, 'moms': 29, 'teen': 123, 'neighbour': 46, 'man': 3410, 'girl': 2921, 'family': 4222, 'sister': 1756, 'sis': 2211, 'people': 1237, 'grandmother': 248, 'mother': 1901, 'husband': 2683, 'customer': 200, 'men': 7038, 'daughter': 3478, 'mama': 46, 'everyone': 454, 'kid': 1427, 'kids': 1098, 'them': 2156, 'childhood friend': 103, 'neighbor': 597, 'fiance': 394, 'fiancee': 131, 'colleague': 327, 'colleagues': 218, 'best friend': 958, 'partner': 357, 'close friend': 251, 'girlfriend': 1993, 'baby': 1189, 'women': 91, 'chick': 457, 'boy': 1697, 'boyfriend': 1265, 'daughters': 199, 'granddaughter': 148, 'granddaughters': 9, 'uncle': 419, 'parent': 1251, 'parents': 1116, 'siblings': 52, 'sibling': 77, 'dad': 936, 'professor': 92, 'ladies': 21, 'lad': 587, 'wife': 2716, 'ppl': 622, 'famil

In [30]:
#Here we are finding the top 3 listed social relationships in the dataset. 
#For that we are using the sorted() function from the Python lists where we 
#sort in the decsending order

# Sort the social relationships by count in descending order
sorted_relationships = sorted(relationship_counts.items(), key=lambda x: x[1], reverse=True)

# Select the top three social relationships
top_three_relationships = sorted_relationships[:3]

# Print the top three social relationships and their counts
for relationship, count in top_three_relationships:
    print(f"{relationship}: {count}")

friend: 13419
men: 7038
son: 6274


In [56]:
#[2]Use your world knowledge to assess the strengths and weaknesses of the “people dictionary” in 
#terms of answering the question “with whom do people spend happy moments?” You can define 
#“people”, “strength” and “weakness” in your own way here. If you think this dictionary is already perfect, 
#you can articulate your argument and skip task #3 instead.

#The strength of the dictionary is that it includes all kinds of social relationships
#including the slang terms used for these relationships like "bae", "bff", "kiddos", "grannies"
#However, according to me the weakness of the lexicon is that there are generic words also included such as
#"men", "guy", "someone", "people", "girl". These words do not indicate any kind of social relationship in a society.
#I believe that these words have to be removed from the lexicon to generate a revised lexicon. 

generic_words = ["someone","man","girl","people","men","everyone","them","women","boy","ladies","lad","ppl","guy","guys",
                 "person","woman","lady","boys","girls","somebody","everybody","adults"]

In [55]:
#[3]Modify the “people dictionary” to fix the weaknesses that you have identified. 
#Use the revised lexicon to redo task #1.

#Generic words from the list are deleted from the dictionary built in the previous step
#This dictionary is specific to the particular dataset. 

for word in generic_words:
    if word in relationship_counts:
        del relationship_counts[word]


# Sort the social relationships by count in descending order
sorted_relationships = sorted(relationship_counts.items(), key=lambda x: x[1], reverse=True)

# Select the top three social relationships
top_three_relationships = sorted_relationships[:3]

# Print the top three social relationships and their counts
for relationship, count in top_three_relationships:
    print(f"{relationship}: {count}")
    
#Here we can see that the output shows that happy moments are usually seen when a person is spending time
#with a friend, son or multiple number of friends. 

friend: 13419
son: 6274
friends: 4527


In [73]:
for r in top_three_relationships:
    print(r)

friend
son
friends


In [75]:
# Define the frequently mentioned social connections
frequent_social_connections = ["friend", "men", "son"]

import pandas as pd
import re
from collections import Counter
import nltk
nltk.download('stopwords')

# Create a counter to store context words and their frequencies
context_word_counter = Counter()

# Get the NLTK stop words list
stop_words = set(stopwords.words("english"))

# Extract context words around each social connection
for connection in frequent_social_connections:
    # Iterate through each row of the dataset
    for i, row in happyDB.iterrows():
        text = row["cleaned_hm"]
        # Find instances of the social connection using regular expressions
        instances = re.finditer(rf'\b{re.escape(connection)}\b', text, flags=re.IGNORECASE)
        for instance in instances:
            # Create a context window of five words before and after the instance
            start_i = max(instance.start() - 5, 0)
            end_i = min(instance.end() + 5, len(text))
            context = text[start_i:end_i]
            # Tokenize the context into words
            words = context.split()
            # Remove the social connection word and stop words from the context words
            words = [word for word in words if word.lower() != connection.lower() and word.lower() not in stop_words]
            # Update the context word counter
            context_word_counter.update(words)

# Sort context words by frequency in descending order
sorted_context_words = context_word_counter.most_common()

# Explore patterns in the 100 most frequent context words
top_100_context_words = sorted_context_words[:100]

# Print the list of 100 most frequent context words
for word, frequency in top_100_context_words:
    print(f"Context Word: {word}, Frequency: {frequency}")
    

#The output here indicates that "old","best","friend" are some of the context words 
#which occur when a person is happy. This indicates that whenever a person is having a 
#happy time with the best of his relationships then those are charectreized by these words


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yashu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Context Word: old, Frequency: 721
Context Word: best, Frequency: 692
Context Word: friend., Frequency: 664
Context Word: h, Frequency: 558
Context Word: th, Frequency: 485
Context Word: friend's, Frequency: 371
Context Word: got, Frequency: 265
Context Word: son's, Frequency: 245
Context Word: son., Frequency: 230
Context Word: g, Frequency: 225
Context Word: n, Frequency: 225
Context Word: good, Frequency: 208
Context Word: ha, Frequency: 206
Context Word: e, Frequency: 169
Context Word: came, Frequency: 164
Context Word: r, Frequency: 147
Context Word: om, Frequency: 146
Context Word: gave, Frequency: 140
Context Word: lose, Frequency: 139
Context Word: told, Frequency: 130
Context Word: friend,, Frequency: 116
Context Word: f, Frequency: 116
Context Word: bi, Frequency: 111
Context Word: hool, Frequency: 110
Context Word: afte, Frequency: 106
Context Word: new, Frequency: 101
Context Word: son,, Frequency: 96
Context Word: BEST, Frequency: 94
Context Word: k, Frequency: 91
Context W