In [2]:
import sys
import numpy

# Sanity check: show which interpreter the kernel runs on and which NumPy build it picks up.
print(sys.executable, numpy.__version__, sep="\n")

/opt/miniconda3/envs/social/bin/python
2.4.0


In [3]:
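# Environment setup -- run these once in a terminal, not inside the notebook:
# conda create -n social python=3.11 -y
# conda activate social
# conda install -c conda-forge -y numpy pandas pyarrow ipykernel
# Note: the NLP cells further down also import nltk, so it must be installed in the same
# environment as well (e.g. conda install -c conda-forge -y nltk).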

In [4]:
import numpy as np
import pandas as pd
import pyarrow as pa

# Record the versions of the core data libraries so the run can be reproduced later.
for label, module in (("numpy:", np), ("pandas:", pd), ("pyarrow:", pa)):
    print(label, module.__version__)


numpy: 2.4.0
pandas: 2.3.3
pyarrow: 19.0.0


Loading the dataset

In [5]:
# Path is relative to the notebook's working directory; change it here if the data lives elsewhere.
DATA_PATH = 'social_media_data.csv'
social_media_data = pd.read_csv(DATA_PATH)

# Fail fast with a readable message: the text-processing cells below read this column.
if 'text_content' not in social_media_data.columns:
    raise KeyError(
        f"{DATA_PATH!r} has no 'text_content' column; found: {list(social_media_data.columns)}"
    )

Data Processing

Lexical and Morphological Analysis

In [6]:
# Lexical analysis: identify and process the words (lexemes) in each post.
# Tokenization breaks the text into individual tokens (meaningful units such as words or phrases);
# part-of-speech tagging then labels every token with a coarse "universal" tag (NOUN, VERB, ...).
import nltk
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize

# NLTK resources used by the tokenizer and by the tagger with tagset='universal'.
nltk.download('universal_tagset')
nltk.download("punkt_tab")
nltk.download('averaged_perceptron_tagger_eng')


def tokenize_post(text):
    """Split one post into word tokens.

    Missing or non-string values (e.g. NaN) produce an empty list. Passing them
    through ``str()`` first, as this cell used to, turned NaN into the literal
    word "nan", which was then tagged and lemmatized like real text.
    """
    return word_tokenize(text) if isinstance(text, str) else []


# NOTE: this overwrites the raw column with its lower-cased form (kept as-is so that
# anything downstream that expects lower-cased text keeps working).
social_media_data['text_content'] = social_media_data['text_content'].str.lower()
tokens = social_media_data['text_content'].apply(tokenize_post)
social_media_data['tokens'] = tokens
tags = social_media_data['tokens'].apply(lambda x: pos_tag(x, tagset='universal'))
social_media_data['tags'] = tags
print(social_media_data['tags'])

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/yoitsal/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/yoitsal/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/yoitsal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


0        [(just, ADV), (tried, VERB), (the, DET), (chro...
1        [(just, ADV), (saw, VERB), (an, DET), (ad, NOU...
2        [(what, PRON), ('s, VERB), (your, PRON), (opin...
3        [(bummed, VERB), (out, PRT), (with, ADP), (my,...
4        [(just, ADV), (tried, VERB), (the, DET), (coro...
                               ...                        
11995    [(comparing, VERB), (toyota, NOUN), (camry, NO...
11996    [(my, PRON), (two, NUM), (days, NOUN), (review...
11997    [(just, ADV), (unboxed, ADJ), (my, PRON), (new...
11998    [(comparing, VERB), (toyota, NOUN), (camry, NO...
11999    [(just, ADV), (saw, VERB), (an, DET), (ad, NOU...
Name: tags, Length: 12000, dtype: object


In [None]:
# Morphological analysis - works on morphemes (the smallest meaningful units of a word).
# Stemming: reduces words to a root by heuristically chopping off endings; the result need not be a real word (studies -> studi).
# Lemmatization: converts words to their dictionary base form using the part of speech and linguistic knowledge (running -> run).
# In this case, we will use lemmatization for higher accuracy.

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

# Universal POS tag -> WordNet POS constant, passed to the lemmatizer as the word's part of speech.
universal_to_wordnet = {
    "NOUN": wordnet.NOUN,
    "VERB": wordnet.VERB,
    "ADV": wordnet.ADV,
    "ADJ": wordnet.ADJ,
}

# Tokens with any other tag are dropped; the kept tags are exactly the keys of the mapping.
keep_tags = set(universal_to_wordnet)

lemmatizer = WordNetLemmatizer()

def lemmatize_pos_tagged(tags):
    """Lemmatize the kept words of one POS-tagged post.

    Parameters
    ----------
    tags : iterable of (word, tag) pairs
        The tagged tokens of a single post.

    Returns
    -------
    list
        One lemma per pair whose tag is in ``keep_tags``, in the original
        order; pairs with any other tag are skipped.
    """
    result = []
    for token, pos in tags:
        if pos not in keep_tags:
            continue
        # Translate the tag to the WordNet POS, falling back to NOUN when unmapped.
        wordnet_pos = universal_to_wordnet.get(pos, wordnet.NOUN)
        result.append(lemmatizer.lemmatize(token, wordnet_pos))
    return result

# Read the tags from the DataFrame column instead of the loose `tags` variable: that Series
# was produced by applying a function to the 'tokens' column and still carries the name
# "tokens", so the lemma output ended up labelled "Name: tokens". Name the result explicitly.
lemmas = social_media_data['tags'].apply(lemmatize_pos_tagged).rename('lemmas')
print(lemmas)




[nltk_data] Downloading package wordnet to /Users/yoitsal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0        [just, try, chromebook, google, best, purchase...
1        [just, saw, ad, microsoft, surface, laptop, sp...
2        ['s, opinion, nike, epic, react, promo, food, ...
3        [bum, new, diet, pepsi, pepsi, disappoint, qua...
4        [just, try, corolla, toyota, absolutely, love,...
                               ...                        
11995    [compare, toyota, camry, competition, best, pu...
11996    [day, review, apple, airpods, pro, highly, rec...
11997    [just, unboxed, new, dri-fit, nike, best, purc...
11998    [compare, toyota, camry, competition, do, job,...
11999    [just, saw, ad, apple, imac, innovationx, have...
Name: tokens, Length: 12000, dtype: object
