# Gathering Data and Preprocessing

### 1. Gathering Data

In [None]:
import sys

# Directory that holds the project's `testdata` module.
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# running the notebook on another machine.
SRC_DIR = '/home/mca/Opinion-Mining-Project/src'
if SRC_DIR not in sys.path:  # re-running the cell must not stack duplicates
    sys.path.append(SRC_DIR)

from testdata import get_data

# Keep the object returned by get_data() as `df`; the bare `df` on the last
# line makes it the cell's displayed output.
df = get_data()
df



### 2. Save to a CSV File

In [None]:
# Write the current DataFrame to disk, then preview its first rows.
csv_path = '../../data/test_sample.csv'
df.to_csv(csv_path)
df.head()

### 3. Lowercasing

In [None]:
# Show what kind of object the 'text' column is.
print(type(df['text']))

# Lowercase every entry with the vectorised string accessor. Missing values
# are turned into '' and everything is cast to str first, so a NaN or a
# non-string entry no longer raises AttributeError.
df['clean_text'] = df['text'].fillna('').astype(str).str.lower()

# Side-by-side view of the original and the lowercased text.
df[['text', 'clean_text']]


### 4. Remove Punctuation and Symbols

In [None]:
import string

# The characters that are about to be stripped.
print(string.punctuation)

# Translation table mapping every character of string.punctuation to None,
# i.e. "delete this character".
mytable = str.maketrans(dict.fromkeys(string.punctuation))

# Run each entry of the column through the table.
df['clean_text'] = df['clean_text'].map(lambda entry: entry.translate(mytable))
df['clean_text']


### 5. Remove Stopwords

In [None]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK stopword corpus used below.
nltk.download('stopwords')

# Fetch the English list once, show it, and keep a set for fast lookups.
english_stopwords = stopwords.words('english')
print(english_stopwords)
stop_words = set(english_stopwords)

# Drop the stopwords but keep the surviving words separated by single spaces.
# Joining with '' would fuse them into one unreadable run of characters.
df['no_stopwords'] = df['clean_text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)
df

### 6. Remove Emojis

In [None]:
import emoji

# Replace every emoji in the stopword-free text with the empty string.
stopword_free = df['no_stopwords']
df['no_emoji'] = stopword_free.map(
    lambda entry: emoji.replace_emoji(entry, replace='')
)
df



### 7. Tokenization

In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from wordsegment import load, segment

# Fetch NLTK's 'punkt' resource (no custom download directory is passed).
nltk.download('punkt')

# wordsegment has to be initialised before segment() is called.
load()

# Normalise the column to plain strings: missing values become ''.
emoji_free = df['no_emoji'].fillna('').astype(str)
df['no_emoji'] = emoji_free
print(df['no_emoji'].head())  # quick look at the first few cleaned strings


def tokenizer(text):
    """Return the lowercase word tokens found in *text*."""
    return re.findall(r'\b\w+\b', text.lower())


# Let wordsegment split each entry into words (useful when upstream text has
# words run together), re-join them with spaces, then tokenize the result.
df['segmented'] = df['no_emoji'].map(lambda entry: ' '.join(segment(entry)))
df['tokenized'] = df['segmented'].map(tokenizer)
df


### 8. Lemmatization Function

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import nltk
# NLTK resources downloaded for the lemmatization helpers defined below.
for resource in ('averaged_perceptron_tagger_eng', 'wordnet', 'punkt_tab'):
    nltk.download(resource)
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Only the first letter of the tag is inspected: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps an empty tag on the default path.
    return by_initial.get(treebank_tag[:1], wordnet.NOUN)
def lemmatize_tokens(token_list, *, verbose=True):
    """Lemmatize a list of tokens using their part-of-speech tags.

    Each token is POS-tagged with ``pos_tag``, the tag is mapped to a WordNet
    category by ``get_wordnet_pos``, and the token is lemmatized with that
    category (the original notes give "bats" -> "bat", "are" -> "be",
    "hanging" -> "hang", "feet" -> "foot" as examples).

    Args:
        token_list: List of word tokens from one sentence.
        verbose: When True (the default, matching the previous behaviour),
            print the POS tags, the lemmatizer object, the original tokens and
            the lemmatized sentence. Pass ``verbose=False`` to keep the output
            quiet when applying the function to a whole DataFrame column.

    Returns:
        List of lemmas, one per input token, in the same order.
    """
    # List of (word, POS_tag) tuples, one per token.
    pos_tags = pos_tag(token_list)
    lemmatizer = WordNetLemmatizer()
    if verbose:
        print("POS tag : ", pos_tags)
        print(lemmatizer)

    lemmatized_sentence = [
        lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in pos_tags
    ]

    if verbose:
        print("Original:", token_list)
        print("Lemmatized:", " ".join(lemmatized_sentence))
    return lemmatized_sentence


### 9. Lemmatization

In [None]:
import pandas as pd

# Lemmatize each token list, then join the lemmas back into one
# space-separated string per row.
df['lemmatized'] = (
    df['tokenized']
    .apply(lemmatize_tokens)
    .apply(lambda tokens: " ".join(tokens))
)

# Save the result; this overwrites the file written in step 2 (same path).
df.to_csv('../../data/test_sample.csv')

# Only the last expression of a cell is rendered, so `df` goes at the very
# end to actually display the processed table.
df

### 10. Tokenization Function

In [None]:
import re
from wordsegment import load, segment

# Initialize wordsegment before segment() is used below.
# load() was already called in the Tokenization cell (step 7); it is repeated
# here, presumably so this cell can run on its own.
load()

def tokenize_sentence(sentence):
    """Split *sentence* into a list of lowercase word tokens.

    The text is first passed through wordsegment's ``segment`` so that merged
    words are separated (e.g. "amazingsupport" -> "amazing support"); the
    pieces are re-joined with spaces, lowercased, and every ``\\b\\w+\\b``
    match is returned as a token.
    """
    pieces = segment(sentence)
    rejoined = ' '.join(pieces).lower()
    return re.findall(r'\b\w+\b', rejoined)
# Sample multi-sentence review used to try out tokenize_sentence.
text = "I recently watched the new action movie and honestly, it was thrilling from start to finish. The plot kept me on the edge of my seat, and the acting was top-notch. However, I felt the soundtrack didn’t quite match the intensity of the scenes. The movie was released in May 2023 and runs for approximately 2 hours and 15 minutes. It grossed over $300 million worldwide in its first week. Many critics praised the director for his bold choices, while some audiences thought the pacing was too fast. Personally, I think the cinematography was stunning, especially in the desert scenes. According to IMDb, the film has a rating of 7.8 out of 10.The lead actor previously starred in a hit sci-fi franchise.In my opinion, this is his best performance yet."
# Last expression of the cell, so the resulting token list is displayed.
tokenize_sentence(text)