<a href="https://colab.research.google.com/github/Zdracox/Demo/blob/main/Text_Mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Text Analysis

In [None]:
!pip install nltk

In [None]:
import nltk

# Download only the resources this notebook uses rather than the whole 'all'
# collection, which is far larger than needed and slows every fresh run.
# Add an entry here if a new cell needs another corpus or model.
NLTK_RESOURCES = [
    "punkt",          # tokenizer models for word_tokenize / sent_tokenize
    "punkt_tab",      # tokenizer tables used by newer NLTK releases
    "stopwords",      # stop-word lists
    "wordnet",        # data behind WordNetLemmatizer
    "omw-1.4",        # multilingual WordNet data that some NLTK versions load with wordnet
    "vader_lexicon",  # lexicon behind SentimentIntensityAnalyzer
]
for resource in NLTK_RESOURCES:
    nltk.download(resource)

In [None]:
# Tokenization: split a sentence into word and punctuation tokens with NLTK.

text = "This is a Big Data course in CCTB."
# `tokens` is reused by later cells (counting, stop-word filtering, stemming).
tokens = nltk.word_tokenize(text)
print(tokens)

['This', 'is', 'a', 'Big', 'Data', 'course', 'in', 'CCTB', '.']


In [None]:
# Sentence tokenization: split a paragraph into its individual sentences.
paragraph_text = "This is a Big Data course in CCTB. This is our second semester. We are loving it."
sent_tokens = nltk.sent_tokenize(paragraph_text)
print(sent_tokens)

['This is a Big Data course in CCTB.', 'This is our second semester.', 'We are loving it.']


In [None]:
# Count how many times each token occurs.
from collections import Counter

# Start from an empty tally and feed it the tokens from the tokenization cell.
word_counts = Counter()
word_counts.update(tokens)
print(word_counts)

Counter({'This': 1, 'is': 1, 'a': 1, 'Big': 1, 'Data': 1, 'course': 1, 'in': 1, 'CCTB': 1, '.': 1})


In [None]:
# Stop words: very common words ("the", "an", "or", etc.) that are dropped before analysis.

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
# Compare in lower case so capitalized tokens such as "This" still match the list.
filtered_tokens = list(filter(lambda token: token.lower() not in stop_words, tokens))
print(filtered_tokens)

['Big', 'Data', 'course', 'CCTB', '.']


In [None]:
# Show NLTK's full English stop-word list, the one used by the filter above.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [None]:
# Stemming and lemmatizing: reduce each token to a root form.

from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Stemming trims suffixes by rule, so a stem need not be a real word.
stemmed_words = list(map(stemmer.stem, tokens))
print(stemmed_words)

# Lemmatizing with no part-of-speech argument leaves these tokens unchanged.
lemmatized_words = list(map(lemmatizer.lemmatize, tokens))
print(lemmatized_words)

['thi', 'is', 'a', 'big', 'data', 'cours', 'in', 'cctb', '.']
['This', 'is', 'a', 'Big', 'Data', 'course', 'in', 'CCTB', '.']


In [None]:
# Run the same word through the stemmer, then the lemmatizer, to compare the results.
for normalize in (stemmer.stem, lemmatizer.lemmatize):
    print(normalize('ammunition'))

ammunit
ammunition


In [None]:
# Sentiment analysis with NLTK's SentimentIntensityAnalyzer (positive example).

from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

text = "I love this course! This is very exciting and amazing!"
# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' entries.
sentiment_scores = sia.polarity_scores(text)
print(sentiment_scores)

{'neg': 0.0, 'neu': 0.327, 'pos': 0.673, 'compound': 0.9237}


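The `compound` score is read as follows in this notebook:\
  above 0.5 - positive\
  below -0.5 - negative\
  -0.5 to 0.5 - neutral

(VADER's own documentation suggests ±0.05 as the usual cut-offs; ±0.5 is a stricter choice.)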

In [None]:
# Sentiment analysis of a negative sentence, for comparison with the positive example.
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

text = "I Hate this course! Its very hard and confusing"
# A strongly negative 'compound' value is expected for this sentence.
sentiment_scores = sia.polarity_scores(text)
print(sentiment_scores)

{'neg': 0.611, 'neu': 0.389, 'pos': 0.0, 'compound': -0.7813}


In [None]:
# Text classification: word-count features fed to a Naive Bayes classifier.

import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Labelled training sentences as (text, label) pairs.
documents = [
    ("I love this course", "positive"),
    ("I hate this program", "negative"),
    ("This was an awesome movie", "positive"),
    ("The course was terrible", "negative")
]

# Learn the vocabulary from the sentences and turn them into word-count vectors.
vectorizer = CountVectorizer()
features = vectorizer.fit_transform([sentence for sentence, _ in documents])
labels = [label for _, label in documents]

# Fit the Naive Bayes model; fit() returns the fitted estimator itself.
classifier = MultinomialNB().fit(features, labels)

# Classify an unseen sentence. transform() reuses the vocabulary learned above,
# so words that never appeared in training are ignored.
new_example = vectorizer.transform(["I really enjoyed watching this film"])
prediction = classifier.predict(new_example)
print(prediction)

['positive']
