In [6]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
import pandas as pd

In [14]:
# Toy sentiment corpus: each entry pairs a movie review with its label
# (1 = positive, 0 = negative). Keeping text and label side by side makes it
# impossible for the two lists below to drift out of alignment.
labeled_reviews = [
    ("I love this movie", 1),
    ("This movie is terrible", 0),
    ("Amazing film", 1),
    ("Horrible acting", 0),
    ("Best movie ever", 1),
    ("Waste of time", 0),
    ("I hate wasting time", 0),
]

# Parallel lists consumed by the vectorizer and the classifier.
docs = [review for review, _ in labeled_reviews]
labels = [sentiment for _, sentiment in labeled_reviews]

In [15]:
# Bag-of-words features: one row per review, one column per vocabulary term.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)

# Vocabulary learned from the corpus; reused below as the column labels.
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

# Dense copy of the count matrix, wrapped in a DataFrame for inspection.
df = pd.DataFrame(data=X.toarray(), columns=feature_names)
df

['acting' 'amazing' 'best' 'ever' 'film' 'hate' 'horrible' 'is' 'love'
 'movie' 'of' 'terrible' 'this' 'time' 'waste' 'wasting']


Unnamed: 0,acting,amazing,best,ever,film,hate,horrible,is,love,movie,of,terrible,this,time,waste,wasting
0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0
1,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0
2,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0
6,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1


In [16]:
# Split into training and test sets (fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42
)

# Train Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate on the held-out split so the test set is actually used.
# `labels=` keeps both classes in the report even if the tiny test split
# contains only one of them; `zero_division=0` silences the warning for a
# class that is never predicted. With so few samples the numbers are only
# illustrative.
y_pred = model.predict(X_test)
print(
    classification_report(
        y_test,
        y_pred,
        labels=[0, 1],
        target_names=["Negative", "Positive"],
        zero_division=0,
    )
)

In [17]:
# Classify an unseen review, encoding it with the already-fitted vectorizer
new_review = ["I hate this film"]
new_review_features = vectorizer.transform(new_review)
prediction = model.predict(new_review_features)

# Map the numeric class back to a readable label (1 = positive, otherwise negative)
sentiment = 'Positive' if prediction[0] == 1 else 'Negative'
print(f"Prediction for '{new_review[0]}': {sentiment}")

Prediction for 'I hate this film': Negative


**Using the Hugging Face Transformers Library**

In [18]:
from transformers import pipeline

In [19]:
# Build a summarization pipeline backed by the facebook/bart-large-cnn
# checkpoint (its files are downloaded when the pipeline is first created).
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [20]:
# Input passage for the summarizer. Note: the name `text` is rebound later by
# the OCR cell, so re-run this cell before re-running the summarization cell.
text = """
Natural Language Processing is a field of artificial intelligence
that enables computers to understand, interpret, and generate human
language. It combines linguistics, computer science, and machine
learning techniques to bridge the gap between human communication
and computer understanding. NLP powers applications like chatbots,
translation services, sentiment analysis tools, and speech recognition
systems that we use every day. As deep learning advances, NLP models
are becoming increasingly sophisticated at understanding context,
nuance, and even humor in human language.
"""

In [24]:
# Length bounds and sampling switch passed through to the pipeline;
# sampling is disabled so repeated runs produce the same summary.
generation_args = {"max_length": 50, "min_length": 30, "do_sample": False}

# The pipeline returns a list with one result per input; take the only one.
response = summarizer(text, **generation_args)[0]
print(response['summary_text'])

Natural Language Processing is a field of artificial intelligence that enables computers to understand, interpret, and generate humanlanguage. NLP powers applications like chatbots, translation services, sentiment analysis tools, and speech recognition systems.


**OCR**

In [26]:
!pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [28]:
import pytesseract
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np

In [30]:
# Open the image whose text will be extracted.
# The path points at a Colab upload location; change it to your own image.
image_path = "/content/testocr.png"
img = Image.open(fp=image_path)

In [31]:
# Run OCR on the loaded image.
# NOTE: this rebinds `text`, which earlier held the summarization input.
text = pytesseract.image_to_string(image=img)

In [32]:
# Show the OCR output beneath a heading (heading and text on separate lines)
print("Extracted Text:", text, sep="\n")

Extracted Text:
This is a lot of 12 point text to test the
ocr code and see if it works on all types
of file format.

The quick brown dog jumped over the
lazy fox. The quick brown dog jumped
over the lazy fox. The quick brown dog
jumped over the lazy fox. The quick
brown dog jumped over the lazy fox.



In [33]:
# Optional: pre-processing for better OCR results
def preprocess_image(image, threshold=150):
    """Return a black-and-white (binarized) copy of ``image`` for OCR.

    The image is converted to grayscale (mode ``'L'``); every pixel strictly
    brighter than ``threshold`` becomes white (255) and every other pixel
    becomes black (0).

    Args:
        image: A PIL image (any object exposing ``convert('L')``).
        threshold: Grayscale cut-off. Pixels with a value greater than this
            become white. Defaults to 150; adjust it per image as needed.

    Returns:
        A new image created with ``Image.fromarray`` from a ``uint8`` array
        containing only the values 0 and 255.
    """
    # Convert to grayscale
    gray = image.convert('L')
    # Binarization (thresholding): boolean mask of the "bright" pixels
    bright = np.asarray(gray) > threshold
    # Scale the mask to 0/255 so it can be stored as an 8-bit image
    return Image.fromarray(bright.astype(np.uint8) * 255)

# Binarize the image first, then run OCR on the cleaned-up copy
preprocessed = preprocess_image(img)
improved_text = pytesseract.image_to_string(preprocessed)

# Blank line, heading, then the OCR result, each on its own line
print("\nText after preprocessing:", improved_text, sep="\n")


Text after preprocessing:
This is a lot of 12 point text to test the
ocr code and see if it works on all types
of file format.

The quick brown dog jumped over the
lazy fox. The quick brown dog jumped
over the lazy fox. The quick brown dog
jumped over the lazy fox. The quick
brown dog jumped over the lazy fox.

