In [1]:
!pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.6.0-cp313-cp313-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Downloading multiprocess-0.70.18-py313-none-any.whl.metadata (7.2 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.13.3-cp313-cp313-win_amd64.whl.metadata (8.4 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.4.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading aiosignal-1.4.0-py3-none-any.whl.metadata (3.7 kB)
Collecting frozenlist>=1.1.1 (from aiohttp!=4.0.0a0,!=4.0.0a1->fs


[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 1️ Sample Hindi Dataset
data = {
    "text": [
        "यह फिल्म बहुत अच्छी है",
        "मुझे यह मोबाइल पसंद नहीं आया",
        "सेवा बहुत खराब थी",
        "आज मौसम अच्छा है",
        "यह ठीक है",
        "बहुत बेकार अनुभव",
        "मुझे यह पसंद आया",
        "यह शानदार है",
        "यह उत्पाद खराब है",
        "मैं बहुत खुश हूँ",
        "यह औसत है",
        "मुझे यह पसंद नहीं है"
    ],
    "label": [
        "Positive",
        "Negative",
        "Negative",
        "Positive",
        "Neutral",
        "Negative",
        "Positive",
        "Positive",
        "Negative",
        "Positive",
        "Neutral",
        "Negative"
    ]
}


df = pd.DataFrame(data)

# 2️ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.3,
    random_state=42,
    stratify=df["label"]   # IMPORTANT FIX
)

# 3️ TF-IDF Vectorization
vectorizer = TfidfVectorizer(
    ngram_range=(1,2),      # unigrams + bigrams
    min_df=1
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 4️ Logistic Regression Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# 5️ Predictions
y_pred = model.predict(X_test_tfidf)

# 7️ Test Custom Sentence
new_text = ["यह मोबाइल बहुत अच्छा है"]
new_text_tfidf = vectorizer.transform(new_text)
prediction = model.predict(new_text_tfidf)

print("\nSentiment Prediction:", prediction[0])



Sentiment Prediction: Positive


In [3]:
test_sentences = [
    "यह मोबाइल बहुत शानदार है",
    "यह बहुत खराब है",
    "यह औसत उत्पाद है"
]

test_tfidf = vectorizer.transform(test_sentences)
predictions = model.predict(test_tfidf)

for text, pred in zip(test_sentences, predictions):
    print(f"Text: {text}")
    print(f"Predicted Sentiment: {pred}")
    print("------")


Text: यह मोबाइल बहुत शानदार है
Predicted Sentiment: Positive
------
Text: यह बहुत खराब है
Predicted Sentiment: Negative
------
Text: यह औसत उत्पाद है
Predicted Sentiment: Positive
------
