In [8]:
!pip install gensim nltk scikit-learn --quiet

import nltk

nltk.download('punkt')
nltk.download('stopwords')

nltk.download('all')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_da

True

In [9]:
!pip install gensim nltk scikit-learn

import nltk
nltk.download('punkt')
nltk.download('stopwords')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
import gensim.downloader as api
import re


df = pd.read_csv("/content/spam.csv", encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'message']
df['label'] = df['label'].map({'ham': 0, 'spam': 1})


def preprocess(text):
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w.isalpha() and w not in stop_words]
    return tokens

df['tokens'] = df['message'].apply(preprocess)

print("Downloading word2vec-google-news-300 model (takes ~2 minutes)...")
w2v_model = api.load("word2vec-google-news-300")
print("Word2Vec model loaded!")


def get_vector(tokens):
    vectors = [w2v_model[word] for word in tokens if word in w2v_model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(300)

df['vector'] = df['tokens'].apply(get_vector)


X = np.vstack(df['vector'].values)
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

def predict_message_class(model, w2v_model, message):
    tokens = preprocess(message)
    vec = get_vector(tokens)
    pred = model.predict([vec])[0]
    return 'spam' if pred == 1 else 'ham'

example = "Congratulations! You've won a free ticket. Call now!"
print("Prediction:", predict_message_class(clf, w2v_model, example))




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].map({'ham': 0, 'spam': 1})


Downloading word2vec-google-news-300 model (takes ~2 minutes)...
Word2Vec model loaded!
Test Accuracy: 0.9417040358744395
Prediction: spam


In [11]:
!pip install gensim nltk scikit-learn contractions --quiet

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import pandas as pd
import numpy as np
import re
import string
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import gensim.downloader as api


df = pd.read_csv("/content/Tweets.csv")[['airline_sentiment', 'text']]

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = contractions.fix(text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words]
    return tokens

df['tokens'] = df['text'].apply(clean_text)

def get_vector(tokens):
    vectors = [w2v_model[word] for word in tokens if word in w2v_model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(300)

df['vector'] = df['tokens'].apply(get_vector)

X = np.vstack(df['vector'].values)
y = df['airline_sentiment'].map({'positive': 0, 'neutral': 1, 'negative': 2})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['positive', 'neutral', 'negative']))

def predict_tweet_sentiment(model, w2v_model, tweet):
    tokens = clean_text(tweet)
    vec = get_vector(tokens)
    pred = model.predict([vec])[0]
    return {0: 'positive', 1: 'neutral', 2: 'negative'}[pred]

example = "The flight was delayed and nobody told us anything. Terrible service!"
print("Prediction:", predict_tweet_sentiment(clf, w2v_model, example))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Accuracy: 0.7721994535519126

Classification Report:
               precision    recall  f1-score   support

    positive       0.76      0.63      0.69       459
     neutral       0.57      0.42      0.48       580
    negative       0.81      0.92      0.86      1889

    accuracy                           0.77      2928
   macro avg       0.72      0.65      0.68      2928
weighted avg       0.76      0.77      0.76      2928

Prediction: negative
