In [2]:
import pandas as pd
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
!pip install scikit-learn




In [3]:
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/wordnet')
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    print("Downloading necessary NLTK data...")
    nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'omw-1.4', 'averaged_perceptron_tagger_eng'], quiet=True)
    print("NLTK data download complete.")

Downloading necessary NLTK data...
NLTK data download complete.


[nltk_data] <urlopen error [Errno 11001] getaddrinfo failed>


In [4]:
tweet_tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()

In [5]:
def load_data(data_path='twitter_training.csv', text_column='text'):
    """Loads the dataset and prepares the DataFrame."""
    try:
        df = pd.read_csv(data_path, header=None, encoding='latin1')
        df.columns = ['ID', 'Entity', 'sentiment', text_column]
        df = df.dropna(subset=[text_column]).reset_index(drop=True)
        return df
    except FileNotFoundError:
        print(f"Error: Data file not found at {data_path}.")
        return None


In [6]:
def get_wordnet_pos(tag):
    """Map NLTK POS tags to WordNet POS tags"""
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [7]:
def extract_features(text):
    hashtags = re.findall(r'#(\w+)', text)
    mentions = re.findall(r'@(\w+)', text)
    return ' '.join(hashtags), ' '.join(mentions)

In [None]:
def clean_and_tokenize(text):
    if pd.isna(text) or not isinstance(text, str):
        return []

    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    tokens = tweet_tokenizer.tokenize(text)

    return tokens

In [9]:
def lemmatize_tokens(tokens):
    """Lemmatizes tokens using POS tagging for context-aware normalization."""
    tagged_tokens = nltk.pos_tag(tokens)

    lemmatized_tokens = []
    for word, tag in tagged_tokens:
        wntag = get_wordnet_pos(tag)
        if isinstance(word, str):
            lemma = lemmatizer.lemmatize(word, pos=wntag)
            lemmatized_tokens.append(lemma)

    return lemmatized_tokens

In [10]:
def preprocess_data(df, text_column='text'):
    df[text_column] = df[text_column].fillna('')

    df['hashtags'], df['mentions'] = zip(*df[text_column].apply(extract_features))

    df['tokens'] = df[text_column].apply(clean_and_tokenize)

    df['lemmas'] = df['tokens'].apply(lemmatize_tokens)

    df['processed_text'] = df['lemmas'].apply(lambda x: ' '.join(x))

    return df


In [11]:
def vectorize_data(df, text_column='processed_text'):

    tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True)

    tfidf_matrix = tfidf_vectorizer.fit_transform(df[text_column])

    feature_names = tfidf_vectorizer.get_feature_names_out()
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

    return tfidf_df, tfidf_vectorizer

In [12]:
TEXT_COLUMN = 'text'

print("--- 1. Loading Data ---")
df = load_data(text_column=TEXT_COLUMN)

if df is None:
    print("Exiting script due to data loading error.")
else:
    df_sample = df.head(100).copy()

    print(f"Successfully loaded {len(df)} rows. Processing a sample of {len(df_sample)} rows.")
    print("\n--- Original Data Sample (First 5 rows) ---")
    print(df_sample[[TEXT_COLUMN]].head().to_markdown(index=False))

    print("\n--- 2. Applying Preprocessing Pipeline ---")
    processed_df = preprocess_data(df_sample, text_column=TEXT_COLUMN)

    print("\n--- 3. Processed Data Sample (First 5 rows) ---")
    print(processed_df[[TEXT_COLUMN, 'hashtags', 'mentions', 'processed_text']].head().to_markdown(index=False))

    print("\n--- 4. Applying TF-IDF Vectorization (sublinear_tf=True) ---")
    tfidf_df, vectorizer = vectorize_data(processed_df)

    print(f"\nTotal features (vocabulary size): {len(vectorizer.get_feature_names_out())}")

    print("\n--- 5. TF-IDF Vectorization Sample (First 5 rows, first 10 features) ---")

    feature_names = vectorizer.get_feature_names_out()[:10]
    tfidf_sample_output = tfidf_df.iloc[:5, :10]
    tfidf_sample_output.columns = feature_names
    tfidf_sample_output.index = [f"Doc {i+1}" for i in range(5)]

    print(tfidf_sample_output.to_markdown())

    print("\n--- Script Execution Complete ---")

--- 1. Loading Data ---
Successfully loaded 73996 rows. Processing a sample of 100 rows.

--- Original Data Sample (First 5 rows) ---
| text                                                      |
|:----------------------------------------------------------|
| im getting on borderlands and i will murder you all ,     |
| I am coming to the borders and I will kill you all,       |
| im getting on borderlands and i will kill you all,        |
| im coming on borderlands and i will murder you all,       |
| im getting on borderlands 2 and i will murder you me all, |

--- 2. Applying Preprocessing Pipeline ---

--- 3. Processed Data Sample (First 5 rows) ---
| text                                                      | hashtags   | mentions   | processed_text                                      |
|:----------------------------------------------------------|:-----------|:-----------|:----------------------------------------------------|
| im getting on borderlands and i will murder you all ,

In [13]:
import os
os.getcwd()

'c:\\Users\\SIgma\\OneDrive\\Documents\\GitHub\\Team-Route\\notebooks'

In [117]:
df=load_data(r"c:\Users\SIgma\OneDrive\Documents\GitHub\Team-Route\notebooks\twitter_training.csv")
df=preprocess_data(df, text_column='text')


In [119]:
display(df.head())


Unnamed: 0,text,sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,i am coming to the borders and i will kill you...,Positive
2,im getting on borderlands and i will kill you all,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands and i will murder y...,Positive


In [120]:
from sklearn.preprocessing import LabelEncoder

In [121]:
le=LabelEncoder()

display(df['sentiment'].unique())
df['sentiment']=le.fit_transform(df['sentiment'])

display(df.head())
df.info()
display(df['sentiment'].value_counts())

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

Unnamed: 0,text,sentiment
0,im getting on borderlands and i will murder yo...,3
1,i am coming to the borders and i will kill you...,3
2,im getting on borderlands and i will kill you all,3
3,im coming on borderlands and i will murder you...,3
4,im getting on borderlands and i will murder y...,3


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73996 entries, 0 to 73995
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       73996 non-null  object
 1   sentiment  73996 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


sentiment
1    22358
3    20655
2    18108
0    12875
Name: count, dtype: int64

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'label': 'positive', 'score': 0.9660703539848328}]
[{'label': 'negative', 'score': 0.8278113603591919}]
accuracy DistilBERT (RoBERTa-twitter) المطور لـ4 فئات: 0.0000


In [None]:
# =--------- DistilBERT ---------=
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score
import numpy as np

model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4, ignore_mismatched_sizes=True)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=128)


train_df = df.sample(frac=0.1, random_state=42)  
val_df = df.sample(frac=0.02, random_state=99)   

train_dataset = Dataset.from_pandas(train_df[['text', 'sentiment']])
val_dataset = Dataset.from_pandas(val_df[['text', 'sentiment']])

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.rename_column("sentiment", "labels")
val_dataset = val_dataset.rename_column("sentiment", "labels")

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy_score(labels, preds)}

training_args = TrainingArguments(
    output_dir="./fast_model",
    num_train_epochs=2,                  
    per_device_train_batch_size=32,       
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=20,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("جاري التدريب السريع... هيخلّص في أقل من دقيقة!")
trainer.train()

results = trainer.evaluate()
print(f"الدقة السريعة: {results['eval_accuracy']:.4f}")  


trainer.save_model("my_fast_model")
tokenizer.save_pretrained("my_fast_model")
print("خلّص وحفظ النموذج!")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Map:   0%|          | 0/7400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1480 [00:00<?, ? examples/s]

  trainer = Trainer(


جاري التدريب السريع... هيخلّص في أقل من دقيقة!




Epoch,Training Loss,Validation Loss


In [None]:
# ---------------------------------------------------------------------------------------- LSTM -----------------------------------------------
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['text'])
X = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(X, maxlen=50) 

y = pd.get_dummies(df['sentiment']).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
display(y)
display(X)

model_lstm = Sequential()
model_lstm.add(Embedding(5000, 128, input_length=50))
model_lstm.add(LSTM(128, dropout=0.5))
model_lstm.add(Dense(4, activation='softmax')) # ['Positive', 'Neutral', 'Negative', 'Irrelevant'],

model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



model_lstm.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=2)

acc_lstm = model_lstm.evaluate(X_test, y_test)[1]
print(f"دقة LSTM: {acc_lstm:.4f}")  

array([[False, False, False,  True],
       [False, False, False,  True],
       [False, False, False,  True],
       ...,
       [False, False, False,  True],
       [False, False, False,  True],
       [False, False, False,  True]], shape=(73996, 4))

array([[   0,    0,    0, ..., 1628,   12,   26],
       [   0,    0,    0, ...,  399,   12,   26],
       [   0,    0,    0, ...,  399,   12,   26],
       ...,
       [   0,    0,    0, ...,  112,  278, 1979],
       [   0,    0,    0, ..., 1865,  157, 1979],
       [   0,    0,    0, ...,    2,  278, 1979]],
      shape=(73996, 50), dtype=int32)

Epoch 1/5




925/925 - 34s - 37ms/step - accuracy: 0.5664 - loss: 1.0334 - val_accuracy: 0.6526 - val_loss: 0.8716
Epoch 2/5
925/925 - 31s - 34ms/step - accuracy: 0.6980 - loss: 0.7684 - val_accuracy: 0.7082 - val_loss: 0.7513
Epoch 3/5
925/925 - 33s - 36ms/step - accuracy: 0.7526 - loss: 0.6431 - val_accuracy: 0.7388 - val_loss: 0.6824
Epoch 4/5
925/925 - 35s - 38ms/step - accuracy: 0.7861 - loss: 0.5576 - val_accuracy: 0.7611 - val_loss: 0.6427
Epoch 5/5
925/925 - 69s - 74ms/step - accuracy: 0.8091 - loss: 0.4996 - val_accuracy: 0.7724 - val_loss: 0.6190
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.7724 - loss: 0.6190
دقة LSTM: 0.7724


In [None]:
# ---------------------------------------------------------------------------------------- CNN -----------------------------------------------
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

model_cnn = Sequential()
model_cnn.add(Embedding(5000, 128, input_length=50))
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dropout(0.5))
model_cnn.add(Dense(4, activation='softmax'))

model_cnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


model_cnn.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=2)

acc_cnn = model_cnn.evaluate(X_test, y_test)[1]
print(f" CNN: {acc_cnn:.4f}")  

Epoch 1/5
925/925 - 13s - 14ms/step - accuracy: 0.5840 - loss: 1.0121 - val_accuracy: 0.7016 - val_loss: 0.7781
Epoch 2/5
925/925 - 12s - 13ms/step - accuracy: 0.7929 - loss: 0.5742 - val_accuracy: 0.7920 - val_loss: 0.5542
Epoch 3/5
925/925 - 10s - 11ms/step - accuracy: 0.8890 - loss: 0.3176 - val_accuracy: 0.8205 - val_loss: 0.5043
Epoch 4/5
925/925 - 10s - 11ms/step - accuracy: 0.9253 - loss: 0.2048 - val_accuracy: 0.8304 - val_loss: 0.5181
Epoch 5/5
925/925 - 9s - 10ms/step - accuracy: 0.9404 - loss: 0.1600 - val_accuracy: 0.8376 - val_loss: 0.5322
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8376 - loss: 0.5322
 CNN: 0.8376


In [None]:


print("="*50)
print(f"LSTM        → {acc_lstm:.4f}")
print(f"CNN         → {acc_cnn:.4f}")
print(f"DistilBERT  → {acc:.4f}")
print("="*50)


LSTM        → 0.7724
CNN         → 0.8376
DistilBERT  → 0.1660
