In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier



In [2]:
import pandas as pd
import re

# تحميل الملف
with open("../data/sentiment.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

data = []
for line in lines:
    match = re.match(r"\[(.*?)\]\s+(.*)", line.strip())
    if match:
        label = list(map(float, match.group(1).split()))
        text = match.group(2)
        data.append((label, text))

# تحويل لقائمة بها النص والتصنيف كـ index (مثلاً التصنيف رقم كام)
df = pd.DataFrame(data, columns=["label", "text"])
df["target"] = df["label"].apply(lambda x: x.index(1.0))

print(df.head())


                                 label  \
0  [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]   
1  [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]   
2  [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]   
3  [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]   
4  [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]   

                                                text  target  
0  During the period of falling in love, each tim...       0  
1         When I was involved in a traffic accident.       1  
2  When I was driving home after  several days of...       2  
3   When I lost the person who meant the most to me.       3  
4  The time I knocked a deer down - the sight of ...       4  


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7480 entries, 0 to 7479
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   7480 non-null   object
 1   text    7480 non-null   object
 2   target  7480 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 175.4+ KB


Clean Text

In [4]:
import nltk

nltk.download('punkt')  # Ensure it's downloaded

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
df.shape

(7480, 3)

In [6]:
df.isnull().sum()

label     0
text      0
target    0
dtype: int64

In [7]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Ensure the column is treated as a string to avoid unexpected errors
df["text"] = df["text"].astype(str)

In [8]:
import re
# Defining a function to clean up the text
def Clean(Text):
    sms = re.sub('[^a-zA-Z]', ' ', Text) #Replacing all non-alphabetic characters with a space
    sms = re.sub(r'\n', ' ', sms)  # إزالة الأسطر الجديد
    sms = re.sub(r'\d+', '', sms)  # إزالة الأرقام
    sms = re.sub(r'[^a-zA-Z\s]', '', sms)
    sms = re.sub(r'\b[a-zA-Z]\b', '', sms)  # إزالة الحروف المنفردة مثل "a" أو "i"
    sms = sms.lower() #converting to lowecase
    sms = sms.split()
    sms = ' '.join(sms)
    return sms

df["text"] = df["text"].apply(Clean)
df.head(100)

Unnamed: 0,label,text,target
0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",during the period of falling in love each time...,0
1,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",when was involved in traffic accident,1
2,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",when was driving home after several days of ha...,2
3,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",when lost the person who meant the most to me,3
4,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",the time knocked deer down the sight of the an...,4
...,...,...,...
95,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",bad smelling cucumber,4
96,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]",nearly caught masturbating,5
97,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]",do not help out enough at home,6
98,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",felt very happy when won the football pools,0


In [9]:
# Add new columns efficiently
df["No_of_Characters"] = df["text"].apply(len)
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
target,7480.0,2.977273,1.999269,0.0,1.0,3.0,5.0,6.0
No_of_Characters,7480.0,106.538369,69.979243,6.0,56.0,92.0,142.0,811.0


In [10]:
#Teckonization
from nltk.tokenize import word_tokenize 
df["text"]=df.apply(lambda row: nltk.word_tokenize(row["text"]), axis=1)

df.head(5)

Unnamed: 0,label,text,target,No_of_Characters
0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[during, the, period, of, falling, in, love, e...",0,107
1,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[when, was, involved, in, traffic, accident]",1,37
2,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[when, was, driving, home, after, several, day...",2,165
3,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[when, lost, the, person, who, meant, the, mos...",3,45
4,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[the, time, knocked, deer, down, the, sight, o...",4,205


In [11]:
#Remove Stopwords
from nltk.corpus import stopwords   
nltk.download('stopwords')
# Removing the stopwords function
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    filtered_text = [word for word in text if word not in stop_words]
    return filtered_text

df["text"] = df["text"].apply(remove_stopwords)

df.head(5)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,label,text,target,No_of_Characters
0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[period, falling, love, time, met, especially,...",0,107
1,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[involved, traffic, accident]",1,37
2,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[driving, home, several, days, hard, work, mot...",2,165
3,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[lost, person, meant]",3,45
4,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[time, knocked, deer, sight, animal, injuries,...",4,205


In [12]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()
# lemmatize string
def lemmatize_word(text):
    #word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in text]
    return lemmas

df["text"] = df["text"].apply(lemmatize_word)
df.head(5)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Eman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,label,text,target,No_of_Characters
0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[period, fall, love, time, meet, especially, m...",0,107
1,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[involve, traffic, accident]",1,37
2,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[drive, home, several, days, hard, work, motor...",2,165
3,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[lose, person, mean]",3,45
4,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[time, knock, deer, sight, animal, injuries, h...",4,205


In [13]:
#Creating a corpus of text feature to encode further into vectorized form
corpus= []
for i in df["text"]:
    msg = ' '.join([row for row in i])
    corpus.append(msg)
corpus

['period fall love time meet especially meet long time',
 'involve traffic accident',
 'drive home several days hard work motorist ahead drive km hour refuse despite low speeed let overtake',
 'lose person mean',
 'time knock deer sight animal injuries helplessness realization animal badly hurt put animal scream moment death',
 'speak truth',
 'cause problems somebody could keep appoint time lead various consequences',
 'get letter offer summer job apply',
 'go home alone one night paris man come behind ask afraid alone late night',
 'talk party first time long friend come interrupt us leave',
 'friends ask go new year party',
 'saw drink kid years old town walpurgis night',
 'could remember say presentation task account meet',
 'uncle neighbour come home police escort',
 'days feel close partner friends feel peace also experience close contact people regard greatly',
 'every time imagine someone love could contact serious illness even death',
 'obviously unjustly treat possibility elu

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
#Changing text data in to numbers.
tfidf = TfidfVectorizer()

X = tfidf.fit_transform(corpus).toarray()
#Let's have a look at our feature
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
from sklearn.preprocessing import LabelEncoder
#Label encode the Target and use it as y
label_encoder = LabelEncoder()
df["target"] = label_encoder.fit_transform(df["target"])

In [16]:
# تقسيم البيانات
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['target'], test_size=0.2, random_state=42)

<a id="5"></a>
# <p style="background-color:#E598D8;font-family:newtimeroman;font-size:150%;color:#E1F16B;text-align:center;border-radius:20px 60px;">LogisticRegression</p>



In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,df['target'],test_size=0.4,random_state=42)
lr = LogisticRegression()

mnb = MultinomialNB()

In [18]:
lr.fit(X_train,y_train)

In [19]:
from sklearn.metrics import accuracy_score
accuracy_score(lr.predict(X_test),y_test),accuracy_score(lr.predict(X_train),y_train)

(0.55548128342246, 0.8384581105169341)

<a id="5"></a>
# <p style="background-color:#E598D8;font-family:newtimeroman;font-size:150%;color:#E1F16B;text-align:center;border-radius:20px 60px;">NaiveBase</p>


In [20]:
mnb.fit(X_train,y_train)

In [21]:
accuracy_score(mnb.predict(X_test),y_test),accuracy_score(mnb.predict(X_train),y_train)

(0.5461229946524064, 0.8108288770053476)

<a id="5"></a>
# <p style="background-color:#E598D8;font-family:newtimeroman;font-size:150%;color:#E1F16B;text-align:center;border-radius:20px 60px;">DecisionTreeClassifier</p>


In [22]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)


In [23]:
accuracy_score(dt.predict(X_test),y_test),accuracy_score(dt.predict(X_train),y_train)

(0.47961229946524064, 0.9968805704099821)

<a id="5"></a>
# <p style="background-color:#E598D8;font-family:newtimeroman;font-size:150%;color:#E1F16B;text-align:center;border-radius:20px 60px;">RandomForestClassifier</p>

In [24]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

In [25]:
accuracy_score(rf.predict(X_test),y_test),accuracy_score(rf.predict(X_train),y_train)


(0.5424465240641712, 0.9968805704099821)

In [26]:
print(classification_report(rf.predict(X_train),y_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       629
           1       1.00      1.00      1.00       655
           2       1.00      1.00      1.00       649
           3       1.00      0.99      0.99       627
           4       1.00      1.00      1.00       654
           5       0.99      0.99      0.99       639
           6       1.00      1.00      1.00       635

    accuracy                           1.00      4488
   macro avg       1.00      1.00      1.00      4488
weighted avg       1.00      1.00      1.00      4488



In [27]:
print(classification_report(rf.predict(X_test),y_test))

              precision    recall  f1-score   support

           0       0.69      0.59      0.63       530
           1       0.66      0.59      0.62       477
           2       0.41      0.44      0.42       400
           3       0.54      0.64      0.59       387
           4       0.61      0.56      0.58       442
           5       0.43      0.48      0.45       367
           6       0.44      0.48      0.46       389

    accuracy                           0.54      2992
   macro avg       0.54      0.54      0.54      2992
weighted avg       0.55      0.54      0.55      2992



In [28]:
from xgboost import XGBClassifier

XGB = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
XGB.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [29]:
accuracy_score(XGB.predict(X_test),y_test),accuracy_score(XGB.predict(X_train),y_train)

(0.5337566844919787, 0.8705436720142602)

Save Random forest Model (rf) 

In [30]:
import joblib
joblib.dump(rf, '../models/sentiment_model.pkl')

['../models/sentiment_model.pkl']

Save sentiment tfidf vectorizer

In [31]:
# Save the TF-IDF vectorizer
import joblib
joblib.dump(rf, '../models/senttfidf_vectorizer.pkl')


['../models/senttfidf_vectorizer.pkl']

In [33]:
import pickle

# Save the model
with open("../models/sentiment_model.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)

# Save the TF-IDF vectorizer
with open("../models/sentiment_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

print("Model and vectorizer saved successfully!")

Model and vectorizer saved successfully!


In [34]:
# Load the model
with open("../models/sentiment_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Load the TF-IDF vectorizer
with open("../models/sentiment_vectorizer.pkl", "rb") as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)

In [37]:
test_messages = [
    "I hate this product! It's amazing.","When I got a letter offering me the Summer job that I had applied for"
]
# Transform test messages using the trained vectorizer
test_messages_tfidf = loaded_vectorizer.transform(test_messages)

# Make predictions
predictions = loaded_model.predict(test_messages_tfidf)

# Print results
for msg, pred in zip(test_messages, predictions):
    print(pred)

6
0
